In [2]:
import pandas as pd

In [3]:
# Read the 'KPI Data' sheet, using the first spreadsheet column as the index.
overall_data = pd.read_excel('tfcu_member.xlsx', sheet_name='KPI Data', index_col=0)

# Drop the rows labelled by the first three index entries, then the
# 'Unnamed: 10' column.
df_ = overall_data.drop(index=overall_data.index[:3])
df = df_.drop('Unnamed: 10', axis=1)

# KPI columns that get averaged; 'Week' is only used as the grouping key.
kpi_columns = [
    'Member Retention',
    'Employee Retention',
    'Member Satisfaction NPS',
    'Fraud rate',
    'Employee satisfaction',
    'Number of Employee Relations Issues',
    '# of Member complaints',
    'Member acquisition cost (CAC)',
    'Number of Compliance Issues',
    'Compliance Investigation Cycle Time',
    'Decline rate',
    'Financial impact due to attacks',
    'Attempted vs Fraudulent Transactions',
    'Final Approval rate',
    'Fraudulent Vs Declined Transactions',
    'System Uptime',
    'Severity of attacks',
    'Mean Time Between Failures (MTBF)',
    'Cost per incident',
    'Number of cybersecurity incidents reported',
]

# One row per distinct 'Week' value, holding the mean of every KPI column.
df_time = df[['Week'] + kpi_columns].groupby(['Week']).mean()

In [11]:
from statsmodels.regression.linear_model import OLS
import statsmodels.api as sm
import numpy as np

In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split

# Forward stepwise feature selection over the weekly-averaged KPI frame:
# every column takes a turn as the dependent variable and is regressed on
# all of the remaining columns.
df = df_time

# One record per dependent variable. Records are collected in a list and the
# DataFrame is built once after the loop; appending to a DataFrame row by row
# copies the whole frame on every iteration and relied on the private
# `DataFrame._append` method.
result_records = []

for column in df.columns:
    X = df.drop(column, axis=1)  # independent variables
    y = df[column]               # dependent variable

    # Linear model without an intercept term.
    linreg = LinearRegression(fit_intercept=False)

    # Forward (non-floating) sequential selection scored by 5-fold
    # cross-validated negative MSE; k_features='best' lets the selector
    # choose the subset size.
    sfs = SFS(
        linreg,
        k_features='best',
        forward=True,
        floating=False,
        scoring='neg_mean_squared_error',
        cv=5,
    )
    sfs.fit(X, y)

    selected_features = list(sfs.k_feature_names_)
    selected_indices = list(sfs.k_feature_idx_)

    # Refit on the selected columns only, so the stored coefficients line up
    # one-to-one with `selected_features`.
    linreg.fit(X.iloc[:, selected_indices], y)
    coefficients = linreg.coef_

    # Features and coefficients are stored as ', '-joined strings; the cell
    # that writes relationships to the graph splits them back apart.
    result_records.append({
        'Dependent_Variable': column,
        'Selected_Features': ', '.join(selected_features),
        'Coefficients': ', '.join(map(str, coefficients)),
    })

# Passing `columns` keeps the three-column layout even when there are no records.
results_df = pd.DataFrame(
    result_records,
    columns=['Dependent_Variable', 'Selected_Features', 'Coefficients'],
)


In [16]:
# Preview the first five rows of the stepwise-selection results.
results_df.head(n=5)

Unnamed: 0,Dependent_Variable,Selected_Features,Coefficients
0,Member Retention,"# of Member complaints, Final Approval rate, S...","0.4155032676522855, 0.5401360711722573, 0.2607..."
1,Employee Retention,"Member Satisfaction NPS, Number of Employee Re...","-0.249536174237157, -0.6051620501475861, 0.100..."
2,Member Satisfaction NPS,"Member Retention, Employee satisfaction, Numbe...","-0.18500169552286458, -1.0503246834831774, -2...."
3,Fraud rate,"Member Retention, Employee Retention, Attempte...","-0.00416097270501884, 0.00279668859930402, -0...."
4,Employee satisfaction,"Member Satisfaction NPS, Fraud rate, # of Memb...","-0.03166957417232354, 1.0687708197608694, -0.0..."


In [17]:
# Show the dependent-variable name recorded for every result row.
results_df.loc[:, 'Dependent_Variable']

0                               Member Retention
1                             Employee Retention
2                        Member Satisfaction NPS
3                                     Fraud rate
4                          Employee satisfaction
5            Number of Employee Relations Issues
6                         # of Member complaints
7                  Member acquisition cost (CAC)
8                    Number of Compliance Issues
9            Compliance Investigation Cycle Time
10                                  Decline rate
11               Financial impact due to attacks
12          Attempted vs Fraudulent Transactions
13                           Final Approval rate
14           Fraudulent Vs Declined Transactions
15                                 System Uptime
16                           Severity of attacks
17             Mean Time Between Failures (MTBF)
18                             Cost per incident
19    Number of cybersecurity incidents reported
Name: Dependent_Vari

In [19]:
import os
from getpass import getpass

from py2neo import Graph

# Connection settings come from the environment so that no credential is
# stored in the notebook. URI and user fall back to the values this notebook
# previously hard-coded; the password is prompted for when NEO4J_PASSWORD is
# not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Connectivity smoke test. The cell previously sent an empty query string,
# which cannot produce the recorded `n, n_sq` table (1,1 / 2,4 / 3,9); this
# query yields exactly that table.
# NOTE(review): query text reconstructed from the recorded output - confirm.
graph.run("UNWIND range(1, 3) AS n RETURN n, n * n AS n_sq")

n,n_sq
1,1
2,4
3,9


In [21]:
# Ensure one :KPI node exists per dependent variable.
# MERGE (instead of CREATE) makes this cell safe to re-run: CREATE would add a
# duplicate node for every name on each execution, and duplicate nodes make
# the later MATCH-based relationship cell create extra relationships.
# UNWIND sends all names in a single query rather than one query per name.
graph.run(
    "UNWIND $names AS name MERGE (:KPI {name: name})",
    names=results_df['Dependent_Variable'].tolist(),
)

In [34]:
# NOTE(review): `f` is not assigned in any cell visible in this notebook, so
# this cell presumably depends on leftover kernel state and would raise
# NameError on a fresh "Restart & Run All" - confirm, then define `f`
# explicitly or delete the cell.
f

Member Retention -0.05174013574950742
Fraud rate -2.0736419860431363
# of Member complaints 0.06310588883621072
Member acquisition cost (CAC) -0.009105123765616195
Compliance Investigation Cycle Time 0.04825839269777032
Final Approval rate 0.08918270452949859


In [36]:
# Write one LINEAR_RELATION edge per (selected feature -> dependent variable)
# pair, carrying the fitted coefficient as the `coeff` property.
for dependent, features_str, coeffs_str in zip(
    results_df['Dependent_Variable'],
    results_df['Selected_Features'],
    results_df['Coefficients'],
):
    # Both columns hold comma-joined strings; split them back into lists,
    # ignoring empty tokens so an empty cell yields an empty list.
    selected_features = [tok.strip() for tok in features_str.split(",") if tok.strip()]
    coefficients = [float(tok.strip()) for tok in coeffs_str.split(",") if tok.strip()]

    # zip() would silently truncate on a length mismatch (e.g. a feature name
    # that itself contains a comma) and attach coefficients to the wrong
    # features, so fail loudly instead.
    if len(selected_features) != len(coefficients):
        raise ValueError(
            f"{dependent!r}: {len(selected_features)} features but "
            f"{len(coefficients)} coefficients"
        )

    for feature, coeff in zip(selected_features, coefficients):
        # MERGE + SET keeps the cell idempotent: re-running updates the
        # existing relationship instead of creating a duplicate.
        graph.run(
            """MATCH (n:KPI {name: $f})
               MATCH (m:KPI {name: $s})
               MERGE (n)-[rel:LINEAR_RELATION]->(m)
               SET rel.coeff = $c""",
            f=feature,
            s=dependent,
            c=coeff,
        )
