## Data Import

In [2]:
import pandas as pd

In [3]:
overall_data = pd.read_excel('tfcu_member.xlsx', index_col=0, sheet_name='KPI Data')
df_ = overall_data.drop(overall_data.index[0:3])
df = df_.drop(columns=['Unnamed: 10'])
df_time = df[['Week','Member Retention',
       'Employee Retention', 'Member Satisfaction NPS', 'Fraud rate',
       'Employee satisfaction', 'Number of Employee Relations Issues',
       '# of Member complaints', 'Member acquisition cost (CAC)',
       'Number of Compliance Issues', 'Compliance Investigation Cycle Time',
       'Decline rate', 'Financial impact due to attacks',
       'Attempted vs Fraudulent Transactions', 'Final Approval rate',
       'Fraudulent Vs Declined Transactions', 'System Uptime',
       'Severity of attacks', 'Mean Time Between Failures (MTBF)',
       'Cost per incident', 'Number of cybersecurity incidents reported'
             ]].groupby(['Week']).mean()

In [4]:
import numpy as np


In [5]:
np.random.normal(5,1, 10)

array([4.05271884, 3.69197234, 5.48194868, 3.46159058, 5.73617918,
       4.11234879, 4.87707416, 6.13392048, 4.75553515, 3.63708747])

In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split

# Load your dataset
# Assuming your dataset is loaded into a DataFrame called df
df = df_time
# Create an empty DataFrame to store results
results_df = pd.DataFrame(columns=['Dependent_Variable', 'Selected_Features', 'Coefficients', 'P_Values'])

# Iterate through each column as dependent variable
for column in df.columns:
    # Split data into train and test sets
    X = df.drop(column, axis=1)  # Independent variables
    y = df[column]                # Dependent variable
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Define the linear regression model
    linreg = LinearRegression()
    
    # Perform stepwise selection
    sfs = SFS(linreg, 
              k_features='best', 
              forward=True, 
              floating=False, 
              scoring='neg_mean_squared_error', 
              cv=5)
    
    sfs.fit(X_train, y_train)
    
    # Get selected features
    selected_features = list(sfs.k_feature_names_)
    selected_indices = list(sfs.k_feature_idx_)
    
    # Fit the linear regression model on selected features
    linreg.fit(X_train.iloc[:, list(selected_indices)], y_train)
    
    # Get coefficients and p-values
    coefficients = linreg.coef_
    intercept = linreg.intercept_
    # p_values = linreg.p
    
    # Create a dictionary to store the results
    result_dict = {
        'Dependent_Variable': column,
        'Selected_Features': ', '.join(selected_features),
        'Coefficients': ', '.join(map(str, coefficients)),
        'Intercept': intercept
        # 'P_Values': ', '.join(map(str, p_values))
    }
    # Append the results to the DataFrame
    results_df = results_df._append(result_dict, ignore_index=True)


In [39]:
features_selected = results_df[results_df[
        "Dependent_Variable"]=="Employee satisfaction"]["Selected_Features"]

Unnamed: 0,Dependent_Variable,Selected_Features,Coefficients,P_Values
4,Employee satisfaction,"Member Satisfaction NPS, Compliance Investigat...","-0.013415566502093171, 0.06287811803585024, -0...",


In [52]:
features_selected = list(map(lambda x: x.strip(),results_df[results_df[
        "Dependent_Variable"]=="Employee satisfaction"][
    "Selected_Features"].values[0].split(',')))
features_selected

['Member Satisfaction NPS',
 'Compliance Investigation Cycle Time',
 'Mean Time Between Failures (MTBF)']

In [53]:
features_coefficient = list(map(lambda x: float(x.strip()),results_df[results_df[
        "Dependent_Variable"]=="Employee satisfaction"][
    "Coefficients"].values[0].split(',')))
features_coefficient

[-0.013415566502093171, 0.06287811803585024, -0.019770677737250875]

In [None]:
intercept_value = 

In [54]:
results_df[results_df[
        "Dependent_Variable"]=="Employee satisfaction"][
    "Intercept"].values[0]

6.282738127386836

In [37]:
results_df.loc[0]['Coefficients']

'0.5852784042144258, -0.13541905034879975, 0.4978049138867525, 1.2259340284291205, -4.814432976655741'

In [21]:
results_df.loc[0]['Coefficients']

'0.5852784042144258, -0.13541905034879975, 0.4978049138867525, 1.2259340284291205, -4.814432976655741'

In [22]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Assuming you have a DataFrame df with your data
df = df_time.copy()
# Create an empty dictionary to store the regression models
regression_models = {}

# Create an empty list to store the details
details_list = []

# Iterate over each pair of variables
for dependent_variable in df.columns:
    for independent_variable in df.columns:
        if dependent_variable != independent_variable:
            # Select the independent and dependent variables
            X = df[[independent_variable]]
            y = df[dependent_variable]
            
            # Split the data into training and testing sets
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            
            # Create and fit the linear regression model
            regression_model = LinearRegression()
            regression_model.fit(X_train, y_train)
            
            # Store the trained model in the dictionary
            regression_models[(dependent_variable, independent_variable)] = regression_model
            
            # Extract coefficient and score
            coefficient = regression_model.coef_[0]
            score = regression_model.score(X_test, y_test)
            
            # Create a dictionary to store the details
            details_dict = {
                'Independent_Variable': independent_variable,
                'Dependent_Variable': dependent_variable,
                'Coefficient': coefficient,
                'Score': score
            }
            
            # Append the dictionary to the list
            details_list.append(details_dict)

# Convert the list of dictionaries into a DataFrame
details_df = pd.DataFrame(details_list)



In [23]:
details_df[details_df['Dependent_Variable']=='Member Retention']

Unnamed: 0,Independent_Variable,Dependent_Variable,Coefficient,Score
0,Employee Retention,Member Retention,-0.159478,-0.076722
1,Member Satisfaction NPS,Member Retention,-0.194718,0.128564
2,Fraud rate,Member Retention,-18.962355,-0.251917
3,Employee satisfaction,Member Retention,-0.718722,-0.08023
4,Number of Employee Relations Issues,Member Retention,-3.04919,-0.443132
5,# of Member complaints,Member Retention,0.222477,-0.110593
6,Member acquisition cost (CAC),Member Retention,-0.149718,-0.400956
7,Number of Compliance Issues,Member Retention,1.031561,-0.024723
8,Compliance Investigation Cycle Time,Member Retention,0.170672,-0.095192
9,Decline rate,Member Retention,0.875515,0.005983


## Cointegration

In [27]:
import pandas as pd
import statsmodels.tsa.stattools as st

# Assuming you have a DataFrame df with your data
df = df_time.copy()
# Create an empty dictionary to store the regression models
regression_models = {}

# Create an empty list to store the details
details_list = []

# Iterate over each pair of variables
for dependent_variable in df.columns:
    for independent_variable in df.columns:
        if dependent_variable != independent_variable:
            # Select the independent and dependent variables
            X = df[[independent_variable]]
            y = df[dependent_variable]
            coint_score = round(st.coint(X, y)[1],5) 
            # # Split the data into training and testing sets
            # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            
            # # Create and fit the linear regression model
            # regression_model = LinearRegression()
            # regression_model.fit(X_train, y_train)
            
            # # Store the trained model in the dictionary
            # regression_models[(dependent_variable, independent_variable)] = regression_model
            
            # # Extract coefficient and score
            # coefficient = regression_model.coef_[0]
            # score = regression_model.score(X_test, y_test)
            
            # Create a dictionary to store the details
            details_dict = {
                'Independent_Variable': independent_variable,
                'Dependent_Variable': dependent_variable,
                'cointegration': coint_score
            }
            if coint_score< 0.05:
            # Append the dictionary to the list
                details_list.append(details_dict)

# Convert the list of dictionaries into a DataFrame
details_df = pd.DataFrame(details_list)



  coint_score = round(st.coint(X, y)[1],5)
  coint_score = round(st.coint(X, y)[1],5)


In [32]:
details_df[details_df["Independent_Variable"] == "Member Retention"]

Unnamed: 0,Independent_Variable,Dependent_Variable,cointegration
18,Member Retention,Employee Retention,2e-05
53,Member Retention,Fraud rate,1e-05
71,Member Retention,Employee satisfaction,0.0
90,Member Retention,Number of Employee Relations Issues,0.0
108,Member Retention,# of Member complaints,1e-05
126,Member Retention,Member acquisition cost (CAC),1e-05
144,Member Retention,Number of Compliance Issues,1e-05
162,Member Retention,Compliance Investigation Cycle Time,1e-05
180,Member Retention,Decline rate,0.0
198,Member Retention,Financial impact due to attacks,0.0


In [34]:
from statsmodels.tsa.vector_ar.vecm import coint_johansen as cj

coint_test_result = cj(df_time[df_time.columns[1:]], det_order=0, k_ar_diff=1)

# Extract the eigenvalues and critical values
tracevalues = coint_test_result.lr1
critical_values = coint_test_result.cvt


# for i, (stock1, stock2) in enumerate(stock_pairs):
#     if (tracevalues[i] > critical_values[:, 1]).all():
#         print(f"Pair {i + 1} ({stock1} and {stock2}) is cointegrated.")
#     else:
#         print(f"Pair {i + 1} ({stock1} and {stock2}) is not cointegrated.")

  coint_test_result = cj(df_time[df_time.columns[1:]], det_order=0, k_ar_diff=1)


UFuncTypeError: Cannot cast ufunc 'inv' input from dtype('O') to dtype('float64') with casting rule 'same_kind'

In [31]:
details_df

Unnamed: 0,Independent_Variable,Dependent_Variable,cointegration
0,Employee Retention,Member Retention,0.00000
1,Member Satisfaction NPS,Member Retention,0.00000
2,Fraud rate,Member Retention,0.00000
3,Number of Employee Relations Issues,Member Retention,0.00000
4,# of Member complaints,Member Retention,0.00000
...,...,...,...
352,Fraudulent Vs Declined Transactions,Number of cybersecurity incidents reported,0.00000
353,System Uptime,Number of cybersecurity incidents reported,0.00000
354,Severity of attacks,Number of cybersecurity incidents reported,0.00151
355,Mean Time Between Failures (MTBF),Number of cybersecurity incidents reported,0.00672


In [35]:
df_time.columns[1:]

Index(['Employee Retention', 'Member Satisfaction NPS', 'Fraud rate',
       'Employee satisfaction', 'Number of Employee Relations Issues',
       '# of Member complaints', 'Member acquisition cost (CAC)',
       'Number of Compliance Issues', 'Compliance Investigation Cycle Time',
       'Decline rate', 'Financial impact due to attacks',
       'Attempted vs Fraudulent Transactions', 'Final Approval rate',
       'Fraudulent Vs Declined Transactions', 'System Uptime',
       'Severity of attacks', 'Mean Time Between Failures (MTBF)',
       'Cost per incident', 'Number of cybersecurity incidents reported'],
      dtype='object')

In [36]:
df_time.head()

Unnamed: 0_level_0,Member Retention,Employee Retention,Member Satisfaction NPS,Fraud rate,Employee satisfaction,Number of Employee Relations Issues,# of Member complaints,Member acquisition cost (CAC),Number of Compliance Issues,Compliance Investigation Cycle Time,Decline rate,Financial impact due to attacks,Attempted vs Fraudulent Transactions,Final Approval rate,Fraudulent Vs Declined Transactions,System Uptime,Severity of attacks,Mean Time Between Failures (MTBF),Cost per incident,Number of cybersecurity incidents reported
Week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
WK 01,74.3875,75.02,62.36,0.475584,5.3525,2.915,10.5675,86.5,2.49,18.31,7.745,14709.1,8.585,79.6875,79.6875,94.3775,5.4725,55.2675,7535.725,2.9575
WK 02,74.405,74.3275,62.0125,0.473602,5.64,3.06,9.815,86.2725,2.4675,18.24,8.1175,15483.0525,8.425,80.255,80.255,94.8075,5.38,54.945,7414.5425,3.08
WK 03,73.115,75.195,62.365,0.505365,5.615,3.04,10.3625,88.7375,2.49,19.195,8.075,14896.315,8.4125,79.9725,79.9725,94.4925,5.545,56.075,7331.6025,3.1425
WK 04,75.265,74.0125,62.4525,0.462778,5.67,3.025,9.625,88.8675,2.325,19.655,8.02,14942.675,8.545,80.06,80.06,94.55,5.33,57.56,7444.6875,3.055
WK 05,74.3925,73.94,63.4925,0.507319,5.695,2.905,9.6375,86.5575,2.4025,18.51,8.195,14797.1975,8.4325,79.93,79.93,94.625,5.6775,56.495,7455.96,2.95
