## Data Import

In [1]:
import pandas as pd

In [2]:
# Read the 'KPI Data' sheet, using the first spreadsheet column as the index.
overall_data = pd.read_excel('tfcu_member.xlsx', sheet_name='KPI Data', index_col=0)

# Discard the rows labelled by the first three index entries (presumably
# header/metadata rows -- confirm against the workbook), then the
# 'Unnamed: 10' column.
df_ = overall_data.drop(index=overall_data.index[:3])
df = df_.drop('Unnamed: 10', axis=1)

# KPI columns to be averaged; 'Week' is prepended below as the grouping key.
kpi_columns = [
    'Member Retention',
    'Employee Retention',
    'Member Satisfaction NPS',
    'Fraud rate',
    'Employee satisfaction',
    'Number of Employee Relations Issues',
    '# of Member complaints',
    'Member acquisition cost (CAC)',
    'Number of Compliance Issues',
    'Compliance Investigation Cycle Time',
    'Decline rate',
    'Financial impact due to attacks',
    'Attempted vs Fraudulent Transactions',
    'Final Approval rate',
    'Fraudulent Vs Declined Transactions',
    'System Uptime',
    'Severity of attacks',
    'Mean Time Between Failures (MTBF)',
    'Cost per incident',
    'Number of cybersecurity incidents reported',
]

# One row per week, each KPI replaced by its mean over that week's rows.
df_time = df[['Week'] + kpi_columns].groupby(['Week']).mean()

In [3]:
import numpy as np


In [4]:
# Draw 10 samples from a normal distribution with mean 5 and standard deviation 1.
# No seed is set, so the values differ from run to run.
np.random.normal(loc=5, scale=1, size=10)

array([4.53930828, 4.3868764 , 3.68945615, 5.2522248 , 6.07343699,
       2.98812276, 4.68257253, 4.67552611, 4.77222839, 4.51434289])

In [5]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import train_test_split

# Forward stepwise feature selection, run once per KPI.
#
# Each column of df_time is treated in turn as the dependent variable and is
# regressed on a subset of the remaining columns chosen by mlxtend's
# SequentialFeatureSelector. One summary row per dependent variable ends up
# in results_df.
df = df_time.copy()

RESULT_COLUMNS = ['Dependent_Variable', 'Selected_Features', 'Coefficients', 'P_Values']

# Rows are accumulated in a plain list and converted to a DataFrame once,
# after the loop. This avoids the private DataFrame._append API and the
# repeated re-allocation caused by growing a frame one row at a time.
result_rows = []

# Iterate through each column as dependent variable
for column in df.columns:
    X = df.drop(column, axis=1)  # Independent variables
    y = df[column]               # Dependent variable

    # Hold out 20% of the rows; only the training part is used in this cell.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the linear regression model
    linreg = LinearRegression()

    # Forward (non-floating) selection scored by 5-fold cross-validated
    # negative MSE; k_features='best' lets the selector choose the subset size.
    sfs = SFS(linreg,
              k_features='best',
              forward=True,
              floating=False,
              scoring='neg_mean_squared_error',
              cv=5)

    sfs.fit(X_train, y_train)

    # Names and positional indices of the chosen predictors
    selected_features = list(sfs.k_feature_names_)
    selected_indices = list(sfs.k_feature_idx_)

    # Refit on the selected predictors only to obtain their coefficients
    linreg.fit(X_train.iloc[:, selected_indices], y_train)
    coefficients = linreg.coef_

    # P-values are not computed here (scikit-learn's LinearRegression exposes
    # none), so the 'P_Values' column of results_df stays empty.
    result_dict = {
        'Dependent_Variable': column,
        'Selected_Features': ', '.join(selected_features),
        'Coefficients': ', '.join(map(str, coefficients)),
    }

    result_rows.append(result_dict)

# Build the summary frame in one step; passing `columns` keeps the original
# column order and leaves 'P_Values' present but unfilled.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)




In [6]:
# Selected features for the row labelled 0, read with a single .loc lookup.
results_df.loc[0, 'Selected_Features']

'# of Member complaints, Member acquisition cost (CAC), Compliance Investigation Cycle Time, Attempted vs Fraudulent Transactions, Number of cybersecurity incidents reported'

In [21]:
# Coefficient string for the row labelled 0, read with a single .loc lookup.
results_df.loc[0, 'Coefficients']

'0.5852784042144258, -0.13541905034879975, 0.4978049138867525, 1.2259340284291205, -4.814432976655741'

In [22]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Pairwise simple regressions: for every ordered pair of distinct columns in
# df_time, fit a one-predictor linear model and record its slope together
# with its score on the held-out rows.
df = df_time.copy()

regression_models = {}  # (dependent, independent) -> fitted LinearRegression
details_list = []       # one summary dict per fitted pair

for dependent_variable in df.columns:
    for independent_variable in df.columns:
        # A column is never regressed on itself.
        if dependent_variable == independent_variable:
            continue

        # The list selector keeps X two-dimensional, as the estimator expects.
        X = df[[independent_variable]]
        y = df[dependent_variable]

        # Same 80/20 split (fixed seed) for every pair.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # fit() returns the estimator itself, so it can be stored directly.
        regression_model = LinearRegression().fit(X_train, y_train)
        regression_models[(dependent_variable, independent_variable)] = regression_model

        # Single predictor, hence a single coefficient.
        coefficient = regression_model.coef_[0]
        score = regression_model.score(X_test, y_test)

        details_dict = dict(
            Independent_Variable=independent_variable,
            Dependent_Variable=dependent_variable,
            Coefficient=coefficient,
            Score=score,
        )
        details_list.append(details_dict)

# One row per (dependent, independent) pair.
details_df = pd.DataFrame(details_list)



In [23]:
# Rows whose dependent variable is 'Member Retention'.
details_df.loc[details_df['Dependent_Variable'].eq('Member Retention')]

Unnamed: 0,Independent_Variable,Dependent_Variable,Coefficient,Score
0,Employee Retention,Member Retention,-0.159478,-0.076722
1,Member Satisfaction NPS,Member Retention,-0.194718,0.128564
2,Fraud rate,Member Retention,-18.962355,-0.251917
3,Employee satisfaction,Member Retention,-0.718722,-0.08023
4,Number of Employee Relations Issues,Member Retention,-3.04919,-0.443132
5,# of Member complaints,Member Retention,0.222477,-0.110593
6,Member acquisition cost (CAC),Member Retention,-0.149718,-0.400956
7,Number of Compliance Issues,Member Retention,1.031561,-0.024723
8,Compliance Investigation Cycle Time,Member Retention,0.170672,-0.095192
9,Decline rate,Member Retention,0.875515,0.005983
