# OLS Predictor Model

## Preparation

### Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler, QuantileTransformer
import statsmodels.api as sm
import random
import itertools
from google.colab import drive
# Mount Google Drive at /content/drive; the data paths defined further down
# in this notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


### Display Options

In [None]:
# Raise pandas' display limits: up to 100 columns and 1000 rows are rendered
# before the output is truncated.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000

## Function Definition

### Min-Max Scaler

In [None]:
def min_max_scaling_df(df):
    """Return a copy of ``df`` with every column rescaled by ``MinMaxScaler``.

    A fresh ``MinMaxScaler`` (default settings) is fitted on ``df`` itself and
    applied to it, so each column is scaled independently using its own
    minimum and maximum.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns can all be handled by ``MinMaxScaler``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same column labels and the same index as ``df``.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(df)

    # fit_transform returns a bare array, so both axes have to be restored.
    # Passing the index explicitly keeps scaled rows aligned with the source
    # frame; without it the result silently gets a fresh 0..n-1 index.
    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

### Prepare Dataframe for Modeling

In [None]:
def prepare_df_for_modeling(df):
    """Reduce ``df`` to the rows and columns used for modeling.

    Steps, in order:
      1. keep only the columns matched by
         ``select_dtypes(include=['int', 'float'])``;
      2. keep only the rows whose ``theory`` value is at least 3;
      3. renumber the remaining rows from 0, discarding the old index.

    The frame passed in is left untouched; a new frame is returned.
    """
    numeric_only = df.select_dtypes(include=['int', 'float'])
    kept_rows = numeric_only[numeric_only['theory'] >= 3]
    return kept_rows.reset_index(drop=True)

### Select Columns by Index

In [None]:
def select_columns_by_index(df, column_indexes):
    """Pick columns of ``df`` by integer position.

    NOTE: a later cell in this notebook defines another function with this
    same name; once that cell runs, it replaces this definition.

    Parameters
    ----------
    df : pandas.DataFrame
    column_indexes : list of int
        Positions of the columns to keep.

    Returns
    -------
    tuple
        ``(subset, names)`` where ``subset`` is an independent copy of the
        chosen columns and ``names`` is the list of their labels.
    """
    names = df.columns[column_indexes].tolist()
    subset = df.iloc[:, column_indexes].copy()
    return subset, names

In [None]:
def select_columns_by_index(df, column_indexes):
    """Pick columns of ``df`` by position and always add the last column.

    The position of the final column of ``df`` is appended to the requested
    positions before selecting.

    Parameters
    ----------
    df : pandas.DataFrame
    column_indexes : iterable of int
        Positions of the columns to keep. It is not modified.

    Returns
    -------
    tuple
        ``(selected_columns, selected_column_names)``: an independent copy of
        the requested columns followed by the last column of ``df``, and the
        matching list of column labels.
    """
    # Build a new list instead of appending to the argument. Appending to the
    # caller's list made it grow by one entry on every call, so reusing the
    # same list selected the last column again and again.
    positions = [*column_indexes, len(df.columns) - 1]

    selected_columns = df.iloc[:, positions].copy()
    selected_column_names = df.columns[positions].tolist()

    return selected_columns, selected_column_names

### Generate Index

In [None]:
def generate_features_index(my_list):
    """Enumerate candidate lists of column positions.

    One position is drawn from each of four pools, in this order:
    ``first_pool``, ``second_pool``, ``third_pool`` and ``third_pool`` again.
    Every possible draw (the Cartesian product) becomes a list, and
    ``my_list`` is appended to the end of each one.

    Parameters
    ----------
    my_list : list
        Entries added to the end of every combination.

    Returns
    -------
    list of list
        One list per combination, in ``itertools.product`` order.
    """
    # NOTE(review): a disabled line in an earlier revision listed
    # 'indiv_spoken_time_ratio', 'contribution_index' and
    # 'in_group_loyality_score' as selected features - presumably related to
    # the positions skipped below (1 and 22); confirm against the dataset.
    first_pool = [0, *range(2, 21)]
    second_pool = [21, *range(23, 38)]
    third_pool = list(range(38, 60))

    return [
        list(picked) + my_list
        for picked in itertools.product(
            first_pool, second_pool, third_pool, third_pool
        )
    ]

### Rescale Metric

In [None]:
def rescale_rmse(rmse, original_column):
    """Scale ``rmse`` by the value range of ``original_column``.

    Intended for turning an RMSE measured on a scaled target back into the
    target's own units, so ``original_column`` should hold the unscaled
    values.

    Parameters
    ----------
    rmse : number
        Error value to rescale.
    original_column : object exposing ``min()`` and ``max()``
        For example a pandas Series.

    Returns
    -------
    ``rmse * (max - min)`` of ``original_column``.
    """
    lowest, highest = original_column.min(), original_column.max()
    return rmse * (highest - lowest)


### Return Dataframe by Columns

In [None]:
def return_df_by_columns(df, column_names):
    """Build a scaled modeling frame from the chosen columns plus ``theory``.

    The columns named in ``column_names`` are taken from ``df``, the
    ``theory`` column is placed after them as the last column, and the
    combined frame is passed through ``min_max_scaling_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``theory`` and every name in ``column_names``.
    column_names : list of str

    Returns
    -------
    pandas.DataFrame
        The scaled frame, with ``theory`` as its last column.
    """
    pieces = [df[column_names], df['theory']]
    combined = pd.concat(pieces, axis=1)
    return min_max_scaling_df(combined)

### Evaluate Regression Models by Cross-Validation

In [None]:
def evaluate_regression_models(df):
    """Cross-validate a linear and a ridge regression on ``df``.

    The last column of ``df`` is the target; all columns before it are the
    predictors. Both models are fitted on each of the 5 ``KFold`` training
    splits and scored on the matching held-out split.

    Returns
    -------
    dict
        ``{'Linear Regression': {...}, 'Ridge Regression': {...}}`` where
        each inner dict holds the fold-averaged MSE, RMSE, MAE and R2.
        The RMSE entry is the mean of the per-fold RMSEs, not the square
        root of the mean MSE.
    """
    predictors = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    # Factories rather than instances, so every fold starts from a fresh
    # estimator. Ridge's alpha is the regularization strength; adjust it
    # here if needed.
    model_factories = {
        'Linear Regression': LinearRegression,
        'Ridge Regression': lambda: Ridge(alpha=1.0),
    }
    metric_labels = (
        'Mean Squared Error (MSE)',
        'Root Mean Squared Error (RMSE)',
        'Mean Absolute Error (MAE)',
        'R-squared (R2)',
    )
    fold_scores = {
        model_name: {label: [] for label in metric_labels}
        for model_name in model_factories
    }

    splitter = KFold(n_splits=5)
    for train_rows, test_rows in splitter.split(predictors):
        X_train, X_test = predictors.iloc[train_rows], predictors.iloc[test_rows]
        y_train, y_test = target.iloc[train_rows], target.iloc[test_rows]

        for model_name, make_model in model_factories.items():
            estimator = make_model()
            estimator.fit(X_train, y_train)
            predicted = estimator.predict(X_test)

            fold_mse = mean_squared_error(y_test, predicted)
            scores = fold_scores[model_name]
            scores['Mean Squared Error (MSE)'].append(fold_mse)
            scores['Root Mean Squared Error (RMSE)'].append(np.sqrt(fold_mse))
            scores['Mean Absolute Error (MAE)'].append(
                mean_absolute_error(y_test, predicted)
            )
            scores['R-squared (R2)'].append(r2_score(y_test, predicted))

    # Collapse the per-fold lists into one average per metric and model.
    return {
        model_name: {
            label: np.mean(values) for label, values in per_metric.items()
        }
        for model_name, per_metric in fold_scores.items()
    }

In [None]:
def evaluate_linear_regression(df):
    """Cross-validate an ordinary least-squares regression on ``df``.

    The last column of ``df`` is the target; all columns before it are the
    predictors. A fresh ``LinearRegression`` is fitted on each of the 5
    ``KFold`` training splits and scored on the matching held-out split.

    NOTE: a later cell in this notebook defines another function with this
    same name; once that cell runs, it replaces this definition.

    Returns
    -------
    dict
        ``{'Linear Regression': {...}}`` with the fold-averaged MSE, RMSE,
        MAE and R2.
    """
    predictors, response = df.iloc[:, :-1], df.iloc[:, -1]

    per_fold = {'mse': [], 'rmse': [], 'mae': [], 'r2': []}

    for fit_rows, holdout_rows in KFold(n_splits=5).split(predictors):
        regressor = LinearRegression()
        regressor.fit(predictors.iloc[fit_rows], response.iloc[fit_rows])

        actual = response.iloc[holdout_rows]
        predicted = regressor.predict(predictors.iloc[holdout_rows])

        fold_mse = mean_squared_error(actual, predicted)
        per_fold['mse'].append(fold_mse)
        per_fold['rmse'].append(np.sqrt(fold_mse))
        per_fold['mae'].append(mean_absolute_error(actual, predicted))
        per_fold['r2'].append(r2_score(actual, predicted))

    # Report the average of each metric across the folds.
    return {
        'Linear Regression': {
            'Mean Squared Error (MSE)': np.mean(per_fold['mse']),
            'Root Mean Squared Error (RMSE)': np.mean(per_fold['rmse']),
            'Mean Absolute Error (MAE)': np.mean(per_fold['mae']),
            'R-squared (R2)': np.mean(per_fold['r2']),
        }
    }

In [None]:
def evaluate_linear_regression(df):
    """Cross-validate a linear regression and return a fitted model with scores.

    The last column of ``df`` is the target; all columns before it are the
    predictors. A fresh ``LinearRegression`` is fitted on each of the 5
    ``KFold`` training splits and scored on the matching held-out split.

    Returns
    -------
    dict
        ``{'model': ..., 'results': ...}``.

        * ``'results'`` is ``{'Linear Regression': {...}}`` with the
          fold-averaged MSE, RMSE, MAE and R2.
        * ``'model'`` is the estimator fitted during the LAST fold only. It
          was trained on that fold's training split, not on all rows of
          ``df``, so its coefficients reflect that subset.
    """
    predictors = df.iloc[:, :-1]
    target = df.iloc[:, -1]

    # (label, scoring function) pairs, in the order they appear in the output.
    scorers = (
        ('Mean Squared Error (MSE)', mean_squared_error),
        ('Root Mean Squared Error (RMSE)',
         lambda truth, guess: np.sqrt(mean_squared_error(truth, guess))),
        ('Mean Absolute Error (MAE)', mean_absolute_error),
        ('R-squared (R2)', r2_score),
    )
    fold_values = {label: [] for label, _ in scorers}

    folds = KFold(n_splits=5)
    for train_rows, test_rows in folds.split(predictors):
        # Rebound on every iteration, so after the loop this name refers to
        # the model from the final fold.
        model_linear = LinearRegression()
        model_linear.fit(predictors.iloc[train_rows], target.iloc[train_rows])

        y_holdout = target.iloc[test_rows]
        y_predicted = model_linear.predict(predictors.iloc[test_rows])

        for label, scorer in scorers:
            fold_values[label].append(scorer(y_holdout, y_predicted))

    results = {
        'Linear Regression': {
            label: np.mean(values) for label, values in fold_values.items()
        }
    }

    return {'model': model_linear, 'results': results}


## Using the Functions

### Fetch the data

In [None]:
# Workbook with the per-individual features; read by the loading cell below.
in_file = r'/content/drive/MyDrive/Projects/tps/finals/data/3_individual_features.xlsx'
# Destination workbook path. Nothing in the cells shown here writes to it.
out_file = r'/content/drive/MyDrive/Projects/tps/finals/data/6_model_regression_full_features.xlsx'

In [None]:
# Columns removed right after loading; they take no part in the modeling below.
not_used_columns = [
    'avg_time_without_speaking_ratio',
    'max_time_without_speaking_ratio',
    'avg_turns_without_speaking_ratio',
    'max_turns_without_speaking_ratio',
    'avg_words_turn_ratio',
    'max_words_turn',
    'max_words_turn_ratio',
    'messages_total',
    'alter_art',
    'alter_nudges',
    'complexity_avg',
    'dummy_question1',
    'dummy_question2',
    'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
    'conservation',
    'transcendence',
    'coeval',
    'project',
]

# Load the workbook with 'Id' as the row index, then drop the unused columns.
df = pd.read_excel(in_file, index_col='Id').drop(columns=not_used_columns)

### Prepare the data

In [None]:
# Keep the numeric columns and the rows with theory >= 3, then min-max scale
# every remaining column, the 'theory' target included.
# NOTE: the scaler is fitted on all rows here, before any cross-validation
# split is made further down.
df = min_max_scaling_df(prepare_df_for_modeling(df))

In [None]:
# A candidate feature subset. The narrowing step is left disabled, so the
# preview below shows the full scaled frame.
selected_columns = [
    'avg_words_turn',
    'activity_entanglement',
    'social_perceived',
    'health_perceived',
    'indiv_spoken_time_ratio',
    'contribution_index',
]
# df = return_df_by_columns(df, selected_columns)
df.head()

Unnamed: 0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,messages_sent,contribution_index,sentiment_avg,emotionality_avg,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
0,0.0091,0.159657,0.082674,0.123789,0.087042,0.010795,0.04914,0.747201,0.034444,0.006818,0.023084,0.237878,0.082873,0.642186,0.38644,0.307156,0.160233,0.779269,0.288472,0.251824,0.489964,0.242424,0.394366,0.374747,0.158705,0.1,0.556037,0.483112,0.241864,0.089009,0.356028,0.495483,0.197535,0.505977,0.353427,0.763429,0.340511,0.184244,0.512195,0.326087,0.395833,0.295455,0.034483,0.305,0.551724,0.482143,0.52381,0.710526,0.55,0.5,0.35,0.6,0.291667,0.421053,0.5,0.941176,0.611111,0.733333,0.631579,0.818182,0.171206
1,0.047339,0.866398,0.527474,0.735671,0.022087,0.001899,0.058149,0.889212,0.013586,0.011364,0.093908,1.0,0.487778,0.44984,0.491922,0.562257,0.141686,0.667926,0.351012,0.132738,0.921348,0.55303,0.816901,0.308486,0.207085,0.2,0.260331,0.448759,0.115995,0.107596,0.414859,0.450916,0.212901,0.392314,0.540566,0.458956,0.479239,0.362381,0.146341,0.065217,0.0,0.568182,0.448276,0.195,0.37931,0.589286,0.642857,0.657895,0.275,0.5,0.15,0.4,0.541667,0.368421,0.615385,0.647059,0.888889,0.2,0.052632,0.409091,0.249027
2,0.001514,0.019441,0.086528,0.129092,0.089756,0.002743,0.004914,0.050058,0.270625,0.025,0.005341,0.046945,0.152957,0.413946,0.525981,0.457121,0.569071,0.758693,0.325351,0.214353,0.537218,0.272727,0.450704,0.472333,0.247524,0.2,0.601957,0.161053,0.083552,0.05381,0.762142,0.514874,0.201805,0.347123,0.522739,0.344269,0.51878,0.43731,0.512195,0.326087,0.208333,0.681818,0.551724,0.42,0.586207,0.464286,0.5,0.473684,0.25,0.5,0.25,0.466667,0.583333,0.105263,0.807692,0.705882,0.888889,0.066667,0.0,0.227273,0.381323
3,0.002541,0.038425,0.059311,0.091652,0.281816,0.0208,0.014742,0.204979,0.277931,0.056818,0.00851,0.08105,0.096591,0.013899,0.728057,1.0,0.630565,0.803801,0.255744,0.26519,0.472923,0.189394,0.309859,0.309055,0.260954,0.2,0.575362,0.264618,0.211134,0.111873,0.557981,0.676648,0.067056,0.43849,0.446423,0.390537,0.506265,0.404965,0.170732,0.065217,0.0,0.431818,0.310345,0.15,0.758621,0.517857,0.619048,0.578947,0.45,0.5,0.25,0.866667,0.625,0.736842,0.615385,0.352941,0.666667,0.6,0.368421,0.272727,0.474708
4,0.001514,0.019441,0.043057,0.069292,0.063305,0.003457,0.009828,0.127518,0.168522,0.029545,0.00544,0.048019,0.078935,0.190567,0.711064,0.681968,0.282712,0.870162,0.257264,0.289959,0.215557,0.151515,0.239437,0.377288,0.22506,0.1,0.781762,0.371012,0.000905,0.136371,0.502259,0.324557,0.298065,0.691679,0.155269,0.522797,0.261518,0.44119,0.317073,0.217391,0.3125,0.568182,0.068966,0.285,0.310345,0.5,0.238095,0.263158,0.375,0.333333,0.35,0.4,0.0,0.263158,0.153846,0.058824,0.166667,0.2,0.684211,0.090909,0.287938


In [None]:
# Show (rows, columns) of the prepared, scaled frame.
df.shape

(55, 61)

### Using the Model

In [None]:
# Feature subset to evaluate; return_df_by_columns places 'theory' after
# these columns as the target.
selected_columns = [
    'text_anger',
    'messages_sent',
    'social_perceived',
    'health_likelihood',
    'indiv_spoken_time_ratio',
    'contribution_index',
]
df_test = return_df_by_columns(df, selected_columns)

# Cross-validated metrics for both the linear and the ridge model.
results = evaluate_regression_models(df_test)
print(results)

df_test.head()

{'Linear Regression': {'Mean Squared Error (MSE)': 0.04989645695429257, 'Root Mean Squared Error (RMSE)': 0.21889827678368956, 'Mean Absolute Error (MAE)': 0.17345000919350376, 'R-squared (R2)': 0.24033703885033697}, 'Ridge Regression': {'Mean Squared Error (MSE)': 0.0601158445591935, 'Root Mean Squared Error (RMSE)': 0.24127807336374185, 'Mean Absolute Error (MAE)': 0.19834372617344248, 'R-squared (R2)': 0.08261117514074016}}


Unnamed: 0,text_anger,messages_sent,social_perceived,health_likelihood,indiv_spoken_time_ratio,contribution_index,theory
0,0.288472,0.242424,0.55,0.395833,0.159657,0.394366,0.171206
1,0.351012,0.55303,0.275,0.0,0.866398,0.816901,0.249027
2,0.325351,0.272727,0.25,0.208333,0.019441,0.450704,0.381323
3,0.255744,0.189394,0.45,0.0,0.038425,0.309859,0.474708
4,0.257264,0.151515,0.375,0.3125,0.019441,0.239437,0.287938


In [None]:
selected_columns = ['text_anger', 'messages_sent', 'social_perceived', 'health_likelihood', 'indiv_spoken_time_ratio', 'contribution_index']
df_test = return_df_by_columns(df, selected_columns)

# Call the function and get the model and results
output = evaluate_linear_regression(df_test)

# Access the trained model
model = output['model']

coefficients = model.coef_

# Print the coefficients next to the names of the columns they were fitted on.
# The names must come from df_test, the frame given to the model, whose last
# column is the target. Taking them from the full `df` pairs each coefficient
# with one of the first columns of the whole dataset, which mislabels it.
feature_names = df_test.columns[:-1]
for feature, coefficient in zip(feature_names, coefficients):
    print(f"{feature}: {coefficient}")


indiv_spoken_time: 0.5057372089509534
indiv_spoken_time_ratio: -0.29436897429072006
average_turn_duration: 0.3674918057014924
average_turn_duration_ratio: -0.7643381962723776
avg_time_without_speaking: -0.22283510013414937
max_time_without_speaking: 0.437030389810232


In [None]:
selected_columns = [
    'text_anger',
    'messages_sent',
    'social_perceived',
    'health_likelihood',
    'indiv_spoken_time_ratio',
    'contribution_index',
]
df_test = return_df_by_columns(df, selected_columns)

# Run the cross-validation and keep the fitted model it returns.
output = evaluate_linear_regression(df_test)
model = output['model']
coefficients = model.coef_

# Every column of df_test except the last one (the target) is a predictor,
# so those names line up one-to-one with the coefficients.
feature_names = df_test.columns[:-1]
for feature, coefficient in zip(feature_names, coefficients):
    print(f"{feature}: {coefficient}")


text_anger: 0.5057372089509534
messages_sent: -0.29436897429072006
social_perceived: 0.3674918057014924
health_likelihood: -0.7643381962723776
indiv_spoken_time_ratio: -0.22283510013414937
contribution_index: 0.437030389810232


In [None]:
rmse = 0.125

# rescale_rmse needs the target in its ORIGINAL units. By this point `df` has
# been min-max scaled, so df['theory'] has a range of exactly 1 and passing it
# returns the RMSE unchanged. Rebuild the unscaled target from the source
# workbook instead, with the same row filter used for modeling.
unscaled_theory = prepare_df_for_modeling(
    pd.read_excel(in_file, index_col='Id')
)['theory']

rescaled_rmse = rescale_rmse(rmse, unscaled_theory)
print("Rescaled RMSE:", rescaled_rmse)

Rescaled RMSE: 0.125
