# Group Surveys Feature Determination

## Preparation

### Import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

## Function Definition

### Apply Different Scalings

In [3]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
import pandas as pd
import numpy as np

def process_scalings(df, scaler):
    if scaler == 'standard':
        scaler_obj = StandardScaler()
    elif scaler == 'minmax':
        scaler_obj = MinMaxScaler()
    elif scaler == 'robust':
        scaler_obj = RobustScaler()
    elif scaler == 'quartile':
        scaler_obj = QuantileTransformer(output_distribution='uniform')
    else:
        raise ValueError("Invalid scaler. Please choose 'standard', 'minmax', 'robust', or 'quartile'.")

    scaled_df = df.copy()  # Create a copy of the original DataFrame

    # Filter and scale only the numerical columns
    numeric_cols = scaled_df.select_dtypes(include=['float64', 'int64']).columns
    scaled_df[numeric_cols] = scaler_obj.fit_transform(scaled_df[numeric_cols])

    return scaled_df

### Min-Max Scaler

In [4]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scaling_df(df):
    scaler = MinMaxScaler()
    scaled_df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
    return scaled_df


### Display Correlation Matrix

In [5]:
def corr_matrix(df):

  # Assuming you have a DataFrame named 'df'
  correlation_matrix = df.corr()

  # Set the figure size
  fig, ax = plt.subplots(figsize=(12, 10))

  # Plot the correlation matrix as a heatmap
  sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
  plt.title('Correlation Matrix')

  plt.show()

### Display Scatter Plot

In [6]:
def scatterplot (df,x_column,y_column):
  import pandas as pd
  import seaborn as sns
  import matplotlib.pyplot as plt
  import numpy as np
  from sklearn.linear_model import LinearRegression
  from sklearn.preprocessing import PolynomialFeatures

  # Assuming your DataFrame is called 'df' and you want to check relationships between columns 'x' and 'y'
  x = df[x_column]
  y = df[y_column]

  # Create a scatter plot to visualize the relationship
  plt.scatter(x, y)
  plt.xlabel('x')
  plt.ylabel('y')
  plt.title('Scatter Plot')
  plt.show()

  # Fit a polynomial regression model
  degree = 2  # Degree of the polynomial
  poly_features = PolynomialFeatures(degree=degree)
  X_poly = poly_features.fit_transform(x.values.reshape(-1, 1))

  model = LinearRegression()
  model.fit(X_poly, y)

  # Calculate predicted values
  y_pred = model.predict(X_poly)

  # Plot the fitted curve
  plt.scatter(x, y)
  plt.plot(x, y_pred, color='red', label=f'Degree {degree} Polynomial Fit')
  plt.xlabel('x')
  plt.ylabel('y')
  plt.title('Polynomial Regression')
  plt.legend()
  plt.show()


### Correlation Significance

In [7]:
def check_correlation_significance(df, col1, col2):
  import scipy.stats as stats

  # Assuming your DataFrame is called 'df'

  # Extract the 'performance' and 'recreational_perceived_mean' columns
  performance = df[col1]
  recreational_mean = df[col2]

  # Perform the hypothesis test
  corr_coeff, p_value = stats.pearsonr(performance, recreational_mean)

  # Print the results
  print("Correlation coefficient:", corr_coeff)
  print("p-value:", p_value)

  # Compare the p-value to the significance level
  alpha = 0.05  # Significance level
  if p_value < alpha:
      print("The correlation is statistically significant (reject H0).")
  else:
      print("The correlation is not statistically significant (fail to reject H0).")


### Apply Scalings given DF and print correlation

In [8]:
def apply_scalings(df, column):
    correlations = []
    scalers = ['minmax', 'quartile']

    for scaler in scalers:
        df = process_scalings(df, scaler)

        # Calculate the correlations
        corr1 = df.corrwith(df[column], numeric_only=True)  # Set numeric_only parameter to True
        correlations.append(corr1)

    df_final = pd.concat(correlations, axis=1)
    df_final.columns = scalers

    return df_final

In [9]:
def apply_scalings(df, column):

  correlations=[]
  scalers= ['quartile']

  for scaler in scalers:

    df= process_scalings(df, scaler)

    # Calculate the correlations
    corr1 = df.corrwith(df[column])
    #filtered_series = corr1[abs(corr1) > 0.6]
    correlations.append(corr1)
    
  df_final = pd.concat(correlations, axis=1)
  df_final.columns = scalers

  #fileterd_df = df_final[(np.abs(df) > 0.6).any(axis=1)]

  return df_final


## Use of Function

In [10]:
in_file= r'/content/drive/MyDrive/Projects/tps/finals/data/4_individual_features.xlsx'
out_file= r'/content/drive/MyDrive/Projects/tps/finals/data/5_individual_features.xlsx'

In [11]:
df= pd.read_excel(in_file, index_col='Id')

df.head(12)

Unnamed: 0_level_0,group,indiv_spoken_time_ratio,average_turn_duration,avg_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_words_turn_ratio,text_joy,messages_sent,contribution_index,sentiment_avg,emotionality_avg,activity_entanglement,EMOTIONS_Happy,Groupflow_Beeflow,Groupflow_Leechflow,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory,coeval,project
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
1,1,0.148336,7.850746,56.210746,67,0.331683,1.985075,0.078613,0.298809,34,-0.64,0.303561,0.215562,0.487649,0.407447,0.497477,0.156358,0.6,0.716667,0.6,0.633333,0.583333,27,23,19,18,20,3.44,0.9,8.0
2,1,0.746757,33.948718,21.791169,78,0.386139,1.589744,0.720231,0.29295,75,-0.34,0.250396,0.236507,0.398994,0.417419,0.300955,0.220056,0.533333,0.666667,0.7,0.616667,0.633333,22,28,11,7,11,3.64,0.91,8.0
3,1,0.029611,8.076923,57.649231,13,0.064356,6.461538,0.080925,0.297727,38,-0.6,0.38186,0.254014,0.501416,0.410218,0.226931,0.238211,0.566667,0.683333,0.716667,0.533333,0.716667,23,28,9,6,7,3.98,0.9,8.0
4,1,0.045685,6.48,159.4216,25,0.123762,6.6,0.07052,0.3001,27,-0.7,0.250852,0.259828,0.493443,0.32277,0.256794,0.232465,0.566667,0.783333,0.733333,0.733333,0.633333,17,24,17,13,8,4.22,0.9,8.0
5,1,0.029611,5.526316,43.632632,19,0.094059,4.526316,0.049711,0.303593,22,-0.75,0.3056,0.244289,0.555323,0.472688,0.342162,0.120088,0.6,0.666667,0.483333,0.583333,0.433333,12,15,11,19,4,3.74,0.91,8.0
6,2,0.337682,8.518987,19.887848,158,0.364055,1.740506,0.233503,0.29778,22,-0.36,0.404612,0.268756,0.52,0.624326,0.332323,0.221886,0.683333,0.733333,0.883333,0.583333,0.416667,24,26,13,13,15,4.64,0.78,8.5
8,2,0.074762,11.461538,162.1672,26,0.059908,15.384615,0.169543,0.300151,8,-0.71,0.746794,0.579783,0.529475,0.68549,0.535406,0.037475,0.533333,0.683333,0.716667,0.5,0.516667,28,20,21,23,24,3.61,0.78,8.5
9,2,0.174862,8.822785,47.935696,79,0.182028,4.455696,0.260914,0.295444,8,-0.71,0.42855,0.236425,0.518237,0.553516,0.065323,0.31168,0.616667,0.7,0.716667,0.716667,0.55,17,23,16,13,14,4.37,0.77,8.5
10,2,0.069242,6.272727,25.912727,44,0.101382,3.113636,0.079188,0.298211,24,-0.32,0.259263,0.253813,0.497027,0.602558,0.221334,0.277272,0.533333,0.7,0.583333,0.55,0.583333,27,26,18,18,18,5.33,0.9,8.5
11,3,0.166562,9.851852,95.963077,27,0.174194,4.740741,0.166,0.268266,9,-0.7,0.41733,0.146854,0.54717,0.443599,0.399318,0.391894,0.583333,0.733333,0.616667,0.65,0.383333,23,23,20,23,18,5.57,0.91,8.0


In [12]:
#corr_matrix(df)

In [13]:
def drop_bad_measured(df):
  df = df.drop(df[df['group'].isin([1, 3, 5])].index)
  return df

df= drop_bad_measured(df)

df = min_max_scaling_df(df)

In [14]:
df.head()

Unnamed: 0,group,indiv_spoken_time_ratio,average_turn_duration,avg_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_words_turn_ratio,text_joy,messages_sent,contribution_index,sentiment_avg,emotionality_avg,activity_entanglement,EMOTIONS_Happy,Groupflow_Beeflow,Groupflow_Leechflow,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory,coeval,project
0,0.0,0.383208,0.11681,0.018495,0.122951,0.829028,0.024924,0.340743,0.808161,0.151515,0.788732,0.450749,0.239567,0.844576,0.743922,0.528443,0.483225,0.6,0.714286,1.0,0.263158,0.041667,0.764706,0.818182,0.285714,0.333333,0.590909,0.837545,0.622222,0.833333
1,0.0,0.072662,0.187647,0.286997,0.014754,0.02364,0.857889,0.21153,0.882811,0.045455,0.295775,1.0,1.0,0.884774,0.881614,0.876434,0.081596,0.15,0.5,0.5,0.0,0.291667,1.0,0.545455,0.857143,0.888889,1.0,0.651625,0.622222,0.833333
2,0.0,0.190894,0.124124,0.071425,0.058197,0.347016,0.190685,0.39612,0.734617,0.045455,0.295775,0.489173,0.160519,0.837094,0.584517,0.070929,0.678788,0.4,0.571429,0.5,0.684211,0.375,0.352941,0.681818,0.5,0.333333,0.545455,0.788809,0.6,0.833333
3,0.0,0.066143,0.062736,0.029865,0.029508,0.133466,0.108753,0.02899,0.821753,0.166667,0.84507,0.217443,0.203033,0.747101,0.694919,0.338261,0.60385,0.15,0.571429,0.1,0.157895,0.458333,0.941176,0.818182,0.642857,0.611111,0.727273,0.962094,0.888889,0.833333
4,0.2,0.100326,0.041819,0.123451,0.036066,0.322468,0.209826,0.419539,0.91801,0.060606,0.197183,0.444821,0.364148,0.593326,0.202496,0.131243,0.001431,0.55,0.571429,0.55,0.263158,0.25,0.411765,0.454545,0.714286,0.611111,0.590909,0.662455,0.911111,0.333333


## Regression Models

In [15]:
def grid_search_cv(df):
  from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
  from sklearn.metrics import make_scorer, mean_squared_error, r2_score
  from sklearn.linear_model import LinearRegression
  from sklearn.tree import DecisionTreeRegressor
  from sklearn.ensemble import RandomForestRegressor

  # Split the data into training and test sets
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1], df.iloc[:, -1], test_size=0.2, random_state=42)

  # Define the list of models to evaluate
  models = [
      ('Linear Regression', LinearRegression()),
      ('Decision Tree', DecisionTreeRegressor()),
      ('Random Forest', RandomForestRegressor())
  ]

  # Define the grid of hyperparameters for each model
  param_grid = {
      'Linear Regression': {},
      'Decision Tree': {'max_depth': [None, 5, 10]},
      'Random Forest': {'n_estimators': [100, 200, 300]}
  }

  # Define the scoring metrics for evaluation
  scoring = {
      'RMSE': make_scorer(mean_squared_error, squared=False),
      'R2': make_scorer(r2_score)
  }

  # Perform grid search and cross-validation for each model
  results = {}
  for name, model in models:
      grid_search = GridSearchCV(model, param_grid[name], scoring=scoring, refit='RMSE', cv=5)
      grid_search.fit(X_train, y_train)
      
      # Cross-validation scores for all parameter combinations
      cv_results = cross_validate(grid_search.best_estimator_, X_train, y_train, scoring=scoring, cv=5)
      
      results[name] = {
          'best_params': grid_search.best_params_,
          'best_estimator': grid_search.best_estimator_,
          'cv_RMSE': cv_results['test_RMSE'].mean(),
          'cv_R2': cv_results['test_R2'].mean()
      }

  # Print the results for all models and parameter combinations
  for name, result in results.items():
      print("Model:", name)
      print("Best Parameters:", result['best_params'])
      print("CV RMSE:", result['cv_RMSE'])
      print("CV R2:", result['cv_R2'])
      print()

  # Train the best model on the entire training dataset
  best_model = min(results, key=lambda x: results[x]['cv_RMSE'])
  final_model = results[best_model]['best_estimator']
  final_model.fit(X_train, y_train)

  # Evaluate the performance of the best model on the test set
  y_pred = final_model.predict(X_test)
  mse = mean_squared_error(y_test, y_pred)
  rmse = np.sqrt(mse)
  r2 = r2_score(y_test, y_pred)
  print("Best Model:", best_model)
  print("Test RMSE:", rmse)
  print("Test R2:", r2)


In [16]:
df = df.drop(columns=['group','project', 'theory']) #'coeval', 'project','theory','group'

In [17]:
grid_search_cv(df)

Model: Linear Regression
Best Parameters: {}
CV RMSE: 0.9735106565534346
CV R2: -163.5666619962373

Model: Decision Tree
Best Parameters: {'max_depth': 10}
CV RMSE: 0.4296976225641028
CV R2: -25.78872865412999

Model: Random Forest
Best Parameters: {'n_estimators': 200}
CV RMSE: 0.3295141410491841
CV R2: -9.13382170815414

Best Model: Random Forest
Test RMSE: 0.21930819961814682
Test R2: -10.916858874622292
