# Group Surveys Feature Determination

## Preparation

### Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Raise pandas' display limits so wide and long tables are not truncated.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 1000

## Function Definition

### Read and Concatenate Tables

In [None]:
def read_collapse_tables(whats_features, meetings_features, surveys_features, grades):
    """Load the four feature tables and join them column-wise on their index.

    Args:
        whats_features: path of the ';'-delimited WhatsApp feature CSV.
        meetings_features: path of the meetings feature Excel workbook.
        surveys_features: path of the survey feature CSV.
        grades: path of the grades Excel workbook.

    Returns:
        One DataFrame holding the meetings, WhatsApp, survey and grade
        columns side by side; rows with any missing value are dropped.
    """
    whats_columns = [
        'id', 'group', 'messages_sent', 'messages_total', 'contribution_index',
        'ego_art', 'ego_nudges', 'alter_art', 'alter_nudges',
        'sentiment_avg', 'emotionality_avg', 'complexity_avg',
        'influence_message_avg', 'influence_total_in',
        'influence_message_avg_in', 'influence_total',
        'contribution_index_oscillation', 'activity_entanglement',
        'ALTERNATIVE_REALITIES_Treehugger', 'ALTERNATIVE_REALITIES_Fatherlander',
        'ALTERNATIVE_REALITIES_Spiritualism', 'ALTERNATIVE_REALITIES_Nerd',
        'EMOTIONS_Fear', 'EMOTIONS_Happy', 'EMOTIONS_Sad', 'EMOTIONS_Anger',
        'Groupflow_Beeflow', 'Groupflow_Leechflow', 'Groupflow_Antflow',
    ]
    survey_columns = [
        'ID', 'group',
        'ethical_likelihood', 'financial_likelihood', 'health_likelihood',
        'recreational_likelihood', 'social_likelihood', 'total_likelihood',
        'ethical_perceived', 'financial_perceived', 'health_perceived',
        'recreational_perceived', 'social_perceived', 'total_perceived',
        'O', 'C', 'E', 'A', 'N',
        'harm_care_score', 'fairness_reciprocity_score',
        'in_group_loyality_score', 'authority_respect_score',
        'purity_sanctity_score', 'dummy_question1', 'dummy_question2',
        'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10',
        'conservation', 'transcendence',
    ]
    grade_columns = ['ID', 'TOTAL TEORÍA (6)', 'COEVAL (1)', 'TRABAJO FINAL (1)']

    # WhatsApp features, indexed by their 'id' column.
    whats = pd.read_csv(
        whats_features,
        index_col='id',
        delimiter=";",
        header=0,
        usecols=whats_columns,
    )

    # Meeting features: sheet rows 1-3 (0-based, directly below the header)
    # are skipped, and the positional index is shifted up by one before the
    # table is joined with the others.
    meetings = pd.read_excel(meetings_features, header=0, skiprows=range(1, 4))
    meetings = meetings.rename_axis('Id')
    meetings.index += 1
    meetings = meetings.drop(columns=['group', 'shown_face', 'ID'])

    # Survey features; 'group' is dropped because the WhatsApp table already
    # contributes a 'group' column.
    surveys = pd.read_csv(surveys_features, index_col=0, usecols=survey_columns)
    surveys = surveys.drop(columns=['group'])

    # Grades, indexed by 'ID' and given short English column names.
    grade_table = pd.read_excel(grades, index_col='ID', usecols=grade_columns)
    grade_table.columns = ['theory', 'coeval', 'project']

    combined = pd.concat([meetings, whats, surveys, grade_table], axis=1)
    return combined.dropna()

### Apply Different Scalings

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def process_scalings(df, scaler):
    """Return a copy of ``df`` with its float64/int64 columns rescaled.

    ``scaler`` picks the transformer: 'standard', 'minmax', 'robust' or
    'quartile' (a uniform ``QuantileTransformer`` using one quantile per row
    of ``df``).  Any other value raises ``ValueError``.  Columns of other
    dtypes are carried over unchanged.

    Note: a later cell defines ``process_scalings`` again; when the notebook
    is run top to bottom that later definition replaces this one.
    """
    # Each entry builds its scaler lazily so only the requested one is created.
    factories = (
        ('standard', lambda: StandardScaler()),
        ('minmax', lambda: MinMaxScaler()),
        ('robust', lambda: RobustScaler()),
        ('quartile', lambda: QuantileTransformer(output_distribution='uniform', n_quantiles= len(df))),
    )
    for name, build in factories:
        if scaler == name:
            transformer = build()
            break
    else:
        raise ValueError("Invalid scaler. Please choose 'standard', 'minmax', 'robust', or 'quartile'.")

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()
    targets = result.select_dtypes(include=['float64', 'int64']).columns
    result[targets] = transformer.fit_transform(result[targets])
    return result


In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
import pandas as pd
import numpy as np

def process_scalings(df, scaler):
    """Scale the numeric columns of ``df`` with the chosen scikit-learn scaler.

    Args:
        df: input DataFrame; it is not modified.
        scaler: one of 'standard', 'minmax', 'robust' or 'quartile'
            ('quartile' uses a uniform ``QuantileTransformer``).

    Returns:
        A copy of ``df`` in which every numeric column holds its scaled
        values; non-numeric columns are left as they are.  When there is
        nothing to scale (no rows or no numeric columns) the plain copy is
        returned.

    Raises:
        ValueError: if ``scaler`` is not one of the supported names.
    """
    if scaler not in ('standard', 'minmax', 'robust', 'quartile'):
        raise ValueError("Invalid scaler. Please choose 'standard', 'minmax', 'robust', or 'quartile'.")

    scaled_df = df.copy()  # never touch the caller's frame

    # 'number' covers every numeric width, not only float64/int64.
    numeric_cols = scaled_df.select_dtypes(include='number').columns
    if scaled_df.empty or numeric_cols.empty:
        # The scalers reject empty input; there is nothing to scale anyway.
        return scaled_df

    if scaler == 'standard':
        scaler_obj = StandardScaler()
    elif scaler == 'minmax':
        scaler_obj = MinMaxScaler()
    elif scaler == 'robust':
        scaler_obj = RobustScaler()
    else:
        # The default of 1000 quantiles exceeds the sample count of small
        # tables; scikit-learn then clips it and warns.  Cap it up front.
        n_quantiles = min(1000, len(scaled_df))
        scaler_obj = QuantileTransformer(output_distribution='uniform', n_quantiles=n_quantiles)

    scaled_df[numeric_cols] = scaler_obj.fit_transform(scaled_df[numeric_cols])

    return scaled_df

### Min-Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scaling_df(df):
    """Min-max scale every column of ``df`` with a default ``MinMaxScaler``.

    Args:
        df: DataFrame whose columns are all accepted by ``MinMaxScaler``.

    Returns:
        A new DataFrame with the same columns and the same index as ``df``
        holding the scaled values.
    """
    scaler = MinMaxScaler()
    # fit_transform returns a bare array; restore both the column labels and
    # the original index so rows stay addressable by their identifiers.
    scaled_df = pd.DataFrame(
        scaler.fit_transform(df),
        columns=df.columns,
        index=df.index,
    )
    return scaled_df


### Display Correlation Matrix

In [None]:
def corr_matrix(df):
    """Show the pairwise correlation matrix of ``df`` as a heatmap.

    Args:
        df: DataFrame to analyse; only its numeric columns are correlated.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # numeric_only=True keeps text columns from breaking the computation.
    correlation_matrix = df.corr(numeric_only=True)

    fig, ax = plt.subplots(figsize=(10, 8))

    # Draw the matrix itself; without this call the figure stays empty.
    sns.heatmap(correlation_matrix, cmap='coolwarm', ax=ax)

    ax.set_title('Correlation Matrix')
    ax.set_xlabel('Variables')
    ax.set_ylabel('Variables')

    # Rotate the x-axis labels for better readability
    plt.xticks(rotation=90)

    plt.show()

### Display Scatter Plot

In [None]:
def scatterplot(df, x_column, y_column):
    """Plot two columns against each other, then again with a quadratic fit.

    Two figures are shown: a plain scatter plot, and the same scatter with a
    degree-2 polynomial regression curve drawn on top.

    Args:
        df: DataFrame containing both columns.
        x_column: name of the column used for the x axis.
        y_column: name of the column used for the y axis.

    Returns:
        None. Both figures are displayed with ``plt.show()``.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    x = df[x_column]
    y = df[y_column]

    # First figure: raw relationship, labelled with the real column names.
    plt.scatter(x, y)
    plt.xlabel(str(x_column))
    plt.ylabel(str(y_column))
    plt.title('Scatter Plot')
    plt.show()

    # Fit a polynomial regression model
    degree = 2  # Degree of the polynomial
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(x.values.reshape(-1, 1))

    model = LinearRegression()
    model.fit(X_poly, y)

    y_pred = np.asarray(model.predict(X_poly))

    # A line plot connects points in the order given, so sort by x first;
    # otherwise the fitted curve zig-zags across the figure.
    order = np.argsort(x.values)

    # Second figure: scatter plus fitted curve.
    plt.scatter(x, y)
    plt.plot(x.values[order], y_pred[order], color='red', label=f'Degree {degree} Polynomial Fit')
    plt.xlabel(str(x_column))
    plt.ylabel(str(y_column))
    plt.title('Polynomial Regression')
    plt.legend()
    plt.show()


### Correlation Significance

In [None]:
def check_correlation_significance(df, col1, col2, alpha=0.05):
    """Test whether the Pearson correlation of two columns is significant.

    Prints the correlation coefficient, the p-value and the verdict at the
    given significance level, and also returns the two numbers so they can
    be used programmatically.

    Args:
        df: DataFrame containing both columns.
        col1: name of the first column.
        col2: name of the second column.
        alpha: significance level the p-value is compared against.

    Returns:
        A ``(corr_coeff, p_value)`` tuple as computed by
        ``scipy.stats.pearsonr``.
    """
    import scipy.stats as stats

    # Perform the hypothesis test
    corr_coeff, p_value = stats.pearsonr(df[col1], df[col2])

    print("Correlation coefficient:", corr_coeff)
    print("p-value:", p_value)

    # Compare the p-value to the significance level
    if p_value < alpha:
        print("The correlation is statistically significant (reject H0).")
    else:
        print("The correlation is not statistically significant (fail to reject H0).")

    return corr_coeff, p_value


### Apply Scalings to a DataFrame and Return Correlations

In [None]:
def apply_scalings(df, column, scalers=('minmax', 'quartile')):
    """Correlate every numeric column with ``column`` under several scalings.

    Args:
        df: input DataFrame; it is not modified.
        column: name of the column every other column is correlated with.
        scalers: scaler names understood by ``process_scalings``; one
            result column is produced per name.

    Returns:
        A DataFrame indexed by the column names of ``df`` with one column of
        correlation coefficients per scaler.
    """
    scalers = list(scalers)
    correlations = []

    for scaler in scalers:
        # Always scale the ORIGINAL frame; rebinding ``df`` here would feed
        # each scaler the output of the previous one.
        scaled = process_scalings(df, scaler)

        # numeric_only=True skips text columns instead of failing on them.
        correlations.append(scaled.corrwith(scaled[column], numeric_only=True))

    df_final = pd.concat(correlations, axis=1)
    df_final.columns = scalers

    return df_final

In [None]:
def apply_scalings(df, column, scalers=('quartile',)):
    """Correlate every numeric column with ``column`` after scaling.

    Args:
        df: input DataFrame; it is not modified.
        column: name of the column every other column is correlated with.
        scalers: scaler names understood by ``process_scalings``; one
            result column is produced per name.

    Returns:
        A DataFrame indexed by the column names of ``df`` with one column of
        correlation coefficients per scaler.  Callers can keep only strong
        correlations afterwards, e.g.
        ``result[(np.abs(result) > 0.6).any(axis=1)]``.
    """
    scalers = list(scalers)
    correlations = []

    for scaler in scalers:
        # Scale the original frame each time rather than rebinding ``df``,
        # so scalings never stack when more than one scaler is requested.
        scaled = process_scalings(df, scaler)

        # numeric_only=True skips text columns; without it newer pandas
        # versions warn or raise on non-numeric data.
        correlations.append(scaled.corrwith(scaled[column], numeric_only=True))

    df_final = pd.concat(correlations, axis=1)
    df_final.columns = scalers

    return df_final


## Use of Function

In [None]:
# Locations of the four input tables below the mounted Drive folder.
_tps_root = r'/content/drive/MyDrive/Projects/tps'
grades = _tps_root + r'/grades/data/1_participants_grades.xlsx'
whats_features = _tps_root + r'/whatsapp/data/1_nodes_mixed.csv'
meetings_features = _tps_root + r'/meetings/data/12. features/4_individual_features_final_ratio.xlsx'
surveys_features = _tps_root + r'/surveys/data/1_happimeter_individual_surveys.csv'

In [None]:
# Build the combined feature table from the four input files.
df = read_collapse_tables(
    whats_features=whats_features,
    meetings_features=meetings_features,
    surveys_features=surveys_features,
    grades=grades,
)

In [None]:
# Optional (disabled): rescale everything first with min_max_scaling_df.
# df = min_max_scaling_df(df)

# Preview the first twelve rows of the combined table.
df.head(n=12)

Unnamed: 0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,avg_time_without_speaking_ratio,max_time_without_speaking,max_time_without_speaking_ratio,num_turns,num_turns_ratio,avg_turns_without_speaking,avg_turns_without_speaking_ratio,max_turns_without_speaking,max_turns_without_speaking_ratio,num_words,num_words_ratio,avg_words_turn,avg_words_turn_ratio,max_words_turn,max_words_turn_ratio,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,group,messages_sent,messages_total,contribution_index,ego_art,ego_nudges,alter_art,alter_nudges,sentiment_avg,emotionality_avg,complexity_avg,influence_message_avg,influence_total_in,influence_message_avg_in,influence_total,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,dummy_question1,dummy_question2,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,conservation,transcendence,theory,coeval,project
6,1346.0,0.337682,8.518987,0.185779,19.887848,0.070957,169.82,0.04443,158.0,0.364055,1.740506,0.064235,9.0,0.028846,4025.0,0.329109,25.474684,0.187253,230.0,0.233503,0.76806,0.18186,0.043375,0.006704,0.29778,0.237582,0.227811,0.236827,2.0,22.0,69.0,-0.36,1.326.435.903,1.628.846.139,5.609.305.556,4.304.166.675,0.404612,0.268756,8.831.858.439,0.275771772,0.030908656,0.030908656,0.412769952,4.0,0.52,0.362597,0.134296,0.00189,0.501216,0.152454,0.624326,0.145505,0.077715,0.332323,0.221886,0.445791,1.833333,3.833333,3.666667,6.166667,5.5,4.2,3.5,4.0,6.333333,4.333333,2.833333,4.0,0.683333,0.733333,0.883333,0.583333,0.416667,24.0,26.0,13.0,13.0,15.0,1.0,5.0,6.0,7.0,7.0,6.0,5.0,8.0,8.0,3.0,4.0,4.0,0.38,-1.59,4.64,0.78,8.5
8,298.0,0.074762,11.461538,0.249949,162.1672,0.578592,1529.54,0.400177,26.0,0.059908,15.384615,0.567781,125.0,0.400641,764.0,0.062469,29.384615,0.215993,167.0,0.169543,0.782264,0.203949,0.01221,0.001577,0.300151,0.233183,0.230322,0.236344,2.0,8.0,55.0,-0.71,3.302.708.333,3.037.500.024,1.476.083.333,2.912.499.994,0.746794,0.579783,988.272.047,0.0,0.959899291,0.959899291,0,4.0,0.529475,0.029491,0.125285,0.000506,0.844718,0.121923,0.68549,0.083875,0.108711,0.535406,0.037475,0.427119,4.666667,3.666667,4.333333,4.5,4.666667,4.366667,3.666667,4.833333,6.0,5.333333,4.333333,5.0,0.533333,0.683333,0.716667,0.5,0.516667,28.0,20.0,21.0,23.0,24.0,3.0,3.0,4.0,6.0,5.0,4.0,6.0,4.0,5.0,4.0,6.0,8.0,1.77,-1.18,3.61,0.78,8.5
9,697.0,0.174862,8.822785,0.192404,47.935696,0.171028,1421.86,0.372004,79.0,0.182028,4.455696,0.164441,108.0,0.346154,2062.0,0.168602,26.101266,0.191859,257.0,0.260914,0.735439,0.228609,0.03055,0.005402,0.295444,0.239446,0.225465,0.239646,2.0,8.0,55.0,-0.71,2.095.583.333,3.866.666.635,1.623.819.444,2.291.666.687,0.42855,0.236425,9.147.384.644,0.061817313,0.254338987,0.127169494,0.061817313,1.0,0.518237,0.249223,0.057101,0.193481,0.500194,0.070225,0.553516,0.30907,0.067189,0.065323,0.31168,0.622997,3.0,2.666667,3.166667,5.0,5.0,3.766667,3.666667,5.833333,4.333333,3.0,2.833333,4.0,0.616667,0.7,0.716667,0.716667,0.55,17.0,23.0,16.0,13.0,14.0,2.0,4.0,6.0,7.0,6.0,4.0,4.0,3.0,6.0,2.0,6.0,4.0,1.72,-2.0,4.37,0.77,8.5
10,276.0,0.069242,6.272727,0.136793,25.912727,0.092453,218.84,0.057256,44.0,0.101382,3.113636,0.114911,24.0,0.076923,857.0,0.070074,19.477273,0.143169,78.0,0.079188,0.710042,0.214038,0.066733,0.009187,0.298211,0.23699,0.226584,0.238215,2.0,24.0,71.0,-0.32,1.213.291.667,1.720.833.361,9.400.256.481,287.179.486,0.259263,0.253813,9.067.036.629,0.03761133,0.098445511,0.098445511,0.03761133,3.0,0.497027,0.193271,0.03851,0.087476,0.680743,0.150721,0.602558,0.153305,0.093417,0.221334,0.277272,0.501394,1.5,2.0,1.333333,3.166667,5.0,2.6,5.333333,4.5,6.166667,5.5,3.166667,5.0,0.533333,0.7,0.583333,0.55,0.583333,27.0,26.0,18.0,18.0,18.0,0.0,5.0,3.0,5.0,4.0,3.0,4.0,8.0,8.0,3.0,3.0,8.0,0.9,-0.13,5.33,0.9,8.5
17,281.0,0.098183,5.403846,0.152668,75.503846,0.377295,828.7,0.269106,52.0,0.172757,4.769231,0.372967,17.0,0.202381,787.0,0.114224,15.134615,0.175843,112.0,0.272506,0.743853,0.230877,0.022211,0.003059,0.301269,0.235008,0.228314,0.235408,4.0,10.0,92.0,-0.78,2.975.396.806,3.552.380.959,1.746.031.736,3.392.857.154,0.400919,0.319711,9.305.763.351,0.171088866,1.156.129.841,0.702050603,0.535487495,4.0,0.460784,0.398799,0.10002,0.100924,0.400257,0.183592,0.383818,0.372693,0.059897,0.100521,0.000667,0.898812,2.666667,4.0,3.666667,5.166667,4.0,3.9,3.333333,4.166667,4.166667,3.0,2.5,3.0,0.666667,0.7,0.733333,0.583333,0.5,18.0,18.0,19.0,18.0,15.0,0.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,6.0,5.0,4.0,4.0,1.48,-0.83,3.67,0.91,7.0
18,1357.0,0.474144,13.989691,0.395234,29.030928,0.145069,789.04,0.256227,97.0,0.322259,2.082474,0.162855,25.0,0.297619,3292.0,0.477794,33.938144,0.394314,142.0,0.345499,0.767905,0.19692,0.030697,0.004479,0.291022,0.241411,0.228907,0.238661,4.0,27.0,109.0,-0.5,2.206.481.481,2.561.111.053,1.083.601.933,3.578.022.083,0.319963,0.264397,9.177.843.902,0.412296666,0.599065708,0.228996351,0.858611457,5.0,0.435232,0.328924,0.000958,0.222025,0.448093,0.241462,0.396402,0.220258,0.141878,0.215835,0.1706,0.613565,4.666667,4.5,4.5,5.666667,5.166667,4.9,2.166667,1.833333,2.0,2.333333,2.833333,2.0,0.6,0.65,0.65,0.6,0.516667,21.0,22.0,20.0,19.0,22.0,2.0,5.0,7.0,6.0,7.0,7.0,7.0,5.0,7.0,7.0,6.0,6.0,1.57,-1.72,3.31,0.92,7.0
19,507.0,0.177149,7.567164,0.213786,54.32597,0.271469,734.16,0.238405,67.0,0.222591,3.41791,0.26729,27.0,0.321429,1242.0,0.180261,18.537313,0.215378,66.0,0.160584,0.769652,0.216395,0.013313,0.00064,0.295263,0.238384,0.227542,0.238811,4.0,45.0,127.0,-0.29,0.586655162,1.521.439.393,1.687.948.727,3.292.307.695,0.381981,0.245785,942.392.534,0.850955986,0.813625523,0.406059115,1.335.673.656,2.0,0.408099,0.31044,0.001019,0.111654,0.576888,0.268006,0.400095,0.198053,0.133846,0.273488,0.137877,0.588636,2.0,4.166667,3.666667,5.333333,5.333333,4.1,4.333333,4.833333,5.333333,3.5,3.0,4.0,0.7,0.683333,0.683333,0.716667,0.583333,23.0,22.0,14.0,12.0,12.0,2.0,4.0,6.0,6.0,8.0,7.0,7.0,6.0,7.0,3.0,3.0,3.0,-0.16,-2.04,3.89,0.91,7.0
20,717.0,0.250524,8.435294,0.238312,41.257857,0.206167,727.56,0.236262,85.0,0.282392,2.517647,0.196887,15.0,0.178571,1569.0,0.227721,18.458824,0.214466,91.0,0.221411,0.740058,0.205669,0.047375,0.006897,0.299525,0.236698,0.228422,0.235355,4.0,18.0,100.0,-0.64,0.997948727,4.958.974.441,224.889.978,2.330.718.915,0.242325,0.293497,8.801.185.159,0.289654027,1.006.603.864,0.503301932,0.765176561,3.0,0.432015,0.344994,0.056175,0.169649,0.429182,0.278736,0.386497,0.203583,0.131184,0.307068,0.142967,0.549965,2.0,3.5,3.0,4.166667,6.833333,3.9,2.666667,7.166667,4.166667,5.166667,2.833333,4.0,0.533333,0.583333,0.6,0.6,0.6,12.0,20.0,12.0,9.0,6.0,0.0,5.0,2.0,4.0,4.0,7.0,7.0,4.0,4.0,3.0,3.0,5.0,0.11,-1.19,3.33,0.88,7.0
26,125.0,0.261506,7.8125,0.248093,27.79625,0.184931,58.68,0.12117,16.0,0.219178,3.5,0.194661,9.0,0.152542,419.0,0.263854,26.1875,0.25037,72.0,0.257143,0.881454,0.101825,0.015245,0.001476,0.294511,0.240904,0.226163,0.238422,6.0,35.0,117.0,-0.4,0.55625579,1.609.722.197,0.676236259,4.241.758.287,0.356499,0.216244,9.075.793.219,0.262719279,1.179.140.156,0.396994369,0.731161302,1.0,0.320945,0.447982,0.000517,0.115638,0.435863,0.130925,0.564427,0.15441,0.150238,0.506875,0.137876,0.355248,2.5,4.333333,2.666667,5.666667,5.333333,4.1,4.333333,5.0,6.166667,5.166667,4.166667,5.0,0.733333,0.783333,0.633333,0.816667,0.566667,26.0,23.0,9.0,15.0,15.0,2.0,5.0,8.0,8.0,6.0,7.0,7.0,8.0,5.0,4.0,4.0,8.0,0.6,-2.35,4.25,0.9,8.7
27,131.0,0.274059,7.705882,0.244707,23.864706,0.158774,64.64,0.133477,17.0,0.232877,3.117647,0.173395,8.0,0.135593,451.0,0.284005,26.529412,0.253639,84.0,0.3,0.812568,0.149444,0.033656,0.004333,0.287701,0.243578,0.229601,0.239121,6.0,13.0,95.0,-0.73,0.559732144,4.426.785.707,0.606458333,2.275.000.036,0.432405,0.323053,8.460.725.149,0.179308278,104.495.116,0.228052837,0.830719291,0.0,0.397542,0.476179,0.077695,0.000954,0.445171,0.266541,0.540086,0.094756,0.098617,0.193743,0.330441,0.475816,2.833333,3.333333,2.5,4.666667,6.333333,3.933333,6.333333,6.666667,6.0,6.0,5.0,6.0,0.683333,0.65,0.7,0.716667,0.483333,21.0,30.0,15.0,10.0,14.0,0.0,2.0,1.0,2.0,8.0,8.0,7.0,7.0,8.0,0.0,6.0,8.0,-0.31,-0.34,3.75,0.88,8.7


In [None]:
# Remove every row whose 'group' value is 1, 3 or 5.
in_excluded_group = df['group'].isin([1, 3, 5])
df = df.drop(df.loc[in_excluded_group].index)

# Correlate all features with the 'coeval' grade.
# NOTE(review): the result is stored as `final_project` although the target
# column is 'coeval' — confirm which grade was intended.
final_project = apply_scalings(df, 'coeval')
# Optional (disabled): keep only rows with an absolute correlation above 0.6.
# final_project = final_project[(np.abs(final_project) > 0.6).any(axis=1)]
final_project.head(n=500)

  corr1 = df.corrwith(df[column])


Unnamed: 0,quartile
indiv_spoken_time,0.120541
indiv_spoken_time_ratio,-0.099972
average_turn_duration,-0.13501
average_turn_duration_ratio,-0.030942
avg_time_without_speaking,-0.005142
avg_time_without_speaking_ratio,0.010546
max_time_without_speaking,0.192099
max_time_without_speaking_ratio,-0.010808
num_turns,0.151865
num_turns_ratio,0.119147


In [None]:
# Drop the rows labelled 1, 3 and 5 if they are present. errors='ignore'
# keeps the cell runnable when those labels are absent (the df.head() output
# earlier in the notebook starts at label 6) and makes it safe to re-run.
# NOTE(review): this removes rows by index label, not by 'group' — confirm
# that is the intent.
df = df.drop([1, 3, 5], errors='ignore')

# Correlate all features with the 'theory' grade and keep only those with an
# absolute correlation above 0.6.
max_theory = apply_scalings(df, 'theory')
max_theory = max_theory[(np.abs(max_theory) > 0.6).any(axis=1)]
max_theory.head(50)

In [None]:
# NOTE(review): 'max_theory' is not among the columns shown in the df.head()
# output earlier in this notebook, and read_collapse_tables names the grade
# columns 'theory', 'coeval' and 'project' — so this lookup presumably raises
# KeyError; confirm the intended target column.
# NOTE(review): df.drop([1, 3, 5]) removes rows by index label and raises
# KeyError when a label is missing — verify those labels still exist here.
df_drop = df.drop([1, 3, 5])
max_theory= apply_scalings(df_drop, 'max_theory')
# Keep only the rows with an absolute correlation above 0.6.
max_theory= max_theory[(np.abs(max_theory) > 0.6).any(axis=1)]
max_theory.head(50)

In [None]:
# NOTE(review): 'min_theory' is not among the columns shown in the df.head()
# output earlier in this notebook (grade columns are 'theory', 'coeval' and
# 'project') — this call presumably raises KeyError; confirm the intended
# target column.
apply_scalings(df, 'min_theory')