# Group Surveys Feature Determination

## Preparation

### Import

In [20]:
import itertools
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
from scipy import interp
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

## Function Definition

### Inlcude Marks Classification

In [3]:
def include_mark_classification(df):

    df = df[df['theory'] >= 0.5]
    limit1=0.65
    limit2= 0.8

    # Define the conditions and corresponding values
    conditions = [
        (df['theory'] >= 0.5) & (df['theory'] < limit1),
        (df['theory'] >= limit1) & (df['theory'] < limit2),
        (df['theory'] >= limit2) & (df['theory'] <= 1)
    ]
    values = [1,2,3]

    # Create the 'mark' column using np.select()
    df['mark'] = np.select(conditions, values, default='np.NaN')

    df = df.dropna(how='any')

    return df

### Inlcude Groupflow Classification

In [4]:
import pandas as pd

def assign_groupflow(df):
    # Create a new column 'groupflow' with default value as None
    df['groupflow'] = None

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        # Get the values of the three columns for the current row
        beeflow = row['Groupflow_Beeflow']
        leechflow = row['Groupflow_Leechflow']
        antflow = row['Groupflow_Antflow']

        # Compare the values and assign the corresponding category to 'groupflow'
        if beeflow >= leechflow and beeflow >= antflow:
            df.at[index, 'groupflow'] = 'bee'
        elif leechflow >= beeflow and leechflow >= antflow:
            df.at[index, 'groupflow'] = 'leech'
        else:
            df.at[index, 'groupflow'] = 'ant'

    # Return the modified DataFrame
    return df


### Select Columns by Index

In [5]:
def select_columns_by_index(df, column_indexes):

    cols= column_indexes
    cols.append(len(df.columns)-1)
    selected_columns = df.iloc[:, cols].copy()
    selected_column_names = df.columns[column_indexes].tolist()

    return selected_columns, selected_column_names

### Prepare DataFrame for Modeling

In [6]:
def prepare_df_for_modeling(df):

  # Strip off leading and trailing whitespace from non-numeric columns
  df = df.select_dtypes(include=['int', 'float'])
  df = df[df['theory'] >= 0.5]
  df.reset_index(drop=True, inplace=True)

  return df

### Return Dataframe by Columns

In [7]:
def return_df_by_columns(df,column_names):

  new_df = df[column_names].copy()
  data = pd.concat([new_df, df['theory']], axis=1)

  return data

### Get Indexes by Column names

In [8]:
def get_column_indexes(df, columns):
    # Get the indexes of the specified columns
    indexes = [df.columns.get_loc(col) for col in columns]
    return indexes

## Use of Function

### Import

In [9]:
in_file= r'/content/drive/MyDrive/Projects/tps/finals/data/3_individual_features.xlsx'

# Read the Excel file into a DataFrame, using 'Id' column as the index
df= pd.read_excel(in_file, index_col='Id')

### Preparation

In [10]:
# Drop unnecesary columns
not_used_columns=['avg_time_without_speaking_ratio', 'max_time_without_speaking_ratio', 'avg_turns_without_speaking_ratio', 'max_turns_without_speaking_ratio', 'avg_words_turn_ratio', 'max_words_turn', 'max_words_turn_ratio', 'messages_total', 'alter_art', 'alter_nudges', 'complexity_avg', 'dummy_question1', 'dummy_question2', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'conservation', 'transcendence', 'coeval', 'project']
df.drop(columns=not_used_columns, inplace=True)

# Transform marks into 0-1
df['theory'] = df['theory'] / 6

In [11]:
df.head()

Unnamed: 0_level_0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,group,messages_sent,contribution_index,ego_art,ego_nudges,sentiment_avg,emotionality_avg,influence_message_avg,influence_total_in,influence_message_avg_in,influence_total,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
1,526,0.148336,7.850746,0.126865,56.210746,451.54,67,0.331683,1.985075,7,978,0.1708,14.597015,0.832695,0.137502,0.026347,0.003456,0.298809,0.236814,0.232361,0.232015,1,34,-0.64,1.332.855.889,2.468.578.279,0.303561,0.215562,0.346125353,1.227.594.788,0.346135338,1.730.626.767,1,0.487649,0.472084,0.083889,0.08914,0.354886,0.236101,0.407447,0.251288,0.105165,0.497477,0.156358,0.346165,4.5,3.5,4.166667,3.166667,3.5,3.766667,4.833333,5.666667,5.666667,5.833333,4.833333,5,0.6,0.716667,0.6,0.633333,0.583333,27,23,19,18,20,0.573333
2,2648,0.746757,33.948718,0.548598,21.791169,109.24,78,0.386139,1.589744,9,3816,0.666434,48.923077,0.795958,0.155654,0.045315,0.003074,0.29295,0.239695,0.227768,0.239587,1,75,-0.34,7.778.626.875,1.363.151.848,0.250396,0.236507,0.428515776,0.997035569,0.231981331,1.988.107.685,2,0.398994,0.438515,0.040232,0.107725,0.413528,0.215535,0.417419,0.208064,0.158982,0.300955,0.220056,0.478989,2.0,1.5,1.0,5.166667,5.5,3.033333,4.0,6.666667,6.5,5.5,3.0,5,0.533333,0.666667,0.7,0.616667,0.633333,22,28,11,7,11,0.606667
3,105,0.029611,8.076923,0.13052,57.649231,141.72,13,0.064356,6.461538,15,267,0.046629,20.538462,0.789103,0.161515,0.037497,0.011885,0.297727,0.238513,0.230916,0.232844,1,38,-0.6,1.006.371.139,2.010.089.278,0.38186,0.254014,0.236068168,0.866593847,0.199153824,103.894.739,2,0.501416,0.157377,0.02898,0.053947,0.759697,0.245048,0.410218,0.190879,0.153855,0.226931,0.238211,0.534858,4.5,3.5,2.666667,6.0,6.0,4.533333,5.0,5.5,5.5,4.333333,2.833333,5,0.566667,0.683333,0.716667,0.533333,0.716667,23,28,9,6,7,0.663333
4,162,0.045685,6.48,0.104714,159.4216,836.46,25,0.123762,6.6,29,394,0.068809,15.76,0.712697,0.196289,0.077861,0.013153,0.3001,0.235307,0.232877,0.231716,1,27,-0.7,1.197.692.111,2.333.088.249,0.250852,0.259828,0.486946776,2.016.371.149,0.467754404,1.159.895.116,2,0.493443,0.258578,0.073231,0.112001,0.55619,0.319698,0.32277,0.225624,0.131909,0.256794,0.232465,0.51074,2.166667,1.5,1.0,4.166667,4.833333,2.733333,5.833333,6.0,6.333333,5.0,4.166667,5,0.566667,0.783333,0.733333,0.733333,0.633333,17,24,17,13,8,0.703333
5,105,0.029611,5.526316,0.089303,43.632632,169.2,19,0.094059,4.526316,17,271,0.047328,14.263158,0.746439,0.193365,0.054215,0.005981,0.303593,0.235377,0.233832,0.227198,1,22,-0.75,8.437.013.889,2.777.083.337,0.3056,0.244289,0.36035958,178.005.748,0.421674039,0.970075875,1,0.555323,0.362543,0.000314,0.136496,0.500648,0.157227,0.472688,0.321906,0.048179,0.342162,0.120088,0.537751,3.166667,2.666667,3.5,5.166667,3.666667,3.633333,3.666667,5.833333,3.666667,3.0,3.666667,4,0.6,0.666667,0.483333,0.583333,0.433333,12,15,11,19,4,0.623333


In [12]:
df.head(70)

Unnamed: 0_level_0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,group,messages_sent,contribution_index,ego_art,ego_nudges,sentiment_avg,emotionality_avg,influence_message_avg,influence_total_in,influence_message_avg_in,influence_total,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
1,526,0.148336,7.850746,0.126865,56.210746,451.54,67,0.331683,1.985075,7,978,0.1708,14.597015,0.832695,0.137502,0.026347,0.003456,0.298809,0.236814,0.232361,0.232015,1,34,-0.64,1.332.855.889,2.468.578.279,0.303561,0.215562,0.346125353,1.227.594.788,0.346135338,1.730.626.767,1,0.487649,0.4720845,0.08388939,0.08914,0.3548859,0.236101,0.407447,0.251288,0.105165,0.497477,0.156358,0.346165,4.5,3.5,4.166667,3.166667,3.5,3.766667,4.833333,5.666667,5.666667,5.833333,4.833333,5,0.6,0.716667,0.6,0.633333,0.583333,27,23,19,18,20,0.573333
2,2648,0.746757,33.948718,0.548598,21.791169,109.24,78,0.386139,1.589744,9,3816,0.666434,48.923077,0.795958,0.155654,0.045315,0.003074,0.29295,0.239695,0.227768,0.239587,1,75,-0.34,7.778.626.875,1.363.151.848,0.250396,0.236507,0.428515776,0.997035569,0.231981331,1.988.107.685,2,0.398994,0.4385152,0.04023221,0.107725,0.4135278,0.215535,0.417419,0.208064,0.158982,0.300955,0.220056,0.478989,2.0,1.5,1.0,5.166667,5.5,3.033333,4.0,6.666667,6.5,5.5,3.0,5,0.533333,0.666667,0.7,0.616667,0.633333,22,28,11,7,11,0.606667
3,105,0.029611,8.076923,0.13052,57.649231,141.72,13,0.064356,6.461538,15,267,0.046629,20.538462,0.789103,0.161515,0.037497,0.011885,0.297727,0.238513,0.230916,0.232844,1,38,-0.6,1.006.371.139,2.010.089.278,0.38186,0.254014,0.236068168,0.866593847,0.199153824,103.894.739,2,0.501416,0.1573771,0.02897965,0.053947,0.7596967,0.245048,0.410218,0.190879,0.153855,0.226931,0.238211,0.534858,4.5,3.5,2.666667,6.0,6.0,4.533333,5.0,5.5,5.5,4.333333,2.833333,5,0.566667,0.683333,0.716667,0.533333,0.716667,23,28,9,6,7,0.663333
4,162,0.045685,6.48,0.104714,159.4216,836.46,25,0.123762,6.6,29,394,0.068809,15.76,0.712697,0.196289,0.077861,0.013153,0.3001,0.235307,0.232877,0.231716,1,27,-0.7,1.197.692.111,2.333.088.249,0.250852,0.259828,0.486946776,2.016.371.149,0.467754404,1.159.895.116,2,0.493443,0.2585777,0.07323091,0.112001,0.5561903,0.319698,0.32277,0.225624,0.131909,0.256794,0.232465,0.51074,2.166667,1.5,1.0,4.166667,4.833333,2.733333,5.833333,6.0,6.333333,5.0,4.166667,5,0.566667,0.783333,0.733333,0.733333,0.633333,17,24,17,13,8,0.703333
5,105,0.029611,5.526316,0.089303,43.632632,169.2,19,0.094059,4.526316,17,271,0.047328,14.263158,0.746439,0.193365,0.054215,0.005981,0.303593,0.235377,0.233832,0.227198,1,22,-0.75,8.437.013.889,2.777.083.337,0.3056,0.244289,0.36035958,178.005.748,0.421674039,0.970075875,1,0.555323,0.3625429,0.000314,0.136496,0.5006478,0.157227,0.472688,0.321906,0.048179,0.342162,0.120088,0.537751,3.166667,2.666667,3.5,5.166667,3.666667,3.633333,3.666667,5.833333,3.666667,3.0,3.666667,4,0.6,0.666667,0.483333,0.583333,0.433333,12,15,11,19,4,0.623333
6,1346,0.337682,8.518987,0.185779,19.887848,169.82,158,0.364055,1.740506,9,4025,0.329109,25.474684,0.76806,0.18186,0.043375,0.006704,0.29778,0.237582,0.227811,0.236827,2,22,-0.36,1.326.435.903,1.628.846.139,0.404612,0.268756,0.275771772,0.030908656,0.030908656,0.412769952,4,0.52,0.3625971,0.1342965,0.00189,0.5012164,0.152454,0.624326,0.145505,0.077715,0.332323,0.221886,0.445791,1.833333,3.833333,3.666667,6.166667,5.5,4.2,3.5,4.0,6.333333,4.333333,2.833333,4,0.683333,0.733333,0.883333,0.583333,0.416667,24,26,13,13,15,0.773333
8,298,0.074762,11.461538,0.249949,162.1672,1529.54,26,0.059908,15.384615,125,764,0.062469,29.384615,0.782264,0.203949,0.01221,0.001577,0.300151,0.233183,0.230322,0.236344,2,8,-0.71,3.302.708.333,3.037.500.024,0.746794,0.579783,0,0.959899291,0.959899291,0,4,0.529475,0.02949133,0.1252847,0.000506,0.8447182,0.121923,0.68549,0.083875,0.108711,0.535406,0.037475,0.427119,4.666667,3.666667,4.333333,4.5,4.666667,4.366667,3.666667,4.833333,6.0,5.333333,4.333333,5,0.533333,0.683333,0.716667,0.5,0.516667,28,20,21,23,24,0.601667
9,697,0.174862,8.822785,0.192404,47.935696,1421.86,79,0.182028,4.455696,108,2062,0.168602,26.101266,0.735439,0.228609,0.03055,0.005402,0.295444,0.239446,0.225465,0.239646,2,8,-0.71,2.095.583.333,3.866.666.635,0.42855,0.236425,0.061817313,0.254338987,0.127169494,0.061817313,1,0.518237,0.2492233,0.05710145,0.193481,0.5001939,0.070225,0.553516,0.30907,0.067189,0.065323,0.31168,0.622997,3.0,2.666667,3.166667,5.0,5.0,3.766667,3.666667,5.833333,4.333333,3.0,2.833333,4,0.616667,0.7,0.716667,0.716667,0.55,17,23,16,13,14,0.728333
10,276,0.069242,6.272727,0.136793,25.912727,218.84,44,0.101382,3.113636,24,857,0.070074,19.477273,0.710042,0.214038,0.066733,0.009187,0.298211,0.23699,0.226584,0.238215,2,24,-0.32,1.213.291.667,1.720.833.361,0.259263,0.253813,0.03761133,0.098445511,0.098445511,0.03761133,3,0.497027,0.1932707,0.03850988,0.087476,0.6807433,0.150721,0.602558,0.153305,0.093417,0.221334,0.277272,0.501394,1.5,2.0,1.333333,3.166667,5.0,2.6,5.333333,4.5,6.166667,5.5,3.166667,5,0.533333,0.7,0.583333,0.55,0.583333,27,26,18,18,18,0.888333
11,266,0.166562,9.851852,0.207658,95.963077,1446.52,27,0.174194,4.740741,40,543,0.164945,20.111111,0.85065,0.140287,0.007156,0.001907,0.268266,0.257794,0.243564,0.230376,3,9,-0.7,4.099.444.444,2.966.666.698,0.41733,0.146854,0.900596442,0.174141428,0.080865228,224.194.098,4,0.54717,0.4430857,0.000356,0.333701,0.2228573,0.290449,0.443599,0.177075,0.088877,0.399318,0.391894,0.208788,1.666667,5.666667,3.166667,6.5,5.0,4.4,6.666667,4.166667,6.0,4.666667,4.0,5,0.583333,0.733333,0.616667,0.65,0.383333,23,23,20,23,18,0.928333


In [13]:
df = assign_groupflow(df)

In [14]:
# Make sure every column is composed by numbers
df= prepare_df_for_modeling(df)

# Include the classification (new column: 'mark')
df= include_mark_classification(df)

### Analysis

In [15]:
def print_correlation_matrix(df, limit):
    correlation_matrix = df.corr()

    for i in range(len(correlation_matrix.columns)):
        for j in range(i + 1, len(correlation_matrix.columns)):
            if abs(correlation_matrix.iloc[i, j]) > limit:
                print(f"Feature 1: {correlation_matrix.columns[i]}")
                print(f"Feature 2: {correlation_matrix.columns[j]}")
                print(f"Correlation: {correlation_matrix.iloc[i, j]}")
                print("---")


In [16]:
# Assuming you have a dataframe called 'my_dataframe'
print_correlation_matrix(df, 0.5)

Feature 1: indiv_spoken_time
Feature 2: average_turn_duration
Correlation: 0.5032249705771975
---
Feature 1: indiv_spoken_time
Feature 2: average_turn_duration_ratio
Correlation: 0.5515056470283205
---
Feature 1: indiv_spoken_time
Feature 2: num_turns
Correlation: 0.8124619163102694
---
Feature 1: indiv_spoken_time
Feature 2: num_words
Correlation: 0.9657245388494239
---
Feature 1: indiv_spoken_time_ratio
Feature 2: average_turn_duration
Correlation: 0.6993863149081885
---
Feature 1: indiv_spoken_time_ratio
Feature 2: average_turn_duration_ratio
Correlation: 0.8953001508503806
---
Feature 1: indiv_spoken_time_ratio
Feature 2: num_turns_ratio
Correlation: 0.8438468272864222
---
Feature 1: indiv_spoken_time_ratio
Feature 2: avg_turns_without_speaking
Correlation: -0.5333825597803394
---
Feature 1: indiv_spoken_time_ratio
Feature 2: num_words
Correlation: 0.5046008171798629
---
Feature 1: indiv_spoken_time_ratio
Feature 2: num_words_ratio
Correlation: 0.9838300485156376
---
Feature 1: ind

  correlation_matrix = df.corr()


In [17]:

# Group the DataFrame by 'mark' and calculate the average of other columns
grouped_df = df.groupby('mark').mean()

grouped_df.head()


Unnamed: 0_level_0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,group,messages_sent,contribution_index,sentiment_avg,emotionality_avg,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
mark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1
1,633.133333,0.211852,11.638873,0.213002,72.145966,627.696667,49.433333,0.200377,4.66425,31.566667,1329.0,0.210984,25.064528,0.792493,0.178984,0.025318,0.003206,0.29384,0.237682,0.234083,0.234395,7.133333,31.9,-0.639,0.381294,0.272791,2.866667,0.481637,0.268946,0.080709,0.15633,0.494014,0.205704,0.468833,0.196784,0.128678,0.273088,0.181244,0.545668,3.061111,3.666667,3.594444,5.233333,5.166667,4.144444,4.588889,5.227778,5.083333,4.238889,3.483333,4.433333,0.621111,0.675,0.658889,0.604444,0.548333,20.233333,21.766667,16.566667,14.566667,12.833333,0.572389
2,4098.411765,0.212928,12.863763,0.212254,96.742884,3406.538824,161.352941,0.202204,5.362742,29.058824,4139.470588,0.203808,24.54195,0.786723,0.175513,0.031701,0.006062,0.28881,0.24094,0.236963,0.233287,5.529412,19.411765,-0.578824,0.360572,0.228439,2.176471,0.480445,0.334654,0.054918,0.075818,0.534609,0.195958,0.474192,0.208322,0.121528,0.310736,0.187877,0.501387,2.313725,2.470588,2.460784,4.392157,5.058824,3.339216,4.745098,5.617647,5.647059,4.558824,3.196078,4.705882,0.610784,0.710784,0.678431,0.622549,0.556863,22.352941,23.411765,14.176471,13.882353,12.764706,0.727745
3,920.625,0.137845,8.237331,0.152557,152.77756,9855.18,132.75,0.172704,7.351467,94.875,2173.75,0.152714,20.273865,0.804051,0.164769,0.027591,0.00359,0.286585,0.240915,0.240299,0.232201,6.375,20.25,-0.575,0.371942,0.227397,3.125,0.490459,0.260956,0.04387,0.142861,0.552312,0.218862,0.456523,0.187953,0.136662,0.294069,0.215151,0.49078,2.25,2.354167,2.145833,3.979167,4.958333,3.1375,5.104167,5.520833,5.854167,5.020833,4.083333,5.0,0.59375,0.691667,0.604167,0.622917,0.55,22.0,24.0,16.125,16.875,14.75,0.861875


In [26]:
def perform_anova_by_category(df, column):
    categories = df[column].unique()

    for feature in df.columns:
        if feature != column:
            print(f"Feature: {feature}")
            grouped_data = [df[df[column] == category][feature] for category in categories]
            f_value, p_value = stats.f_oneway(*grouped_data)
            if(p_value<0.05):
              print(f"F-value: {f_value}")
              print(f"P-value: {p_value}")
              print("---")

# Assuming you have a dataframe called 'my_dataframe' with a 'groupflow' column
perform_anova_by_category(df, 'mark')

Feature: indiv_spoken_time
Feature: indiv_spoken_time_ratio
Feature: average_turn_duration
Feature: average_turn_duration_ratio
Feature: avg_time_without_speaking
Feature: max_time_without_speaking
F-value: 4.417450088429096
P-value: 0.016908852694873404
---
Feature: num_turns
Feature: num_turns_ratio
Feature: avg_turns_without_speaking
Feature: max_turns_without_speaking
F-value: 3.7564922323321177
P-value: 0.029935185797158717
---
Feature: num_words
Feature: num_words_ratio
Feature: avg_words_turn
Feature: speech_neu
Feature: speech_ang
Feature: speech_hap
Feature: speech_sad
Feature: text_joy
Feature: text_anger
Feature: text_fear
Feature: text_sadness
Feature: group
Feature: messages_sent
Feature: contribution_index
Feature: sentiment_avg
Feature: emotionality_avg
Feature: contribution_index_oscillation
Feature: activity_entanglement
Feature: ALTERNATIVE_REALITIES_Treehugger
Feature: ALTERNATIVE_REALITIES_Fatherlander
Feature: ALTERNATIVE_REALITIES_Spiritualism
Feature: ALTERNATIVE