# Group Surveys Feature Determination

## Preparation

### Import

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Raise the pandas display limits to 100 columns and 1000 rows for notebook output.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 1000)

## Function Definition

### Generate Combinations

In [None]:
def generate_features_index():
    """Return every (i, j, k) column-index triple as a list of tuples.

    ``i`` runs over 0..20, ``j`` over 21..37 and ``k`` over 38..59; the
    triples are ordered with ``k`` varying fastest and ``i`` slowest.
    """
    first_block = range(21)
    second_block = range(21, 38)
    third_block = range(38, 60)

    return [
        (i, j, k)
        for i in first_block
        for j in second_block
        for k in third_block
    ]

### Min-Max Scaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

def min_max_scaling_df(df, columns):
    """Min-max scale the given columns of ``df`` in place and return ``df``.

    Args:
        df: DataFrame holding the columns to scale; it is modified in place.
        columns: Labels of the columns to scale.

    Returns:
        The same ``df`` object, with ``columns`` replaced by their scaled
        values.
    """
    # Nothing to scale: avoid fitting the scaler on an empty selection.
    if len(columns) == 0:
        return df

    scaler = MinMaxScaler()

    # The scaled frame must carry df's own index. Column assignment aligns on
    # index labels, so a default RangeIndex would produce NaN / misplaced rows
    # whenever df has been filtered or is indexed by something else (e.g. 'Id').
    df_scaled = pd.DataFrame(
        scaler.fit_transform(df[columns]),
        columns=columns,
        index=df.index,
    )
    df[df_scaled.columns] = df_scaled

    return df


### Display Correlation Matrix

In [None]:
def corr_matrix(df):
    """Show an annotated heatmap of the pairwise correlations of ``df``'s columns."""
    # Compute the correlations first so a failure here leaves no empty figure behind.
    pairwise_corr = df.corr()

    _, axes = plt.subplots(figsize=(24, 20))
    sns.heatmap(pairwise_corr, annot=True, cmap='coolwarm', ax=axes)
    plt.title('Correlation Matrix')
    plt.show()

### Display Scatter Plot

In [None]:
def scatterplot(df, x_column, y_column):
    """Plot ``y_column`` against ``x_column`` and overlay a quadratic fit.

    Two figures are shown: a plain scatter plot, then the same scatter with
    the curve of a degree-2 polynomial regression of y on x.

    Args:
        df: DataFrame containing both columns.
        x_column: Label of the column used on the x axis.
        y_column: Label of the column used on the y axis.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.preprocessing import PolynomialFeatures

    x = df[x_column]
    y = df[y_column]

    # Raw relationship between the two columns.
    plt.scatter(x, y)
    plt.xlabel(str(x_column))
    plt.ylabel(str(y_column))
    plt.title('Scatter Plot')
    plt.show()

    # Fit a polynomial regression model.
    degree = 2  # Degree of the polynomial
    poly_features = PolynomialFeatures(degree=degree)
    X_poly = poly_features.fit_transform(x.values.reshape(-1, 1))

    model = LinearRegression()
    model.fit(X_poly, y)
    y_pred = np.asarray(model.predict(X_poly))

    # Draw the curve from left to right: plotting in row order connects the
    # points in whatever order they appear in df and yields a zig-zag line.
    order = np.argsort(x.values)

    plt.scatter(x, y)
    plt.plot(x.values[order], y_pred[order], color='red',
             label=f'Degree {degree} Polynomial Fit')
    plt.xlabel(str(x_column))
    plt.ylabel(str(y_column))
    plt.title('Polynomial Regression')
    plt.legend()
    plt.show()


### Generate Index

In [None]:
def generate_features_index(my_list):
    """Return every candidate list of column indexes, each ending with ``my_list``.

    One index is drawn from each of three pools (the third pool is drawn from
    twice), and ``my_list`` is appended to every draw.

    Args:
        my_list: List of column indexes appended to every combination.

    Returns:
        A list of lists of column indexes.
    """
    # Indexes 1 and 22 are left out of the first two pools -- presumably
    # because callers always supply them through my_list (verify against caller).
    first_pool = [0, *range(2, 21)]
    second_pool = [21, *range(23, 38)]
    third_pool = list(range(38, 60))

    # NOTE(review): the third pool is used twice, so draws such as (.., 38, 38)
    # and both orderings of a pair are included -- confirm this is intended.
    draws = itertools.product(first_pool, second_pool, third_pool, third_pool)

    return [list(draw) + my_list for draw in draws]

### Correlation Significance

In [None]:
def check_correlation_significance(df, col1, col2):
    """Print the Pearson correlation of two columns and whether it is significant.

    The coefficient and p-value from ``scipy.stats.pearsonr`` are printed,
    followed by a verdict at the 0.05 significance level.

    Args:
        df: DataFrame containing both columns.
        col1: Label of the first column.
        col2: Label of the second column.
    """
    from scipy import stats

    corr_coeff, p_value = stats.pearsonr(df[col1], df[col2])

    print("Correlation coefficient:", corr_coeff)
    print("p-value:", p_value)

    alpha = 0.05  # Significance level
    if p_value < alpha:
        verdict = "The correlation is statistically significant (reject H0)."
    else:
        verdict = "The correlation is not statistically significant (fail to reject H0)."
    print(verdict)


### Apply Scalings given DF and print correlation

In [None]:
def apply_scalings(df, column):
    """Correlate every numeric column with ``column`` after each scaling step.

    Args:
        df: DataFrame passed to ``process_scalings``.
        column: Label of the column the other columns are correlated with.

    Returns:
        DataFrame with one column of correlations per scaler name
        ('minmax', 'quartile').
    """
    scalers = ['minmax', 'quartile']
    per_scaler = []

    for scaler_name in scalers:
        # NOTE(review): df is rebound on each pass, so the second scaler runs
        # on the output of the first -- confirm this compounding is intended.
        df = process_scalings(df, scaler_name)
        per_scaler.append(df.corrwith(df[column], numeric_only=True))

    df_final = pd.concat(per_scaler, axis=1)
    df_final.columns = scalers

    return df_final

In [None]:
def apply_scalings(df, column):
    """Correlate every column with ``column`` after the 'quartile' scaling.

    Args:
        df: DataFrame passed to ``process_scalings``.
        column: Label of the column the other columns are correlated with.

    Returns:
        DataFrame with one column of correlations per scaler name
        (currently only 'quartile').
    """
    scalers = ['quartile']
    corr_by_scaler = {}

    for scaler_name in scalers:
        df = process_scalings(df, scaler_name)
        corr_by_scaler[scaler_name] = df.corrwith(df[column])

    df_final = pd.concat(list(corr_by_scaler.values()), axis=1)
    df_final.columns = scalers

    return df_final


In [None]:
def drop_bad_measured(df):
    """Return ``df`` without the rows whose 'group' value is 1, 3 or 5.

    Rows are removed by index label, so every row sharing a label with an
    excluded row is removed as well.
    """
    excluded_groups = [1, 3, 5]
    in_excluded_group = df['group'].isin(excluded_groups)
    labels_to_drop = df.loc[in_excluded_group].index
    return df.drop(labels_to_drop)

#df= drop_bad_measured(df)

### Plot Histograms

In [None]:
def plot_histo_1p(df,column):
    """Show a histogram of ``df[column]`` with unit-wide bins from 0 to 10.

    Each bar is annotated with its count, placed at the bar's centre.
    """
    values = df[column]

    # Unit-wide bins: edges at 0, 1, ..., 10.
    counts, edges, _ = plt.hist(values, bins=range(0, 11, 1), rwidth=1)

    # zip stops at the shorter sequence, so the final right edge is skipped.
    for left_edge, raw_count in zip(edges, counts):
        count = int(raw_count)
        plt.text(left_edge + 0.5, count, str(count), ha='center', va='bottom')

    plt.xlabel('Theory')
    plt.ylabel('Frequency')
    plt.title('Histogram of Theory')

    plt.show()


In [None]:
def plot_histo_10p(df,column):
    """Show a 50-bin histogram of ``df[column]`` restricted to the range 5 to 10.

    Each bar is annotated with its count, placed at the bar's centre.
    """
    values = df[column]

    counts, edges, _ = plt.hist(values, bins=50, range=(5, 10), rwidth=1)

    # Pair every bar with its left and right edge to find the bar centre.
    for raw_count, left_edge, right_edge in zip(counts, edges[:-1], edges[1:]):
        count = int(raw_count)
        centre = left_edge + (right_edge - left_edge) / 2
        plt.text(centre, count, str(count), ha='center', va='bottom')

    plt.xlabel('Theory')
    plt.ylabel('Frequency')
    plt.title('Histogram of Theory (Range: 5 to 10)')

    plt.show()


### Include Marks Classification

In [None]:
def include_mark_classification(df):
    """Keep rows with ``theory`` in [0.5, 1] and add an integer ``mark`` class.

    Classes:
        1 -- 0.5  <= theory < 0.65
        2 -- 0.65 <= theory < 0.8
        3 -- 0.8  <= theory <= 1

    Rows whose ``theory`` falls outside these bands receive no class and are
    dropped, as is any row with a missing value in any column.

    Args:
        df: DataFrame with a numeric 'theory' column. It is not modified.

    Returns:
        A new DataFrame with the surviving rows and an integer 'mark' column
        appended as the last column.
    """
    limit1 = 0.65
    limit2 = 0.8

    # Work on a copy so adding 'mark' never writes into a slice of the caller's frame.
    df = df[df['theory'] >= 0.5].copy()

    # Define the conditions and corresponding values
    conditions = [
        (df['theory'] >= 0.5) & (df['theory'] < limit1),
        (df['theory'] >= limit1) & (df['theory'] < limit2),
        (df['theory'] >= limit2) & (df['theory'] <= 1)
    ]
    values = [1, 2, 3]

    # The default must be a real NaN. The string 'np.NaN' either turns the
    # whole column into strings (and is never removed by dropna) or makes
    # np.select fail on dtype promotion, depending on the NumPy version.
    df['mark'] = np.select(conditions, values, default=np.nan)

    # NaN forced a float column; once unclassified rows are gone, restore ints.
    df = df.dropna(how='any').astype({'mark': int})

    return df

### Select Columns by Index

In [None]:
def select_columns_by_index(df, column_indexes):
    """Select columns by position, always adding the last column of ``df``.

    Args:
        df: Source DataFrame.
        column_indexes: Iterable of integer column positions. It is not
            modified.

    Returns:
        A tuple ``(selected_columns, selected_column_names)``: a copy of the
        selected columns followed by the last column of ``df``, and the list
        of those column names in the same order.
    """
    # Build a new list instead of appending to column_indexes: the argument
    # belongs to the caller and would otherwise grow by one on every call.
    cols = list(column_indexes) + [len(df.columns) - 1]

    selected_columns = df.iloc[:, cols].copy()
    selected_column_names = selected_columns.columns.tolist()

    return selected_columns, selected_column_names

### Prepare DataFrame for Modeling

In [None]:
def prepare_df_for_modeling(df):
    """Return the int/float columns of rows with ``theory`` >= 0.5, re-indexed from 0.

    Args:
        df: DataFrame with a numeric 'theory' column.

    Returns:
        A DataFrame restricted to 'int' and 'float' dtype columns and to rows
        whose 'theory' is at least 0.5, with a fresh 0..n-1 index.
    """
    # Columns of any other dtype (e.g. text) are discarded here.
    numeric_only = df.select_dtypes(include=['int', 'float'])

    passed = numeric_only.loc[numeric_only['theory'] >= 0.5]

    return passed.reset_index(drop=True)

### Evaluate Classification Model

In [None]:
def evaluate_classification_models(df, cv=5, random_state=None):
    """Cross-validate the classifiers on ``df`` and report their accuracy.

    The last column of ``df`` is the target; every other column is a feature.

    Args:
        df: DataFrame of features followed by the target column.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
        random_state: Seed forwarded to the random forest. The default
            ``None`` keeps the forest unseeded, so its scores differ between
            runs; pass an int for reproducible results.

    Returns:
        Dict mapping model name to a dict with 'Mean Accuracy' and
        'Standard Deviation' of the cross-validation scores.
    """
    # Split the dataframe into features (X) and target (y)
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]

    models = {
        'Logistic Regression': LogisticRegression(),
        'Random Forest': RandomForestClassifier(random_state=random_state),
    }

    results = {}
    for model_name, model in models.items():
        scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)
        results[model_name] = {
            'Mean Accuracy': scores.mean(),
            'Standard Deviation': scores.std(),
        }

    return results


## Use of Function

### Visualization

In [None]:
in_file= r'/content/drive/MyDrive/Projects/tps/finals/data/3_individual_features.xlsx'


In [None]:
# Load the feature workbook, using its 'Id' column as the index.
df= pd.read_excel(in_file, index_col='Id')
# Columns removed before any analysis.
not_used_columns=['avg_time_without_speaking_ratio', 'max_time_without_speaking_ratio', 'avg_turns_without_speaking_ratio', 'max_turns_without_speaking_ratio', 'avg_words_turn_ratio', 'max_words_turn', 'max_words_turn_ratio', 'messages_total', 'alter_art', 'alter_nudges', 'complexity_avg', 'dummy_question1', 'dummy_question2', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'conservation', 'transcendence', 'coeval', 'project']
df.drop(columns=not_used_columns, inplace=True)

In [None]:
# Divide 'theory' by 6 -- presumably to bring the mark onto a 0-1 scale, which
# the 0.5 / 0.65 / 0.8 thresholds used later assume (TODO confirm the maximum).
df['theory'] = df['theory'] / 6

In [None]:
#plot_histo_1p(df,'theory')

In [None]:
#plot_histo_10p(df,'theory')

In [None]:
df.head()

Unnamed: 0_level_0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,messages_sent,contribution_index,ego_art,ego_nudges,sentiment_avg,emotionality_avg,influence_message_avg,influence_total_in,influence_message_avg_in,influence_total,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1
1,526,0.148336,7.850746,0.126865,56.210746,451.54,67,0.331683,1.985075,7,978,0.1708,14.597015,0.832695,0.137502,0.026347,0.003456,0.298809,0.236814,0.232361,0.232015,34,-0.64,1.332.855.889,2.468.578.279,0.303561,0.215562,0.346125353,1.227.594.788,0.346135338,1.730.626.767,1,0.487649,0.472084,0.083889,0.08914,0.354886,0.236101,0.407447,0.251288,0.105165,0.497477,0.156358,0.346165,4.5,3.5,4.166667,3.166667,3.5,3.766667,4.833333,5.666667,5.666667,5.833333,4.833333,5,0.6,0.716667,0.6,0.633333,0.583333,27,23,19,18,20,0.573333
2,2648,0.746757,33.948718,0.548598,21.791169,109.24,78,0.386139,1.589744,9,3816,0.666434,48.923077,0.795958,0.155654,0.045315,0.003074,0.29295,0.239695,0.227768,0.239587,75,-0.34,7.778.626.875,1.363.151.848,0.250396,0.236507,0.428515776,0.997035569,0.231981331,1.988.107.685,2,0.398994,0.438515,0.040232,0.107725,0.413528,0.215535,0.417419,0.208064,0.158982,0.300955,0.220056,0.478989,2.0,1.5,1.0,5.166667,5.5,3.033333,4.0,6.666667,6.5,5.5,3.0,5,0.533333,0.666667,0.7,0.616667,0.633333,22,28,11,7,11,0.606667
3,105,0.029611,8.076923,0.13052,57.649231,141.72,13,0.064356,6.461538,15,267,0.046629,20.538462,0.789103,0.161515,0.037497,0.011885,0.297727,0.238513,0.230916,0.232844,38,-0.6,1.006.371.139,2.010.089.278,0.38186,0.254014,0.236068168,0.866593847,0.199153824,103.894.739,2,0.501416,0.157377,0.02898,0.053947,0.759697,0.245048,0.410218,0.190879,0.153855,0.226931,0.238211,0.534858,4.5,3.5,2.666667,6.0,6.0,4.533333,5.0,5.5,5.5,4.333333,2.833333,5,0.566667,0.683333,0.716667,0.533333,0.716667,23,28,9,6,7,0.663333
4,162,0.045685,6.48,0.104714,159.4216,836.46,25,0.123762,6.6,29,394,0.068809,15.76,0.712697,0.196289,0.077861,0.013153,0.3001,0.235307,0.232877,0.231716,27,-0.7,1.197.692.111,2.333.088.249,0.250852,0.259828,0.486946776,2.016.371.149,0.467754404,1.159.895.116,2,0.493443,0.258578,0.073231,0.112001,0.55619,0.319698,0.32277,0.225624,0.131909,0.256794,0.232465,0.51074,2.166667,1.5,1.0,4.166667,4.833333,2.733333,5.833333,6.0,6.333333,5.0,4.166667,5,0.566667,0.783333,0.733333,0.733333,0.633333,17,24,17,13,8,0.703333
5,105,0.029611,5.526316,0.089303,43.632632,169.2,19,0.094059,4.526316,17,271,0.047328,14.263158,0.746439,0.193365,0.054215,0.005981,0.303593,0.235377,0.233832,0.227198,22,-0.75,8.437.013.889,2.777.083.337,0.3056,0.244289,0.36035958,178.005.748,0.421674039,0.970075875,1,0.555323,0.362543,0.000314,0.136496,0.500648,0.157227,0.472688,0.321906,0.048179,0.342162,0.120088,0.537751,3.166667,2.666667,3.5,5.166667,3.666667,3.633333,3.666667,5.833333,3.666667,3.0,3.666667,4,0.6,0.666667,0.483333,0.583333,0.433333,12,15,11,19,4,0.623333


### Model

In [None]:
# Keep int/float columns and rows with theory >= 0.5, then append the 'mark' class column.
df= prepare_df_for_modeling(df)
df= include_mark_classification(df)
# Scale every column except the last two -- presumably 'theory' and 'mark' (verify column order).
df= min_max_scaling_df(df,df.columns.to_list()[:-2])
# 'mark' is derived from 'theory', so 'theory' is removed before modeling.
df.drop(columns=['theory'], inplace=True)

In [None]:
df.shape

(55, 61)

In [None]:
#df.head(100)
import random

In [None]:
# Evaluate a random sample of feature-index combinations and collect one
# result row (feature names plus per-model accuracy statistics) per combination.
results_list = []

# Column indexes appended to every generated combination.
my_list = [1, 22]
#selected_features = ['indiv_spoken_time_ratio', 'contribution_index', 'in_group_loyality_score']

all_comb = generate_features_index(my_list)
sample_size = 2
all_comb = random.sample(all_comb, sample_size)

for combination in all_comb:
    # Evaluate the sampled combination itself. The loop previously passed a
    # fixed index list, so every row described the same features. A copy is
    # passed so the entries of all_comb are never altered by the callee.
    df_test, index = select_columns_by_index(df, list(combination))
    results = evaluate_classification_models(df_test)

    logistic_results = results.get('Logistic Regression', {})
    rf_results = results.get('Random Forest', {})

    result_dict = {
        'Index': index,
        'logistic_regression_mean': logistic_results.get('Mean Accuracy'),
        'logistic_regression_stdev': logistic_results.get('Standard Deviation'),
        'random_forest_mean': rf_results.get('Mean Accuracy'),
        'random_forest_stdev': rf_results.get('Standard Deviation')
    }

    results_list.append(result_dict)

# Create the DataFrame from the results list
results_df = pd.DataFrame(results_list)

In [None]:
# Rank the combinations by mean random-forest accuracy, best first.
results_df = results_df.sort_values('random_forest_mean', ascending=False)
results_df.head()

Unnamed: 0,Index,logistic_regression_mean,logistic_regression_stdev,random_forest_mean,random_forest_stdev
0,"[speech_hap, emotionality_avg, health_likeliho...",0.545455,0.057496,0.672727,0.13606
1,"[speech_hap, emotionality_avg, health_likeliho...",0.545455,0.057496,0.672727,0.044536


In [None]:
# Write at most the first 1000 rows of results_df to an Excel file, without the index column.
out_file= r'/content/drive/MyDrive/Projects/tps/finals/data/classification_0.33_model_all_features.xlsx'
results_df.head(1000).to_excel(out_file, index=False)

In [None]:
results_df.describe()

Unnamed: 0,logistic_regression_mean,logistic_regression_stdev,random_forest_mean,random_forest_stdev
count,30000.0,30000.0,30000.0,30000.0
mean,0.529418,0.046486,0.453821,0.089508
std,0.027331,0.02648,0.056016,0.034676
min,0.418182,0.0,0.218182,0.0
25%,0.509091,0.036364,0.418182,0.06803
50%,0.527273,0.044536,0.454545,0.089072
75%,0.545455,0.06803,0.490909,0.109091
max,0.636364,0.185419,0.727273,0.265977
