# Group Surveys Feature Determination

## Preparation

### Import

In [None]:
import itertools
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import label_binarize
from scipy import interp
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from google.colab import drive
# Mount Google Drive at /content/drive; the input workbook read later in this
# notebook lives under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Raise the pandas display limits to 100 columns and 1000 rows.
for option_name, limit in (('display.max_columns', 100), ('display.max_rows', 1000)):
    pd.set_option(option_name, limit)

## Function Definition

### Include Marks Classification

In [None]:
def include_mark_classification(df):
    """Add a three-level ``mark`` class derived from the ``theory`` score.

    Rows with ``theory`` below 0.5 are discarded. The remaining rows are
    binned as follows:

    * 1 -- 0.5  <= theory < 0.65
    * 2 -- 0.65 <= theory < 0.8
    * 3 -- 0.8  <= theory <= 1

    Rows whose ``theory`` matches no bin (i.e. above 1) and rows holding a
    missing value in any column are dropped.

    Args:
        df: DataFrame with a numeric ``theory`` column.

    Returns:
        A new DataFrame with an integer ``mark`` column appended as the last
        column. The input frame is not modified.
    """
    limit1 = 0.65
    limit2 = 0.8

    # Work on an explicit copy: assigning a column to a filtered slice
    # triggers pandas' SettingWithCopyWarning and could touch the caller's data.
    df = df[df['theory'] >= 0.5].copy()

    # Define the conditions and corresponding values
    conditions = [
        (df['theory'] >= 0.5) & (df['theory'] < limit1),
        (df['theory'] >= limit1) & (df['theory'] < limit2),
        (df['theory'] >= limit2) & (df['theory'] <= 1),
    ]
    values = [1, 2, 3]

    # The fallback has to be a real NaN (not the string 'np.NaN'): a string
    # default turns the whole column into text and hides unmatched rows from
    # dropna().
    df['mark'] = np.select(conditions, values, default=np.nan)

    # Drop unmatched / incomplete rows, then store the class as an integer.
    return df.dropna(how='any').astype({'mark': int})

### Select Columns by Index

In [None]:
def select_columns_by_index(df, column_indexes):
    """Select columns by position, always keeping the last column as well.

    Args:
        df: Source DataFrame.
        column_indexes: Positional indexes of the columns to keep.

    Returns:
        A tuple ``(selected_columns, selected_column_names)`` where
        ``selected_columns`` is a copy of the requested columns followed by
        the last column of ``df`` and ``selected_column_names`` lists the
        names of those columns in the same order.
    """
    # Build a new list rather than appending to the caller's list, so the
    # argument is not modified as a side effect of the call.
    cols = list(column_indexes) + [len(df.columns) - 1]

    selected_columns = df.iloc[:, cols].copy()
    selected_column_names = df.columns[cols].tolist()

    return selected_columns, selected_column_names

### Prepare DataFrame for Modeling

In [None]:
def prepare_df_for_modeling(df):
    """Keep only the int/float columns and the rows with ``theory`` >= 0.5.

    Args:
        df: DataFrame containing a numeric ``theory`` column.

    Returns:
        The filtered DataFrame with a fresh 0..n-1 index.
    """
    numeric_only = df.select_dtypes(include=['int', 'float'])
    passing_rows = numeric_only['theory'] >= 0.5
    return numeric_only[passing_rows].reset_index(drop=True)

### Return Dataframe by Columns

In [None]:
def return_df_by_columns(df, column_names):
    """Return the requested columns with ``theory`` appended as the last column.

    Args:
        df: Source DataFrame containing a ``theory`` column.
        column_names: Names of the columns to keep.

    Returns:
        A new DataFrame made of a copy of ``column_names`` followed by
        ``theory``.
    """
    pieces = (df[column_names].copy(), df['theory'])
    return pd.concat(pieces, axis=1)

### Get Indexes by Column names

In [None]:
def get_column_indexes(df, columns):
    """Return the positional index of each named column, in the order given.

    Args:
        df: DataFrame whose columns are looked up.
        columns: Iterable of column names.

    Returns:
        A list with one position per entry of ``columns``.
    """
    locate = df.columns.get_loc
    return list(map(locate, columns))

## Use of Function

### Import

In [None]:
# Path of the input workbook on the mounted Google Drive
in_file= r'/content/drive/MyDrive/Projects/tps/finals/data/3_individual_features.xlsx'

# Read the Excel file into a DataFrame, using 'Id' column as the index
df= pd.read_excel(in_file, index_col='Id')

### Cleaning

In [None]:
# Drop unnecessary columns (removed from df in place)
not_used_columns=['avg_time_without_speaking_ratio', 'max_time_without_speaking_ratio', 'avg_turns_without_speaking_ratio', 'max_turns_without_speaking_ratio', 'avg_words_turn_ratio', 'max_words_turn', 'max_words_turn_ratio', 'messages_total', 'alter_art', 'alter_nudges', 'complexity_avg', 'dummy_question1', 'dummy_question2', 'q1', 'q2', 'q3', 'q4', 'q5', 'q6', 'q7', 'q8', 'q9', 'q10', 'conservation', 'transcendence', 'coeval', 'project']
df.drop(columns=not_used_columns, inplace=True)

# Transform marks into 0-1 by dividing by 6
# (presumably the raw 'theory' mark is on a 0-6 scale -- TODO confirm)
df['theory'] = df['theory'] / 6

In [None]:
# Preview the first 70 rows of the cleaned frame
df.head(70)

Unnamed: 0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,messages_sent,contribution_index,sentiment_avg,emotionality_avg,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory,mark
0,526,0.148336,7.850746,0.126865,56.210746,451.54,67,0.331683,1.985075,7,978,0.1708,14.597015,0.832695,0.137502,0.026347,0.003456,0.298809,0.236814,0.232361,0.232015,34,-0.64,0.303561,0.215562,1,0.487649,0.4720845,0.08388939,0.08914,0.3548859,0.236101,0.407447,0.251288,0.105165,0.497477,0.156358,0.346165,4.5,3.5,4.166667,3.166667,3.5,3.766667,4.833333,5.666667,5.666667,5.833333,4.833333,5,0.6,0.716667,0.6,0.633333,0.583333,27,23,19,18,20,0.573333,1
1,2648,0.746757,33.948718,0.548598,21.791169,109.24,78,0.386139,1.589744,9,3816,0.666434,48.923077,0.795958,0.155654,0.045315,0.003074,0.29295,0.239695,0.227768,0.239587,75,-0.34,0.250396,0.236507,2,0.398994,0.4385152,0.04023221,0.107725,0.4135278,0.215535,0.417419,0.208064,0.158982,0.300955,0.220056,0.478989,2.0,1.5,1.0,5.166667,5.5,3.033333,4.0,6.666667,6.5,5.5,3.0,5,0.533333,0.666667,0.7,0.616667,0.633333,22,28,11,7,11,0.606667,1
2,105,0.029611,8.076923,0.13052,57.649231,141.72,13,0.064356,6.461538,15,267,0.046629,20.538462,0.789103,0.161515,0.037497,0.011885,0.297727,0.238513,0.230916,0.232844,38,-0.6,0.38186,0.254014,2,0.501416,0.1573771,0.02897965,0.053947,0.7596967,0.245048,0.410218,0.190879,0.153855,0.226931,0.238211,0.534858,4.5,3.5,2.666667,6.0,6.0,4.533333,5.0,5.5,5.5,4.333333,2.833333,5,0.566667,0.683333,0.716667,0.533333,0.716667,23,28,9,6,7,0.663333,2
3,162,0.045685,6.48,0.104714,159.4216,836.46,25,0.123762,6.6,29,394,0.068809,15.76,0.712697,0.196289,0.077861,0.013153,0.3001,0.235307,0.232877,0.231716,27,-0.7,0.250852,0.259828,2,0.493443,0.2585777,0.07323091,0.112001,0.5561903,0.319698,0.32277,0.225624,0.131909,0.256794,0.232465,0.51074,2.166667,1.5,1.0,4.166667,4.833333,2.733333,5.833333,6.0,6.333333,5.0,4.166667,5,0.566667,0.783333,0.733333,0.733333,0.633333,17,24,17,13,8,0.703333,2
4,105,0.029611,5.526316,0.089303,43.632632,169.2,19,0.094059,4.526316,17,271,0.047328,14.263158,0.746439,0.193365,0.054215,0.005981,0.303593,0.235377,0.233832,0.227198,22,-0.75,0.3056,0.244289,1,0.555323,0.3625429,0.000314,0.136496,0.5006478,0.157227,0.472688,0.321906,0.048179,0.342162,0.120088,0.537751,3.166667,2.666667,3.5,5.166667,3.666667,3.633333,3.666667,5.833333,3.666667,3.0,3.666667,4,0.6,0.666667,0.483333,0.583333,0.433333,12,15,11,19,4,0.623333,1
5,1346,0.337682,8.518987,0.185779,19.887848,169.82,158,0.364055,1.740506,9,4025,0.329109,25.474684,0.76806,0.18186,0.043375,0.006704,0.29778,0.237582,0.227811,0.236827,22,-0.36,0.404612,0.268756,4,0.52,0.3625971,0.1342965,0.00189,0.5012164,0.152454,0.624326,0.145505,0.077715,0.332323,0.221886,0.445791,1.833333,3.833333,3.666667,6.166667,5.5,4.2,3.5,4.0,6.333333,4.333333,2.833333,4,0.683333,0.733333,0.883333,0.583333,0.416667,24,26,13,13,15,0.773333,2
6,298,0.074762,11.461538,0.249949,162.1672,1529.54,26,0.059908,15.384615,125,764,0.062469,29.384615,0.782264,0.203949,0.01221,0.001577,0.300151,0.233183,0.230322,0.236344,8,-0.71,0.746794,0.579783,4,0.529475,0.02949133,0.1252847,0.000506,0.8447182,0.121923,0.68549,0.083875,0.108711,0.535406,0.037475,0.427119,4.666667,3.666667,4.333333,4.5,4.666667,4.366667,3.666667,4.833333,6.0,5.333333,4.333333,5,0.533333,0.683333,0.716667,0.5,0.516667,28,20,21,23,24,0.601667,1
7,697,0.174862,8.822785,0.192404,47.935696,1421.86,79,0.182028,4.455696,108,2062,0.168602,26.101266,0.735439,0.228609,0.03055,0.005402,0.295444,0.239446,0.225465,0.239646,8,-0.71,0.42855,0.236425,1,0.518237,0.2492233,0.05710145,0.193481,0.5001939,0.070225,0.553516,0.30907,0.067189,0.065323,0.31168,0.622997,3.0,2.666667,3.166667,5.0,5.0,3.766667,3.666667,5.833333,4.333333,3.0,2.833333,4,0.616667,0.7,0.716667,0.716667,0.55,17,23,16,13,14,0.728333,2
8,276,0.069242,6.272727,0.136793,25.912727,218.84,44,0.101382,3.113636,24,857,0.070074,19.477273,0.710042,0.214038,0.066733,0.009187,0.298211,0.23699,0.226584,0.238215,24,-0.32,0.259263,0.253813,3,0.497027,0.1932707,0.03850988,0.087476,0.6807433,0.150721,0.602558,0.153305,0.093417,0.221334,0.277272,0.501394,1.5,2.0,1.333333,3.166667,5.0,2.6,5.333333,4.5,6.166667,5.5,3.166667,5,0.533333,0.7,0.583333,0.55,0.583333,27,26,18,18,18,0.888333,3
9,266,0.166562,9.851852,0.207658,95.963077,1446.52,27,0.174194,4.740741,40,543,0.164945,20.111111,0.85065,0.140287,0.007156,0.001907,0.268266,0.257794,0.243564,0.230376,9,-0.7,0.41733,0.146854,4,0.54717,0.4430857,0.000356,0.333701,0.2228573,0.290449,0.443599,0.177075,0.088877,0.399318,0.391894,0.208788,1.666667,5.666667,3.166667,6.5,5.0,4.4,6.666667,4.166667,6.0,4.666667,4.0,5,0.583333,0.733333,0.616667,0.65,0.383333,23,23,20,23,18,0.928333,3


### Preparation

In [None]:
# Make sure every column is composed by numbers
# (keeps the int/float columns and the rows with theory >= 0.5, index reset)
df= prepare_df_for_modeling(df)

# Include the classification (new column: 'mark')
# 'mark' is derived from 'theory' using the 0.5 / 0.65 / 0.8 thresholds
df= include_mark_classification(df)

In [None]:

# Group the DataFrame by 'mark' and calculate the average of other columns
# (one row per class, used to compare the feature means across classes)
grouped_df = df.groupby('mark').mean()

grouped_df.head()


Unnamed: 0_level_0,indiv_spoken_time,indiv_spoken_time_ratio,average_turn_duration,average_turn_duration_ratio,avg_time_without_speaking,max_time_without_speaking,num_turns,num_turns_ratio,avg_turns_without_speaking,max_turns_without_speaking,num_words,num_words_ratio,avg_words_turn,speech_neu,speech_ang,speech_hap,speech_sad,text_joy,text_anger,text_fear,text_sadness,messages_sent,contribution_index,sentiment_avg,emotionality_avg,contribution_index_oscillation,activity_entanglement,ALTERNATIVE_REALITIES_Treehugger,ALTERNATIVE_REALITIES_Fatherlander,ALTERNATIVE_REALITIES_Spiritualism,ALTERNATIVE_REALITIES_Nerd,EMOTIONS_Fear,EMOTIONS_Happy,EMOTIONS_Sad,EMOTIONS_Anger,Groupflow_Beeflow,Groupflow_Leechflow,Groupflow_Antflow,ethical_likelihood,financial_likelihood,health_likelihood,recreational_likelihood,social_likelihood,total_likelihood,ethical_perceived,financial_perceived,health_perceived,recreational_perceived,social_perceived,total_perceived,O,C,E,A,N,harm_care_score,fairness_reciprocity_score,in_group_loyality_score,authority_respect_score,purity_sanctity_score,theory
mark,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1
1,633.133333,0.211852,11.638873,0.213002,72.145966,627.696667,49.433333,0.200377,4.66425,31.566667,1329.0,0.210984,25.064528,0.792493,0.178984,0.025318,0.003206,0.29384,0.237682,0.234083,0.234395,31.9,-0.639,0.381294,0.272791,2.866667,0.481637,0.268946,0.080709,0.15633,0.494014,0.205704,0.468833,0.196784,0.128678,0.273088,0.181244,0.545668,3.061111,3.666667,3.594444,5.233333,5.166667,4.144444,4.588889,5.227778,5.083333,4.238889,3.483333,4.433333,0.621111,0.675,0.658889,0.604444,0.548333,20.233333,21.766667,16.566667,14.566667,12.833333,0.572389
2,4098.411765,0.212928,12.863763,0.212254,96.742884,3406.538824,161.352941,0.202204,5.362742,29.058824,4139.470588,0.203808,24.54195,0.786723,0.175513,0.031701,0.006062,0.28881,0.24094,0.236963,0.233287,19.411765,-0.578824,0.360572,0.228439,2.176471,0.480445,0.334654,0.054918,0.075818,0.534609,0.195958,0.474192,0.208322,0.121528,0.310736,0.187877,0.501387,2.313725,2.470588,2.460784,4.392157,5.058824,3.339216,4.745098,5.617647,5.647059,4.558824,3.196078,4.705882,0.610784,0.710784,0.678431,0.622549,0.556863,22.352941,23.411765,14.176471,13.882353,12.764706,0.727745
3,920.625,0.137845,8.237331,0.152557,152.77756,9855.18,132.75,0.172704,7.351467,94.875,2173.75,0.152714,20.273865,0.804051,0.164769,0.027591,0.00359,0.286585,0.240915,0.240299,0.232201,20.25,-0.575,0.371942,0.227397,3.125,0.490459,0.260956,0.04387,0.142861,0.552312,0.218862,0.456523,0.187953,0.136662,0.294069,0.215151,0.49078,2.25,2.354167,2.145833,3.979167,4.958333,3.1375,5.104167,5.520833,5.854167,5.020833,4.083333,5.0,0.59375,0.691667,0.604167,0.622917,0.55,22.0,24.0,16.125,16.875,14.75,0.861875


In [None]:
# NOTE(review): sys.exit() raises SystemExit, so running the notebook top to
# bottom stops at this cell and the cells below are never reached. This looks
# like a leftover breakpoint -- confirm whether it should stay.
import sys
sys.exit()

In [None]:


# Scaling informative variables: every column except the last two
# (at this point the last two columns are 'theory' and 'mark')
# NOTE(review): min_max_scaling_df is not defined in the visible part of this
# notebook -- confirm where it comes from before running this cell.
df= min_max_scaling_df(df,df.columns.to_list()[:-2])

# Drop the raw 'theory' score; the derived 'mark' column stays as the target
df.drop(columns=['theory'], inplace=True)

In [None]:
# Preview the frame after scaling and after dropping 'theory'
df.head()

### Feature Selection

In [None]:
# Select the features
features_names= ['speech_hap', 'emotionality_avg', 'health_likelihood', 'E', 'indiv_spoken_time_ratio', 'contribution_index']

# Get Features Indexes
features_indexes= get_column_indexes(df, features_names)

# Get the DataFrame with the features selected
# (select_columns_by_index also keeps the last column of df, i.e. the target)
df,columns = select_columns_by_index(df, features_indexes)

In [None]:
# Preview the first 50 rows of the selected features plus the target
df.head(50)

In [None]:
# (rows, columns) of the modelling frame
df.shape

### Data Understanding

In [None]:
# One histogram per explanatory variable; the 'mark' target is skipped
for column in df.columns:
    if column == 'mark':
        continue
    # Ten equal-width bins over [0, 1]
    plt.hist(df[column], bins=10, range=(0, 1))
    plt.title(column)  # Column name as the title
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.show()


In [None]:
# Bar chart of the objective variable: number of rows per 'mark' class

# Count the frequency of values in the 'mark' column.
# The labels are converted to numbers first: if 'mark' is stored as text,
# matplotlib treats the x values as categories placed at 0, 1, 2 and the
# fixed ticks at 1, 2, 3 set below would no longer line up with the bars.
value_counts = pd.to_numeric(df['mark']).value_counts().sort_index()

# Plot bar chart
plt.bar(value_counts.index, value_counts.values)
plt.title('Classes')
plt.xlabel('Value')
plt.ylabel('Frequency')

plt.xticks([1, 2, 3])  # Set the x-ticks to match the class values

plt.show()  # Show the bar chart

### Search and Return the best Model Hyperparameters

In [None]:
# Call the function and store the returned dictionary
# (later cells read the fitted estimator from result_dict['Best Model'])
# NOTE(review): evaluate_random_forest_hyper_parameters is not defined in the
# visible part of this notebook -- confirm where it comes from.
result_dict = evaluate_random_forest_hyper_parameters(df, cv=5)

### Feature Importance

In [None]:
# Features are every column but the last; the last column is the target,
# converted to a numeric dtype.
X, y = df.iloc[:, :-1], pd.to_numeric(df.iloc[:, -1])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Compute the correlation matrix.
# 'mark' may be stored as text; since pandas 2.0 DataFrame.corr() raises on
# non-numeric columns instead of silently dropping them, so the target is
# converted first (which also keeps it in the matrix).
numeric_df = df.assign(mark=pd.to_numeric(df['mark']))
correlation_matrix = numeric_df.corr()

# Plot the correlation matrix as a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()


In [None]:
# SHAP bar summary of the best model's features
import shap
import matplotlib.pyplot as plt

# Access the best model from the result_dict
best_model = result_dict['Best Model']

# Refit on the full feature matrix and target before explaining
best_model.fit(X, y)

# Create an explainer using the trained model and the feature data
explainer = shap.Explainer(best_model, X)

# Compute SHAP values for all features
shap_values = explainer.shap_values(X)

# Plot feature importances
# (show=False: the axis labels and title below are set before plt.show())
shap.summary_plot(shap_values, X, plot_type='bar', show=False)

plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Features')
plt.title('Feature Importance')
plt.tight_layout()
plt.show()


In [None]:
!pip install eli5


In [None]:
# Permutation importance of best_model, computed on the same X / y it was
# fitted on in the previous cells
import eli5
from eli5.sklearn import PermutationImportance

# Fit the PermutationImportance model
perm = PermutationImportance(best_model).fit(X, y)

# Plot feature importances
eli5.show_weights(perm, feature_names=X.columns.tolist())


In [None]:
!pip install lime


In [None]:
# LIME explanation of the model's prediction for a single row of X
from lime import lime_tabular

# Create an explainer object
# NOTE(review): mode='regression' is combined with class_names and with
# best_model.predict; if best_model is a classifier, LIME's classification
# mode together with predict_proba may be what was intended -- confirm.
explainer = lime_tabular.LimeTabularExplainer(X.values, feature_names=X.columns, class_names=y.unique(), mode='regression')

# Select an instance for explanation (the first row of X)
instance = X.iloc[0]

# Generate an explanation for the instance using the best model's predict function
explanation = explainer.explain_instance(instance.values, best_model.predict, num_features=len(X.columns))

# Plot the feature importance
explanation.as_pyplot_figure()



In [None]:
# SHAP summary plot of the refitted best model, limited to 6 features
import shap
import matplotlib.pyplot as plt

# Access the best model from the result_dict
best_model = result_dict['Best Model']

# Refit on the full feature matrix and target before explaining
best_model.fit(X, y)

# Create an explainer using the trained model and the feature data
explainer = shap.Explainer(best_model, X)

# Compute SHAP values for all features
# (calling the explainer returns an object whose .values attribute is read by
# later cells)
shap_values = explainer(X)

# Plot feature importances
plt.figure(figsize=(16, 4))  # Increase the figure width and decrease the height
shap.summary_plot(shap_values, X.values, feature_names=X.columns, max_display=6)  # Set max_display to 6
plt.xticks(rotation=90, fontsize=8)  # Rotate and reduce font size of x-axis tick labels
plt.tight_layout()  # Apply tight layout to adjust spacing between subplots
plt.show()


In [None]:
# SHAP summary plot of the refitted best model with default display settings
import shap

# Access the best model from the result_dict
best_model = result_dict['Best Model']

# Refit on the full feature matrix and target before explaining
best_model.fit(X, y)

# Create an explainer using the trained model and the feature data
explainer = shap.Explainer(best_model, X)

# Compute SHAP values for all features
# (the next cell reads shap_values.values)
shap_values = explainer(X)

# Plot the summary plot
shap.summary_plot(shap_values, X.values, feature_names=X.columns)

# Show the plot
# (plt is not imported in this cell; it relies on the notebook-level import)
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Horizontal bar chart of per-feature SHAP importance, coloured by the sign of
# each feature's correlation with the target. Uses the shap_values computed in
# an earlier cell.

# Calculate feature importance as the mean ABSOLUTE SHAP value.
# Taking the absolute value after averaging would let positive and negative
# contributions cancel out and understate a feature's influence.
abs_shap = np.abs(shap_values.values)
# Average over every axis except the feature axis (axis 1), so that
# explanations with one extra output axis still give one number per feature.
reduce_axes = tuple(axis for axis in range(abs_shap.ndim) if axis != 1)
feature_importance = abs_shap.mean(axis=reduce_axes)

# Calculate feature correlations with the target
correlations = X.corrwith(y)

# Sort features by importance (ascending, so the largest bar ends up on top)
sorted_indices = np.argsort(feature_importance)
sorted_features = X.columns[sorted_indices]

# Create a colormap for positive and negative correlations
cmap = ['red' if corr < 0 else 'green' for corr in correlations.loc[sorted_features]]

# Plot the feature importance bars
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(range(len(sorted_features)), feature_importance[sorted_indices], color=cmap)

# Set the y-axis ticks and labels
ax.set_yticks(range(len(sorted_features)))
ax.set_yticklabels(sorted_features)

# Set the x-axis label
ax.set_xlabel('Feature Importance')

# Add color legend
pos_patch = plt.Rectangle((0, 0), 1, 1, fc='green')
neg_patch = plt.Rectangle((0, 0), 1, 1, fc='red')
ax.legend([pos_patch, neg_patch], ['Positive Correlation', 'Negative Correlation'])

plt.tight_layout()
plt.show()


In [None]:

# Bar chart of the best model's feature importances, annotated with each
# feature's correlation with the target.

# Access the best model from the result_dict
best_model = result_dict['Best Model']

best_model.fit(X, y)

import numpy as np
import matplotlib.pyplot as plt

# Get feature importances from the best model
importances = best_model.feature_importances_

# Sort feature importances in descending order
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X.columns[sorted_indices]

# Select numeric columns from X
numeric_columns = X.select_dtypes(include=np.number)

# Get the correlation of each numeric feature with the target variable
feature_target_correlation = numeric_columns.corrwith(y)

# Create a list to store the direction of feature importance
# NOTE(review): if the importances are never negative (as is usual for
# tree-based feature_importances_), every entry is 'Positive' -- confirm
# whether the correlation sign was meant here instead.
importance_direction = ['Positive' if imp >= 0 else 'Negative' for imp in sorted_importances]

# Plot feature importances and their correlation with the target variable
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(range(len(sorted_features)), sorted_importances, align='center')
ax.set_yticks(range(len(sorted_features)))
ax.set_yticklabels(sorted_features)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance and Correlation with Target Variable')

# Label the end of each bar with the direction of importance
for i, direction in enumerate(importance_direction):
    color = 'green' if direction == 'Positive' else 'red'
    ax.annotate(direction, xy=(sorted_importances[i], i),
                xytext=(10, 0), textcoords='offset points', color=color)

# Add correlation values as text.
# The correlation is looked up by feature name so that each label sits next
# to its own bar: the bars are in importance order, not in X's column order.
# (Series.iteritems() was removed in pandas 2.0, so it is not used here.)
for i, feature in enumerate(sorted_features):
    correlation = feature_target_correlation.get(feature, np.nan)
    ax.annotate(f'Corr: {correlation:.2f}', xy=(0, i),
                xytext=(-60, 0), textcoords='offset points', ha='right')

plt.tight_layout()
plt.show()




In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Refit the tuned model stored in result_dict on the full data
best_model = result_dict['Best Model']
best_model.fit(X, y)

# Rank the features from most to least important
importances = best_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X.columns[sorted_indices]

# Correlation of every numeric feature with the target
numeric_columns = X.select_dtypes(include=np.number)
feature_target_correlation = numeric_columns.corrwith(y)

# Direction label per importance value
importance_direction = ['Positive' if imp >= 0 else 'Negative' for imp in sorted_importances]

# Horizontal bars, one per feature, in importance order
bar_positions = range(len(sorted_features))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, sorted_importances, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_features)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance and Correlation with Target Variable')

# Write the correlation value just past the end of each bar
for i, (feature, importance) in enumerate(zip(sorted_features, sorted_importances)):
    correlation = feature_target_correlation[feature]
    color = 'green' if correlation >= 0 else 'red'
    ax.annotate(
        f'Corr: {correlation:.2f}',
        xy=(importance, i),
        xytext=(importance + 0.01, i),
        textcoords='data',
        color=color,
    )

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Rank the already-fitted best model's features from most to least important
importances = best_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X.columns[sorted_indices]

# Correlation of each feature with the target
feature_target_correlation = X.corrwith(y)

# Direction label per importance value
importance_direction = ['Positive' if imp >= 0 else 'Negative' for imp in sorted_importances]

# Horizontal bars, one per feature, in importance order
bar_positions = range(len(sorted_features))
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, sorted_importances, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(sorted_features)
ax.set_xlabel('Feature Importance')
ax.set_ylabel('Features')
ax.set_title('Feature Importance and Correlation with Target Variable')

# Write only the sign of the correlation just past the end of each bar
for i, (feature, importance) in enumerate(zip(sorted_features, sorted_importances)):
    correlation = feature_target_correlation[feature]
    if correlation >= 0:
        sign, color = '+', 'green'
    else:
        sign, color = '-', 'red'
    ax.annotate(
        f'Corr: {sign}',
        xy=(importance, i),
        xytext=(importance + 0.01, i),
        textcoords='data',
        color=color,
    )

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Rank the already-fitted best model's features from most to least important
importances = best_model.feature_importances_
sorted_indices = np.argsort(importances)[::-1]
sorted_importances = importances[sorted_indices]
sorted_features = X.columns[sorted_indices]

# Correlation of each feature with the target
feature_target_correlation = X.corrwith(y)

# Direction label per importance value
importance_direction = ['Positive' if imp >= 0 else 'Negative' for imp in sorted_importances]

# Compact vertical bar chart, feature names on the x-axis
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(sorted_features, sorted_importances)
ax.set_xticklabels(sorted_features, rotation=90)
ax.set_ylabel('Feature Importance')
ax.set_xlabel('Features')

# Write the sign of the correlation above each bar
for i, (feature, importance) in enumerate(zip(sorted_features, sorted_importances)):
    correlation = feature_target_correlation[feature]
    if correlation >= 0:
        sign, color = '+', 'green'
    else:
        sign, color = '-', 'red'
    ax.annotate(
        f'Corr: {sign}',
        xy=(feature, importance),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        color=color,
    )

plt.tight_layout()
plt.show()


In [None]:
import shap
import matplotlib.pyplot as plt

# Horizontal bar chart of the mean absolute SHAP value of each feature.

# Access the best model from the result_dict
best_model = result_dict['Best Model']

# Create an explainer using the trained model and the feature data
explainer = shap.Explainer(best_model, X)

# Compute SHAP values for all features
shap_values = explainer(X)

# Calculate the mean absolute SHAP values for each feature.
# Average over every axis except the feature axis (axis 1): with a
# multi-output model the values carry an extra output axis, and reducing only
# over the samples would leave a 2-D array that cannot be used to sort or
# index the column names below.
abs_shap = np.abs(shap_values.values)
reduce_axes = tuple(axis for axis in range(abs_shap.ndim) if axis != 1)
mean_abs_shap_values = np.mean(abs_shap, axis=reduce_axes)

# Sort the features and their corresponding SHAP values in descending order
sorted_indices = np.argsort(mean_abs_shap_values)[::-1]
sorted_features = X.columns[sorted_indices]
sorted_shap_values = mean_abs_shap_values[sorted_indices]

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_features)), sorted_shap_values)  # Use the sorted SHAP values directly
plt.yticks(range(len(sorted_features)), sorted_features)
plt.xlabel('Mean Absolute SHAP Value')
plt.ylabel('Features')
plt.title('Feature Importances')
plt.tight_layout()
plt.show()


In [None]:
# Features are every column but the last; the last column is the target
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Fetch the tuned model and refit it on the entire dataset
best_model = result_dict['Best Model']
best_model.fit(X, y)

# 5-fold cross-validated accuracy of the same estimator
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", cv_scores)
print("Mean Accuracy:", np.mean(cv_scores))

# Feature importances of the model fitted on the entire dataset above
feature_importances = best_model.feature_importances_

# Order the importances (and the matching column names) from high to low
sorted_indices = np.argsort(feature_importances)[::-1]
sorted_feature_importances = feature_importances[sorted_indices]
sorted_feature_names = df.columns[sorted_indices]

# Vertical bar chart, one bar per feature
tick_positions = range(len(sorted_feature_importances))
plt.figure(figsize=(10, 6))
plt.bar(tick_positions, sorted_feature_importances)
plt.xticks(tick_positions, sorted_feature_names, rotation=90)
plt.xlabel('Features')
plt.ylabel('Importance')
plt.title('Feature Importance (Cross-Validation)')
plt.tight_layout()
plt.show()


### Confusion Matrix

In [None]:
# Confusion matrix built from out-of-fold predictions of a 5-fold stratified
# cross-validation.

# Perform cross-validation predictions
skf = StratifiedKFold(n_splits=5)
# One prediction slot per sample, with the same dtype as the labels
y_pred_cv = np.zeros_like(y.to_numpy())

for train_index, test_index in skf.split(X, y):
    # split() yields POSITIONS, so both X and y must be indexed with .iloc;
    # plain y[...] is label-based and fails or picks the wrong rows whenever
    # the index is not exactly 0..n-1 (e.g. after rows were dropped).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    best_model.fit(X_train, y_train)
    y_pred_cv[test_index] = best_model.predict(X_test)

# Compute confusion matrix
cm = confusion_matrix(y, y_pred_cv)

# Define class labels
class_labels = ['Class 1', 'Class 2', 'Class 3']  # Replace with your actual class labels

# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, cmap='Blues', fmt='g', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()


### ROC Curve

In [None]:
# One-vs-rest ROC curves built from out-of-fold class probabilities of a
# 5-fold stratified cross-validation.

# Perform cross-validation predictions
skf = StratifiedKFold(n_splits=5)
y_bin = label_binarize(y, classes=np.unique(y))
n_classes = y_bin.shape[1]
y_prob_cv = np.zeros((len(y), len(np.unique(y))))

for train_index, test_index in skf.split(X, y):
    # split() yields POSITIONS, so both X and y must be indexed with .iloc;
    # plain y[...] is label-based and fails or picks the wrong rows whenever
    # the index is not exactly 0..n-1 (e.g. after rows were dropped).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    best_model.fit(X_train, y_train)
    y_prob_cv[test_index] = best_model.predict_proba(X_test)

# Compute ROC curve and ROC AUC score for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_prob_cv[:, i])
    roc_auc[i] = roc_auc_score(y_bin[:, i], y_prob_cv[:, i])

# Compute micro-average ROC curve and ROC AUC score
fpr["micro"], tpr["micro"], _ = roc_curve(y_bin.ravel(), y_prob_cv.ravel())
roc_auc["micro"] = roc_auc_score(y_bin, y_prob_cv, average="micro")

# Define class labels
class_labels = ['Class 1', 'Class 2', 'Class 3']  # Replace with your actual class labels

# Plot ROC curves for each class
plt.figure(figsize=(8, 6))
colors = itertools.cycle(['blue', 'red', 'green', 'purple', 'orange'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label='ROC curve for {} (AUC = {:.2f})'.format(class_labels[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--')  # Random guess line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC)')
plt.legend(loc='lower right')
plt.show()
