In [226]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

In [253]:
df = pd.read_csv("models/EEG.machinelearing_data_BRMH.csv")

In [254]:
df['specific.disorder'].unique()

array(['Alcohol use disorder', 'Acute stress disorder',
       'Depressive disorder', 'Healthy control',
       'Behavioral addiction disorder', 'Obsessive compulsitve disorder',
       'Schizophrenia', 'Panic disorder', 'Social anxiety disorder',
       'Posttraumatic stress disorder', 'Adjustment disorder',
       'Bipolar disorder'], dtype=object)

In [255]:
df.head()

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
0,1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,36.0,21.72,...,55.99,16.74,23.45,45.68,30.17,16.92,48.85,9.42,34.51,28.61
1,2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,13.43,11.0,...,45.6,17.51,26.78,28.2,57.11,32.38,60.35,13.9,57.83,43.46
2,3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.94,27.54,...,99.48,70.65,39.13,69.92,71.06,38.53,69.91,27.18,64.8,31.49
3,4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,21.5,21.85,...,59.99,63.82,36.48,47.12,84.66,24.72,50.3,35.32,79.82,41.14
4,5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,37.78,33.61,...,61.46,59.17,51.47,58.64,80.69,62.14,75.89,61.0,87.46,70.53


In [256]:
df.columns

Index(['no.', 'sex', 'age', 'eeg.date', 'education', 'IQ', 'main.disorder',
       'specific.disorder', 'AB.A.delta.a.FP1', 'AB.A.delta.b.FP2',
       ...
       'COH.F.gamma.o.Pz.p.P4', 'COH.F.gamma.o.Pz.q.T6',
       'COH.F.gamma.o.Pz.r.O1', 'COH.F.gamma.o.Pz.s.O2',
       'COH.F.gamma.p.P4.q.T6', 'COH.F.gamma.p.P4.r.O1',
       'COH.F.gamma.p.P4.s.O2', 'COH.F.gamma.q.T6.r.O1',
       'COH.F.gamma.q.T6.s.O2', 'COH.F.gamma.r.O1.s.O2'],
      dtype='object', length=1149)

In [231]:
# df.drop(columns=["Unnamed: 0"], axis = 1, inplace=True)

In [232]:
# df.dropna(inplace=True) 

In [257]:
df.isna().sum().sum()

973

In [234]:
# df.columns

Index(['no.', 'sex', 'age', 'eeg.date', 'education', 'IQ', 'main.disorder',
       'specific.disorder', 'AB.A.delta.a.FP1', 'AB.A.delta.b.FP2',
       ...
       'COH.F.gamma.o.Pz.p.P4', 'COH.F.gamma.o.Pz.q.T6',
       'COH.F.gamma.o.Pz.r.O1', 'COH.F.gamma.o.Pz.s.O2',
       'COH.F.gamma.p.P4.q.T6', 'COH.F.gamma.p.P4.r.O1',
       'COH.F.gamma.p.P4.s.O2', 'COH.F.gamma.q.T6.r.O1',
       'COH.F.gamma.q.T6.s.O2', 'COH.F.gamma.r.O1.s.O2'],
      dtype='object', length=1149)

In [258]:
"""iq_conditions = [(df['IQ']<85), (df['IQ']>115), (df['IQ']<=115) and (df['IQ']>=85)]
iq_bins = ['low', 'high', 'mid']
age_conditions = [(df['age']<25), (df['age']>55), (df['age']<=55) and (df['age']>=25)]
age_bins = ['young', 'old', 'mid']"""

df['IQ_bins'] = np.where(df['IQ']<85, 'low',
                         np.where(df['IQ']>115, 'high', 'mid'))
df['age_bins'] =np.where(df['age']<25, 'young',
                         np.where(df['age']>55, 'old', 'mid'))

In [259]:
df

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2,IQ_bins,age_bins
0,1,M,57.00,2012.8.30,,,Addictive disorder,Alcohol use disorder,36.00,21.72,...,23.45,45.68,30.17,16.92,48.85,9.42,34.51,28.61,mid,old
1,2,M,37.00,2012.9.6,6.00,120.00,Addictive disorder,Alcohol use disorder,13.43,11.00,...,26.78,28.20,57.11,32.38,60.35,13.90,57.83,43.46,high,mid
2,3,M,32.00,2012.9.10,16.00,113.00,Addictive disorder,Alcohol use disorder,29.94,27.54,...,39.13,69.92,71.06,38.53,69.91,27.18,64.80,31.49,mid,mid
3,4,M,35.00,2012.10.8,18.00,126.00,Addictive disorder,Alcohol use disorder,21.50,21.85,...,36.48,47.12,84.66,24.72,50.30,35.32,79.82,41.14,high,mid
4,5,M,36.00,2012.10.18,16.00,112.00,Addictive disorder,Alcohol use disorder,37.78,33.61,...,51.47,58.64,80.69,62.14,75.89,61.00,87.46,70.53,mid,mid
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,22.00,2014.8.28,13.00,116.00,Healthy control,Healthy control,41.85,36.77,...,63.97,63.98,51.24,62.20,62.06,31.01,31.18,98.33,high,young
941,942,M,26.00,2014.9.19,13.00,118.00,Healthy control,Healthy control,18.99,19.40,...,44.76,49.79,98.91,54.02,93.90,52.74,92.81,56.32,high,mid
942,943,M,26.00,2014.9.27,16.00,113.00,Healthy control,Healthy control,28.78,32.37,...,45.55,33.64,46.69,19.38,41.05,7.05,41.96,19.09,mid,mid
943,944,M,24.00,2014.9.20,13.00,107.00,Healthy control,Healthy control,19.93,25.20,...,41.25,28.19,48.67,42.01,28.74,27.18,27.53,20.03,mid,young


In [260]:
"""iq_conditions = [(df['IQ']<85), (df['IQ']>115), (df['IQ']<=115) and (df['IQ']>=85)]
iq_bins = ['low', 'high', 'mid']
age_conditions = [(df['age']<25), (df['age']>55), (df['age']<=55) and (df['age']>=25)]
age_bins = ['young', 'old', 'mid']"""

df['IQ_bins'] = np.where(df['IQ']<85, 'low',
                         np.where(df['IQ']>115, 'high', 'mid'))
df['age_bins'] = np.where(df['age']<25, 'young',
                         np.where(df['age']>55, 'old', 'mid'))
df_coh_a = [col for col in df.columns if col[:5]=='COH.A']
df_coh_b = [col for col in df.columns if col[:5]=='COH.B']
df_coh_c = [col for col in df.columns if col[:5]=='COH.C']
df_coh_d = [col for col in df.columns if col[:5]=='COH.D']
df_coh_e = [col for col in df.columns if col[:5]=='COH.E']
df_coh_f = [col for col in df.columns if col[:5]=='COH.F']
df_ab = [col for col in df.columns if col[:2]=='AB']
df['COH_A_MEAN'] = df[df_coh_a].mean(axis=1)
df['COH_B_MEAN'] = df[df_coh_a].mean(axis=1)
df['COH_C_MEAN'] = df[df_coh_a].mean(axis=1)
df['COH_D_MEAN'] = df[df_coh_a].mean(axis=1)
df['COH_E_MEAN'] = df[df_coh_a].mean(axis=1)
df['COH_F_MEAN'] = df[df_coh_a].mean(axis=1)
df['AB_MEAN'] = df[df_ab].mean(axis=1)
df_new = df[['no.', 'sex','main.disorder','specific.disorder', 'IQ_bins', 'age_bins',
       'COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN', 'COH_E_MEAN',
       'COH_F_MEAN', 'AB_MEAN']]
agg_cols_new = ['COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN', 'COH_E_MEAN',
       'COH_F_MEAN', 'AB_MEAN']

In [261]:
df_new_check.columns

Index(['no.', 'sex', 'main.disorder', 'specific.disorder', 'IQ_bins',
       'age_bins', 'COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN',
       'COH_E_MEAN', 'COH_F_MEAN', 'AB_MEAN'],
      dtype='object')

In [268]:
X.columns

Index(['no.', 'sex', 'specific.disorder', 'IQ_bins', 'age_bins', 'COH_A_MEAN',
       'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN', 'COH_E_MEAN', 'COH_F_MEAN',
       'AB_MEAN'],
      dtype='object')

In [270]:
rfc=models['Random Forest']

In [273]:
rfc.feature_names_in_

array(['no.', 'sex', 'specific.disorder', 'IQ_bins', 'age_bins',
       'COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN',
       'COH_E_MEAN', 'COH_F_MEAN', 'AB_MEAN'], dtype=object)

In [275]:
df_new.copy()

Unnamed: 0,no.,sex,main.disorder,specific.disorder,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
0,1,M,Addictive disorder,Alcohol use disorder,mid,old,16.43,16.43,16.43,16.43,16.43,16.43,14.81
1,2,M,Addictive disorder,Alcohol use disorder,high,mid,33.36,33.36,33.36,33.36,33.36,33.36,13.86
2,3,M,Addictive disorder,Alcohol use disorder,mid,mid,28.26,28.26,28.26,28.26,28.26,28.26,11.04
3,4,M,Addictive disorder,Alcohol use disorder,high,mid,18.13,18.13,18.13,18.13,18.13,18.13,8.94
4,5,M,Addictive disorder,Alcohol use disorder,mid,mid,25.83,25.83,25.83,25.83,25.83,25.83,12.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,Healthy control,Healthy control,high,young,42.48,42.48,42.48,42.48,42.48,42.48,15.21
941,942,M,Healthy control,Healthy control,high,mid,81.03,81.03,81.03,81.03,81.03,81.03,7.60
942,943,M,Healthy control,Healthy control,mid,mid,29.12,29.12,29.12,29.12,29.12,29.12,9.17
943,944,M,Healthy control,Healthy control,mid,young,38.21,38.21,38.21,38.21,38.21,38.21,14.96


In [278]:
df_new_check = df_new.copy()
# df_new_check.drop(columns=['specific.disorder'], axis = 1, inplace = True)

# Separate features and target variable
X = df_new_check.drop('main.disorder', axis=1) # Features
y = df_new_check['main.disorder']  # Target variable

# Encode categorical features (optional, but recommended)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [267]:
df['specific.disorder'].unique()

array(['Alcohol use disorder', 'Acute stress disorder',
       'Depressive disorder', 'Healthy control',
       'Behavioral addiction disorder', 'Obsessive compulsitve disorder',
       'Schizophrenia', 'Panic disorder', 'Social anxiety disorder',
       'Posttraumatic stress disorder', 'Adjustment disorder',
       'Bipolar disorder'], dtype=object)

In [288]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Corrected XGBoost import
from sklearn.preprocessing import LabelEncoder

# Separate features and target variable
X = df_new_check.drop('main.disorder', axis=1)  # Features
y = df_new_check['main.disorder']  # Target variable

# Encode categorical features (optional, but recommended)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train models
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs'),
    "Random Forest": RandomForestClassifier()
}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train.to_numpy(), y_train.to_numpy())

    # Make predictions on the testing set
    y_pred = model.predict(X_test.to_numpy())

    # Evaluate the model performance
    accuracy = accuracy_score(y_test.to_numpy(), y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    print(f"{model.class_weight}")
    print(f"\n**{model_name} Results:**")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)


None

**Logistic Regression Results:**
Accuracy: 0.48148148148148145
Precision: 0.3738170190744716
Recall: 0.48148148148148145
None

**Random Forest Results:**
Accuracy: 0.9682539682539683
Precision: 0.9715545477450239
Recall: 0.9682539682539683


In [285]:
y_pred

array(['Anxiety disorder', 'Schizophrenia',
       'Obsessive compulsive disorder', 'Addictive disorder',
       'Addictive disorder', 'Addictive disorder', 'Schizophrenia',
       'Schizophrenia', 'Trauma and stress related disorder',
       'Obsessive compulsive disorder', 'Mood disorder', 'Mood disorder',
       'Trauma and stress related disorder', 'Healthy control',
       'Mood disorder', 'Mood disorder', 'Mood disorder', 'Mood disorder',
       'Mood disorder', 'Addictive disorder',
       'Trauma and stress related disorder', 'Healthy control',
       'Addictive disorder', 'Trauma and stress related disorder',
       'Anxiety disorder', 'Mood disorder', 'Schizophrenia',
       'Mood disorder', 'Mood disorder', 'Mood disorder',
       'Addictive disorder', 'Anxiety disorder', 'Addictive disorder',
       'Mood disorder', 'Trauma and stress related disorder',
       'Mood disorder', 'Healthy control', 'Healthy control',
       'Obsessive compulsive disorder', 'Healthy control',
 

In [284]:
models['Random Forest'].class_weight

In [240]:
import pickle

In [280]:
pickle.dump(models['Random Forest'], open('EEGClassifier.pkl', 'wb'))

In [283]:
print(rfc.class_weight)

None


In [286]:
df.iloc[5:6:]

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.r.O1.s.O2,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
5,6,F,24.0,2012.11.21,14.0,105.0,Addictive disorder,Alcohol use disorder,13.48,14.1,...,86.13,mid,young,70.14,70.14,70.14,70.14,70.14,70.14,8.36


In [None]:
models['Random Forest Results'].predict()

In [266]:
X['specific.disorder'].

0      2
1      2
2      2
3      2
4      2
      ..
940    6
941    6
942    6
943    6
944    6
Name: specific.disorder, Length: 945, dtype: int32

In [90]:
agg_cols = []
key_cols = ['no.', 'sex', 'age', 'eeg.date', 'education', 'IQ', 'main.disorder',
       'specific.disorder']
for col in df.columns:
  if(col not in key_cols):
    agg_cols.append(col)

df.groupby(["main.disorder", "specific.disorder", "sex", "age"])[agg_cols].aggregate('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,AB.A.delta.e.Fz,AB.A.delta.f.F4,AB.A.delta.g.F8,AB.A.delta.h.T3,AB.A.delta.i.C3,AB.A.delta.j.Cz,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
main.disorder,specific.disorder,sex,age,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Addictive disorder,Alcohol use disorder,F,19.00,12.03,12.75,9.72,14.71,17.07,15.22,10.62,8.62,14.08,17.16,...,21.18,26.11,9.49,17.00,34.28,4.67,17.64,5.86,37.36,8.97
Addictive disorder,Alcohol use disorder,F,20.00,13.68,12.63,9.12,14.40,14.49,13.69,12.21,8.38,11.09,13.65,...,83.46,50.46,60.02,68.11,64.57,57.42,76.18,47.47,70.60,70.67
Addictive disorder,Alcohol use disorder,F,24.00,13.48,14.10,12.85,11.73,13.13,11.63,14.98,6.90,9.75,14.14,...,92.84,82.30,83.94,88.21,90.97,77.44,89.55,72.58,89.46,86.13
Addictive disorder,Alcohol use disorder,F,25.00,14.61,14.34,13.65,14.04,14.25,14.01,24.09,8.85,13.32,13.63,...,45.02,43.47,53.04,84.70,34.15,23.17,63.80,27.44,62.22,66.76
Addictive disorder,Alcohol use disorder,F,26.00,21.78,26.66,17.69,28.34,34.36,30.45,17.91,10.66,30.53,32.82,...,73.02,50.59,56.25,46.57,69.62,51.81,56.50,52.53,62.80,56.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Trauma and stress related disorder,Posttraumatic stress disorder,M,52.98,30.24,33.94,18.09,15.83,16.10,14.08,15.51,7.33,9.23,10.99,...,84.61,84.58,57.49,84.32,99.03,57.85,98.57,57.83,98.64,57.72
Trauma and stress related disorder,Posttraumatic stress disorder,M,56.60,25.16,37.32,16.99,15.60,19.11,22.05,22.54,19.98,18.91,16.97,...,86.59,55.16,65.03,68.54,76.62,55.88,78.42,42.81,73.60,70.58
Trauma and stress related disorder,Posttraumatic stress disorder,M,58.18,20.60,13.51,19.41,16.35,13.49,13.50,12.61,13.38,14.60,7.92,...,57.07,30.46,56.88,48.38,51.91,38.14,49.47,22.84,37.11,55.41
Trauma and stress related disorder,Posttraumatic stress disorder,M,59.44,16.12,14.33,11.26,11.26,7.25,7.24,7.21,12.35,12.71,12.57,...,86.36,67.65,74.00,78.71,75.80,66.89,82.77,57.45,81.77,74.28


In [194]:
df.head()

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,COH.F.gamma.r.O1.s.O2,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
0,1,M,57.0,2012.8.30,,,Addictive disorder,Alcohol use disorder,36.0,21.72,...,28.61,mid,old,16.43,16.43,16.43,16.43,16.43,16.43,14.81
1,2,M,37.0,2012.9.6,6.0,120.0,Addictive disorder,Alcohol use disorder,13.43,11.0,...,43.46,high,mid,33.36,33.36,33.36,33.36,33.36,33.36,13.86
2,3,M,32.0,2012.9.10,16.0,113.0,Addictive disorder,Alcohol use disorder,29.94,27.54,...,31.49,mid,mid,28.26,28.26,28.26,28.26,28.26,28.26,11.04
3,4,M,35.0,2012.10.8,18.0,126.0,Addictive disorder,Alcohol use disorder,21.5,21.85,...,41.14,high,mid,18.13,18.13,18.13,18.13,18.13,18.13,8.94
4,5,M,36.0,2012.10.18,16.0,112.0,Addictive disorder,Alcohol use disorder,37.78,33.61,...,70.53,mid,mid,25.83,25.83,25.83,25.83,25.83,25.83,12.66


In [195]:
df_new = df[['no.', 'sex','main.disorder','specific.disorder', 'IQ_bins', 'age_bins',
       'COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN', 'COH_E_MEAN',
       'COH_F_MEAN', 'AB_MEAN']]
agg_cols_new = ['COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN', 'COH_E_MEAN',
       'COH_F_MEAN', 'AB_MEAN']

In [196]:
df_new

Unnamed: 0,no.,sex,main.disorder,specific.disorder,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
0,1,M,Addictive disorder,Alcohol use disorder,mid,old,16.43,16.43,16.43,16.43,16.43,16.43,14.81
1,2,M,Addictive disorder,Alcohol use disorder,high,mid,33.36,33.36,33.36,33.36,33.36,33.36,13.86
2,3,M,Addictive disorder,Alcohol use disorder,mid,mid,28.26,28.26,28.26,28.26,28.26,28.26,11.04
3,4,M,Addictive disorder,Alcohol use disorder,high,mid,18.13,18.13,18.13,18.13,18.13,18.13,8.94
4,5,M,Addictive disorder,Alcohol use disorder,mid,mid,25.83,25.83,25.83,25.83,25.83,25.83,12.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,Healthy control,Healthy control,high,young,42.48,42.48,42.48,42.48,42.48,42.48,15.21
941,942,M,Healthy control,Healthy control,high,mid,81.03,81.03,81.03,81.03,81.03,81.03,7.60
942,943,M,Healthy control,Healthy control,mid,mid,29.12,29.12,29.12,29.12,29.12,29.12,9.17
943,944,M,Healthy control,Healthy control,mid,young,38.21,38.21,38.21,38.21,38.21,38.21,14.96


In [26]:
df_new.groupby(["main.disorder", "specific.disorder", "sex", "age_bins", "IQ_bins"])[agg_cols_new].aggregate('mean')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
main.disorder,specific.disorder,sex,age_bins,IQ_bins,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Addictive disorder,Alcohol use disorder,F,mid,mid,37.878931,37.878931,37.878931,37.878931,37.878931,37.878931,21.936699
Addictive disorder,Alcohol use disorder,F,young,mid,70.139067,70.139067,70.139067,70.139067,70.139067,70.139067,8.3622
Addictive disorder,Alcohol use disorder,M,mid,high,33.559595,33.559595,33.559595,33.559595,33.559595,33.559595,13.414534
Addictive disorder,Alcohol use disorder,M,mid,mid,38.070659,38.070659,38.070659,38.070659,38.070659,38.070659,13.427793
Addictive disorder,Alcohol use disorder,M,young,high,43.225626,43.225626,43.225626,43.225626,43.225626,43.225626,10.939754
Addictive disorder,Alcohol use disorder,M,young,mid,38.402035,38.402035,38.402035,38.402035,38.402035,38.402035,11.650059


In [27]:
df_new.shape

(18, 13)

In [197]:
df_new.columns

Index(['no.', 'sex', 'main.disorder', 'specific.disorder', 'IQ_bins',
       'age_bins', 'COH_A_MEAN', 'COH_B_MEAN', 'COH_C_MEAN', 'COH_D_MEAN',
       'COH_E_MEAN', 'COH_F_MEAN', 'AB_MEAN'],
      dtype='object')

In [198]:
df_new_check

Unnamed: 0,no.,sex,main.disorder,specific.disorder,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
0,1,M,Addictive disorder,Alcohol use disorder,mid,old,16.43,16.43,16.43,16.43,16.43,16.43,14.81
1,2,M,Addictive disorder,Alcohol use disorder,high,mid,33.36,33.36,33.36,33.36,33.36,33.36,13.86
2,3,M,Addictive disorder,Alcohol use disorder,mid,mid,28.26,28.26,28.26,28.26,28.26,28.26,11.04
3,4,M,Addictive disorder,Alcohol use disorder,high,mid,18.13,18.13,18.13,18.13,18.13,18.13,8.94
4,5,M,Addictive disorder,Alcohol use disorder,mid,mid,25.83,25.83,25.83,25.83,25.83,25.83,12.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,Healthy control,Healthy control,high,young,42.48,42.48,42.48,42.48,42.48,42.48,15.21
941,942,M,Healthy control,Healthy control,high,mid,81.03,81.03,81.03,81.03,81.03,81.03,7.60
942,943,M,Healthy control,Healthy control,mid,mid,29.12,29.12,29.12,29.12,29.12,29.12,9.17
943,944,M,Healthy control,Healthy control,mid,young,38.21,38.21,38.21,38.21,38.21,38.21,14.96


In [199]:
df_new_check = df_new.copy()
# df_new_check.drop(columns=['specific.disorder'], axis = 1, inplace = True)

# Separate features and target variable
X = df_new_check.drop('main.disorder', axis=1)  # Features
y = df_new_check['main.disorder']  # Target variable

# Encode categorical features (optional, but recommended)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the logistic regression model
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model performance (e.g., accuracy, precision, recall)
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

Accuracy: 0.48148148148148145
Precision: 0.3738170190744716
Recall: 0.48148148148148145


In [200]:
y.unique()

array(['Addictive disorder', 'Trauma and stress related disorder',
       'Mood disorder', 'Healthy control',
       'Obsessive compulsive disorder', 'Schizophrenia',
       'Anxiety disorder'], dtype=object)

In [201]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier  # Corrected XGBoost import
from sklearn.preprocessing import LabelEncoder

# Separate features and target variable
X = df_new_check.drop('main.disorder', axis=1)  # Features
y = df_new_check['main.disorder']  # Target variable

# Encode categorical features (optional, but recommended)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train models
models = {
    "Logistic Regression": LogisticRegression(solver='lbfgs'),
    "Random Forest": RandomForestClassifier()
}

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = model.predict(X_test)

    # Evaluate the model performance
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(f"\n**{model_name} Results:**")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)



**Logistic Regression Results:**
Accuracy: 0.48148148148148145
Precision: 0.3738170190744716
Recall: 0.48148148148148145

**Random Forest Results:**
Accuracy: 0.9629629629629629
Precision: 0.9687578735197783
Recall: 0.9629629629629629


In [205]:
models['Random Forest']

In [31]:
!pip install lazypredict

Collecting lazypredict
  Obtaining dependency information for lazypredict from https://files.pythonhosted.org/packages/bb/df/936639581c018ac59b1096d34e42d39e967d97bf09b1fdb1645e64852e36/lazypredict-0.2.12-py2.py3-none-any.whl.metadata
  Downloading lazypredict-0.2.12-py2.py3-none-any.whl.metadata (12 kB)
Collecting lightgbm (from lazypredict)
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/e1/4c/4685ccfae9806f561de716e32549190c1f533dde5bcadaf83bdf23972cf0/lightgbm-4.3.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.3.0-py3-none-win_amd64.whl.metadata (19 kB)
Downloading lazypredict-0.2.12-py2.py3-none-any.whl (12 kB)
Downloading lightgbm-4.3.0-py3-none-win_amd64.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.3 MB 991.0 kB/s eta 0:00:02
   ---- ----------------------------------- 0.1/1.3 MB 1.7 MB/s eta 0:00:01
   ---------- -----------------------

In [202]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from lazypredict.Supervised import LazyClassifier

# Load your data into a pandas DataFrame (replace with your actual data)
data = pd.DataFrame({
    # ... your data columns
})

# Separate features and target variable
X = data.drop('main.disorder', axis=1)  # Features
y = data['main.disorder']  # Target variable

# Encode categorical features (optional, but recommended)
le = LabelEncoder()
for col in X.select_dtypes(include=['object']):
    X[col] = le.fit_transform(X[col])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train models using lazypredict
clf = LazyClassifier(verbose=0)  # Set verbose=0 to suppress output during exploration
models = clf.fit(X_train, y_train)

# Evaluate and compare models using lazypredict
print("\n**lazypredict Model Exploration:**")
models.explore_estimates()  # Explore performance estimates of different models
models.estimate()  # Get detailed performance metrics for each model

# Individual model evaluation (optional)
print("\n**Individual Model Evaluation:**")

# Logistic Regression
model_name = "Logistic Regression"
model = LogisticRegression(solver='lbfgs')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f"\n**{model_name} Results:**")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)

# Random Forest
model_name = "Random Forest"
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
print(f"\n**{model_name} Results:**")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


KeyError: "['main.disorder'] not found in axis"

In [203]:
df.drop(columns=[u for u in df.columns if 'AB' in u or u for u in df.columns if 'COH.' in u])

Unnamed: 0,no.,sex,age,eeg.date,education,IQ,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,...,Unnamed: 122,IQ_bins,age_bins,COH_A_MEAN,COH_B_MEAN,COH_C_MEAN,COH_D_MEAN,COH_E_MEAN,COH_F_MEAN,AB_MEAN
0,1,M,57.00,2012.8.30,,,Addictive disorder,Alcohol use disorder,36.00,21.72,...,,mid,old,16.43,16.43,16.43,16.43,16.43,16.43,14.81
1,2,M,37.00,2012.9.6,6.00,120.00,Addictive disorder,Alcohol use disorder,13.43,11.00,...,,high,mid,33.36,33.36,33.36,33.36,33.36,33.36,13.86
2,3,M,32.00,2012.9.10,16.00,113.00,Addictive disorder,Alcohol use disorder,29.94,27.54,...,,mid,mid,28.26,28.26,28.26,28.26,28.26,28.26,11.04
3,4,M,35.00,2012.10.8,18.00,126.00,Addictive disorder,Alcohol use disorder,21.50,21.85,...,,high,mid,18.13,18.13,18.13,18.13,18.13,18.13,8.94
4,5,M,36.00,2012.10.18,16.00,112.00,Addictive disorder,Alcohol use disorder,37.78,33.61,...,,mid,mid,25.83,25.83,25.83,25.83,25.83,25.83,12.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940,941,M,22.00,2014.8.28,13.00,116.00,Healthy control,Healthy control,41.85,36.77,...,,high,young,42.48,42.48,42.48,42.48,42.48,42.48,15.21
941,942,M,26.00,2014.9.19,13.00,118.00,Healthy control,Healthy control,18.99,19.40,...,,high,mid,81.03,81.03,81.03,81.03,81.03,81.03,7.60
942,943,M,26.00,2014.9.27,16.00,113.00,Healthy control,Healthy control,28.78,32.37,...,,mid,mid,29.12,29.12,29.12,29.12,29.12,29.12,9.17
943,944,M,24.00,2014.9.20,13.00,107.00,Healthy control,Healthy control,19.93,25.20,...,,mid,young,38.21,38.21,38.21,38.21,38.21,38.21,14.96
