In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,GridSearchCV, RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from itertools import product

df = pd.read_csv('data.csv',sep='\t')
df.info()

In [None]:
DASS_keys = {'Depression': [3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
             'Anxiety': [2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
             'Stress': [1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39]}

df['wrongansw'] = 0
df['wrongansw'] = df['wrongansw'].where(df['VCL6']== 0, df['wrongansw'] + 1)
df['wrongansw'] = df['wrongansw'].where(df['VCL9']== 0, df['wrongansw'] + 1)
df['wrongansw'] = df['wrongansw'].where(df['VCL12']== 0, df['wrongansw'] + 1)

df.wrongansw.value_counts()

In [None]:
df = df[df['wrongansw'].isin([2, 3])]
df = df.drop(columns='wrongansw')
df.head(3)
df.shape

In [None]:
vcls = []
for i in range(1, 17):
    vcls.append('VCL' + str(i))
    
df = df.drop(columns=vcls)
df.shape

In [None]:
depr = []
for i in DASS_keys["Depression"]:
    depr.append('Q' + str(i) + 'A')
    
anx = []
for i in DASS_keys["Anxiety"]:
    anx.append('Q' + str(i) + 'A')

stre = []
for i in DASS_keys["Stress"]:
    stre.append('Q' + str(i) + 'A')

df_depr=df.filter(depr)
df_anx=df.filter(anx)
df_stre=df.filter(stre)

In [None]:
categorical = df.select_dtypes('object').columns
print('Categorical Columns: ', df[categorical].columns)
print(df[categorical].nunique())
# remove 'major' column
df = df.drop(['major'], axis=1)

In [None]:
df[depr] -= 1 
df.head(3)

In [None]:
def scores(df):
    df["ScoresD"] = df[depr].sum(axis=1)
    return df

In [None]:
df = scores(df)
df.head(5)

In [None]:
df[anx] -= 1 
df.head(3)

In [None]:
def scores(df):
    df["ScoresA"] = df[anx].sum(axis=1)
    return df

In [None]:
df = scores(df)
df.head(3)

In [None]:
df[stre] -= 1 
df.head(3)

In [None]:
def scores(df):
    df["ScoresS"] = df[stre].sum(axis=1)
    return df

In [None]:
df = scores(df)
df.head(3)

In [None]:
Category=[]
for i in df['ScoresA']:
    if i<=7:
        Category.append('0')
    elif i<=9:
        Category.append('1')
    elif i<=14 :
        Category.append('2')
    elif i<=19:
        Category.append('3')
    else:
        Category.append('4')
df['CATEGORY']= Category

In [None]:
df.isnull().sum()
df.duplicated().sum()

## Anxiety Model

In [None]:
y = df['CATEGORY']
X = df.drop(columns=['CATEGORY','country', 'ScoresD' ,'ScoresS','Q1A' ,'Q1I', 'Q1E', 'Q2I', 'Q2E', 'Q3A', 'Q3I', 'Q3E','Q4I','Q4E', 'Q5E', 'Q5A' ,'Q5I', 'Q6E', 'Q6A' ,'Q6I', 'Q7E' ,'Q7I', 'Q8E','Q8A' ,'Q8I', 'Q9E', 'Q9I', 'Q10E', 'Q10A', 'Q10I', 'Q11E', 'Q11A' ,'Q11I', 'Q12E', 'Q12A' ,'Q12I', 'Q13E', 'Q13A' ,'Q13I', 'Q14E', 'Q14A' ,'Q14I', 'Q15E', 'Q15I', 'Q16E', 'Q16A' ,'Q16I', 'Q17E', 'Q17A' ,'Q17I', 'Q18E', 'Q18A' ,'Q18I', 'Q19I', 'Q19E',  'Q20I', 'Q20E', 'Q21A', 'Q21I', 'Q21E', 'Q22A', 'Q22I','Q22E', 'Q23I','Q23E','Q24A','Q24I','Q24E', 'Q25I', 'Q25E', 'Q26A', 'Q26I', 'Q26E', 'Q27A', 'Q27I', 'Q27E', 'Q28I', 'Q28E', 'Q29A', 'Q29I', 'Q29E',  'Q30I', 'Q30E', 'Q31A', 'Q31I', 'Q31E', 'Q32A', 'Q32I', 'Q32E', 'Q33A', 'Q33I', 'Q33E', 'Q34A', 'Q34I', 'Q34E', 'Q35A', 'Q35I', 'Q35E', 'Q36I', 'Q36E', 'Q37A', 'Q37I', 'Q37E', 'Q38A', 'Q38I', 'Q38E', 'Q39A', 'Q39I', 'Q39E', 'Q40I', 'Q40E', 'Q41I', 'Q41E', 'Q42A', 'Q42I', 'Q42E'])
X.head()

In [None]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
print(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y,random_state=100)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
model = RandomForestClassifier(criterion='entropy',
                               max_depth=9,
                               n_estimators=100,
                               random_state=0)
model.fit(X_train, y_train)

In [None]:
score = model.score(X_test, y_test)
print(f"Accuracy: {score}")

## Depression Model

In [None]:
Category1=[]
for i in df['ScoresD']:
    if i<=9:
        Category1.append('0')
    elif i<=13:
        Category1.append('1')
    elif i<=20 :
        Category1.append('2')
    elif i<=27:
        Category1.append('3')
    else:
        Category1.append('4')
df['CATEGORY1']= Category1

In [None]:
df.isnull().sum()
df.duplicated().sum()

In [None]:
y1 = df['CATEGORY1']
X1 = df.drop(columns=['CATEGORY','CATEGORY1','country', 'ScoresA' ,'ScoresS','Q1A' ,'Q1I', 'Q1E', 'Q2A','Q2I', 'Q2E', 'Q3I', 'Q3E','Q4A','Q4I','Q4E', 'Q5E', 'Q5I', 'Q6E', 'Q6A' ,'Q6I', 'Q7A','Q7E' ,'Q7I', 'Q8E','Q8A' ,'Q8I', 'Q9A','Q9E', 'Q9I', 'Q10E', 'Q10I', 'Q11E', 'Q11A' ,'Q11I', 'Q12E', 'Q12A' ,'Q12I', 'Q13E', 'Q13I', 'Q14E', 'Q14A' ,'Q14I', 'Q15E', 'Q15A','Q15I', 'Q16E', 'Q16I', 'Q17E', 'Q17I', 'Q18E', 'Q18A' ,'Q18I', 'Q19A','Q19I', 'Q19E',  'Q20A','Q20I', 'Q20E', 'Q21I', 'Q21E', 'Q22A', 'Q22I','Q22E', 'Q23A','Q23I','Q23E','Q24I','Q24E', 'Q25A','Q25I', 'Q25E', 'Q26I', 'Q26E', 'Q27A', 'Q27I', 'Q27E', 'Q28A','Q28I', 'Q28E', 'Q29A', 'Q29I', 'Q29E', 'Q30A','Q30I', 'Q30E', 'Q31I', 'Q31E', 'Q32A', 'Q32I', 'Q32E', 'Q33A', 'Q33I', 'Q33E', 'Q34I', 'Q34E', 'Q35A', 'Q35I', 'Q35E', 'Q36A','Q36I', 'Q36E', 'Q37I', 'Q37E', 'Q38I', 'Q38E', 'Q39A', 'Q39I', 'Q39E', 'Q40A','Q40I', 'Q40E', 'Q41I', 'Q41E', 'Q41A', 'Q42I', 'Q42E'])
X1.head()

In [None]:
scaler = StandardScaler()
scaler.fit(X1)
X1 = scaler.transform(X1)
print(X1)

In [None]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.15, stratify=y,random_state=100)
print(X_train1.shape)
print(X_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

In [None]:
X_train1

In [32]:
model1 = RandomForestClassifier(criterion='entropy',
                               max_depth=10,
                               n_estimators=100,
                               random_state=100)
model1.fit(X_train1, y_train1)

RandomForestClassifier(criterion='entropy', max_depth=10, random_state=100)

In [33]:
score1 = model1.score(X_test1, y_test1)
print(f"Accuracy: {score1}")

Accuracy: 1.0


## Stress Model

In [34]:
Category2=[]
for i in df['ScoresS']:
    if i<=14:
        Category2.append('0')
    elif i<=18:
        Category2.append('1')
    elif i<=25 :
        Category2.append('2')
    elif i<=33:
        Category2.append('3')
    else:
        Category2.append('4')
df['CATEGORY2']= Category2

In [35]:
df.isnull().sum()
df.duplicated().sum()

0

In [36]:
y2 = df['CATEGORY2']
X2 = df.drop(columns=['CATEGORY','CATEGORY1','CATEGORY2','country', 'ScoresA' ,'ScoresD','Q1I', 'Q1E', 'Q2A','Q2I', 'Q2E', 'Q3A','Q3I', 'Q3E','Q4A','Q4I','Q4E', 'Q5A','Q5E', 'Q5I', 'Q6E','Q6I', 'Q7A','Q7E' ,'Q7I', 'Q8E','Q8I', 'Q9A','Q9E', 'Q9I', 'Q10A','Q10E', 'Q10I', 'Q11E','Q11I', 'Q12E','Q12I', 'Q13A','Q13E', 'Q13I', 'Q14E','Q14I', 'Q15E', 'Q15A','Q15I', 'Q16A','Q16E', 'Q16I', 'Q17A','Q17E', 'Q17I', 'Q18E', 'Q18I', 'Q19A','Q19I', 'Q19E',  'Q20A','Q20I', 'Q20E', 'Q21A','Q21I', 'Q21E', 'Q22I','Q22E', 'Q23A','Q23I','Q23E','Q24A','Q24I','Q24E', 'Q25A','Q25I', 'Q25E', 'Q26A','Q26I', 'Q26E', 'Q27I', 'Q27E', 'Q28A','Q28I', 'Q28E', 'Q29I', 'Q29E', 'Q30A','Q30I', 'Q30E', 'Q31A','Q31I', 'Q31E', 'Q32I', 'Q32E', 'Q33I', 'Q33E', 'Q34A','Q34I', 'Q34E', 'Q35I', 'Q35E', 'Q36A','Q36I', 'Q36E', 'Q37A','Q37I', 'Q37E', 'Q38A','Q38I', 'Q38E', 'Q39A', 'Q39E', 'Q40A','Q40I', 'Q40E', 'Q41I', 'Q41E', 'Q41A', 'Q42A','Q42I', 'Q42E'])
X2.head()

Unnamed: 0,Q1A,Q6A,Q8A,Q11A,Q12A,Q14A,Q18A,Q22A,Q27A,Q29A,...,screensize,uniquenetworklocation,hand,religion,orientation,race,voted,married,familysize,ScoresS
26,3,3,2,3,3,3,3,3,3,3,...,1,1,1,7,1,60,2,1,3,41
31,1,1,0,1,1,1,0,0,0,2,...,1,1,1,2,1,60,2,1,3,7
57,1,2,1,1,2,2,1,1,2,1,...,2,1,1,1,1,60,1,1,2,20
81,3,3,3,2,3,1,3,3,3,3,...,2,1,1,4,1,70,2,1,6,31
87,2,3,3,3,2,3,3,3,3,3,...,1,1,1,1,1,30,2,1,3,39


In [37]:
scaler = StandardScaler()
scaler.fit(X2)
X2 = scaler.transform(X2)
print(X2)

[[ 1.25283128  1.2144967   0.24179779 ... -0.41158476 -0.04026184
   1.53226737]
 [-0.57235947 -0.55372424 -1.57842427 ... -0.41158476 -0.04026184
  -1.35921687]
 [-0.57235947  0.33038623 -0.66831324 ... -0.41158476 -0.41603906
  -0.25364937]
 ...
 [-1.48495484 -1.43783471 -1.57842427 ... -2.34805733  1.46284701
  -1.69939149]
 [ 1.25283128  1.2144967   0.24179779 ...  3.46136038 -0.04026184
   1.53226737]
 [ 0.34023591  1.2144967   1.15190882 ... -0.41158476  0.71129258
   0.76687448]]


In [38]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.15, stratify=y,random_state=100)
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(975, 43)
(173, 43)
(975,)
(173,)


In [41]:
model2 = RandomForestClassifier(criterion='entropy',
                               max_depth=10,
                               n_estimators=100,
                               random_state=21)
model2.fit(X_train2, y_train2)

RandomForestClassifier(criterion='entropy', max_depth=10, random_state=21)

In [42]:
score2 = model2.score(X_test2, y_test2)
print(f"Accuracy: {score2}")

Accuracy: 0.9826589595375722
