In [18]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from utils.pre_processing import PreProcessing
from utils.pre_processor.outlier_remover import OutlierRemover

In [19]:
# Read the training split (which carries the `Depression` target) and the
# test split of the playground-series-s4e11 data.
train_csv = "../data/playground-series-s4e11/train.csv"
test_csv = "../data/playground-series-s4e11/test.csv"
data, test_data = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [20]:
# Project-local preprocessing helper constructed from the raw training frame.
# NOTE(review): in the cells shown, every call on this object is commented out
# and the name `processor` is later rebound to an OutlierRemover, so this
# instance looks unused — confirm the constructor has no side effects on
# `data` before removing it.
processor = PreProcessing(data)

In [21]:
# Initial 200-tree random forest.
# NOTE(review): `model` is reassigned to a 50-tree forest before the
# cross-validation step, so in the cells shown this instance is never fitted.
model = RandomForestClassifier(n_estimators= 200)

In [22]:
# Features: every training column except the target and the Name / id columns.
X = data.drop(columns=["Depression", "Name", "id"])
# Target column.
y = data["Depression"]

# Test features: only the Name / id columns are removed here.
X_test = test_data.drop(columns=["Name", "id"])

In [23]:
# Show the feature names left after dropping the target, Name and id columns.
X.columns

Index(['Gender', 'Age', 'City', 'Working Professional or Student',
       'Profession', 'Academic Pressure', 'Work Pressure', 'CGPA',
       'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
       'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?',
       'Work/Study Hours', 'Financial Stress',
       'Family History of Mental Illness'],
      dtype='object')

In [24]:
#X = processor.concat_features(X)
#X_test = processor.concat_features(X_test)


In [25]:
#X = processor.fix_outliers(X)
#X_test = processor.fix_outliers(X_test)

# Fit the project-local OutlierRemover on the training features only, then
# apply the already-fitted transformer to the test features.
# NOTE(review): this rebinds `processor`, which previously held a
# PreProcessing instance. What OutlierRemover does to the values is not
# visible from this notebook — check utils/pre_processor/outlier_remover.
processor = OutlierRemover()
X = processor.fit_transform(X)
X_test = processor.transform(X_test)

In [26]:
# Inspect the test features after the outlier-removal step.
X_test

Unnamed: 0,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,Male,53.0,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes
1,Female,58.0,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No
2,Male,53.0,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No
3,Female,23.0,Rajkot,Student,Unknown,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No
4,Male,47.0,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93795,Female,49.0,Jaipur,Working Professional,Pilot,,3.0,,,5.0,Less than 5 hours,Moderate,BSc,Yes,2.0,2.0,Yes
93796,Male,29.0,Ahmedabad,Working Professional,Pilot,,5.0,,,1.0,7-8 hours,Moderate,BE,Yes,11.0,3.0,Yes
93797,Male,24.0,Visakhapatnam,Student,Unknown,1.0,,7.51,4.0,,7-8 hours,Moderate,B.Tech,No,7.0,1.0,No
93798,Female,23.0,Kalyan,Working Professional,Marketing Manager,,4.0,,,2.0,5-6 hours,Healthy,BA,Yes,7.0,5.0,Yes


In [27]:
# Count the training rows whose sleep duration is the literal "Unknown".
# The comparison and the sum are vectorised by pandas rather than iterating
# the Series element by element with the builtin sum(); int() keeps the
# displayed result a plain Python integer.
int(X["Sleep Duration"].eq("Unknown").sum())

67

In [28]:
# Distinct values of the family-history column and how often each occurs,
# returned as (values, counts).
family_history = X["Family History of Mental Illness"].astype(str)
np.unique(family_history, return_counts=True)

(array(['No', 'Yes'], dtype=object), array([70758, 69942]))

In [29]:
# Encode the training features into a numeric/boolean frame:
#   * columns with exactly two distinct non-null values -> one boolean column
#   * remaining object (string) columns                 -> one-hot dummies
#   * everything else                                   -> copied unchanged
# The encoded pieces are collected and concatenated once, instead of growing
# the frame column by column / concat by concat inside the loop.
train_parts = []
for col in X:
    if X[col].nunique() == 2:
        # nunique() ignores NaN but unique() does not, so drop NaN before
        # picking the value that will be encoded as True.
        positive_value = X[col].dropna().unique()[0]
        train_parts.append(X[col] == positive_value)
    elif X[col].dtype == "object":
        # NOTE: dummy columns are named after the category only, so identical
        # category labels in different features produce duplicate column names.
        train_parts.append(pd.get_dummies(X[col]))
    else:
        train_parts.append(X[col])
X_hat = pd.concat(train_parts, axis=1) if train_parts else pd.DataFrame()


In [30]:
# Encode the test features with the same scheme as the training features.
# Binary columns are detected on, and compared against, the TRAINING column:
# choosing "the first value seen" separately in each split depends on row
# order, so the same category could be encoded as True in one split and as
# False in the other.
test_parts = []
for col in X_test:
    # Fall back to the test column itself if the training frame lacks it.
    reference = X[col] if col in X.columns else X_test[col]
    if reference.nunique() == 2:
        # First non-null value of the reference column is the "True" class.
        positive_value = reference.dropna().unique()[0]
        test_parts.append(X_test[col] == positive_value)
    elif X_test[col].dtype == "object":
        # NOTE: dummy columns are named after the category only, so identical
        # category labels in different features produce duplicate column names.
        test_parts.append(pd.get_dummies(X_test[col]))
    else:
        test_parts.append(X_test[col])
X_hat_test = pd.concat(test_parts, axis=1) if test_parts else pd.DataFrame()


In [31]:
# Compare the encoded column names of the two splits.
train_columns = set(X_hat.columns)
test_columns = set(X_hat_test.columns)
feature_in_common = list(train_columns & test_columns)
# Column names that exist only in the training frame.
unique_feat = train_columns - test_columns
unique_feat

set()

In [32]:
# Flag training columns whose name already appeared earlier in the frame
# (True = repeated column name).
X_hat.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False])

In [33]:
# Flag test columns whose name already appeared earlier in the frame
# (True = repeated column name).
X_hat_test.columns.duplicated()

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False])

In [34]:
print(len(X_hat.columns))
print(len(X_hat_test.columns))

# Resolve repeated column names first by keeping only the first column that
# carries each name.
X_hat = X_hat.loc[:, ~X_hat.columns.duplicated()]
X_hat_test = X_hat_test.loc[:, ~X_hat_test.columns.duplicated()]

# Keep the columns present in both splits, in training-frame order. Building
# the list from a set intersection would give an order that changes between
# kernel restarts (string hashing is randomised), which makes the feature
# matrix, and therefore the fitted forest, non-reproducible.
test_columns = set(X_hat_test.columns)
feature_in_common = [col for col in X_hat.columns if col in test_columns]
X_hat = X_hat[feature_in_common]
X_hat_test = X_hat_test[feature_in_common]


print(len(feature_in_common))
print(len(X_hat.columns))
print(len(X_hat_test.columns))

117
117
113
113
113


In [None]:
# Fixed seed so the forest, and therefore the cross-validation scores and the
# final predictions, are reproducible between runs.
SEED = 42
model = RandomForestClassifier(n_estimators=50, random_state=SEED)
# 5-fold cross-validation on the encoded training features; cross_val_score
# works on clones, so `model` itself is still unfitted afterwards.
scores = cross_val_score(model, X_hat, y, cv=5)


In [37]:
# Per-fold cross-validation scores.
scores

array([0.93614072, 0.93685146, 0.93642502, 0.93610519, 0.93880597])

In [65]:
# Fit the forest on the full encoded training set.
model.fit(X_hat, y)

In [66]:
# Predict on the same rows the model was fitted on (in-sample predictions).
y_hat = model.predict(X_hat)

In [67]:
# Accuracy on the training rows themselves. Because the model was fitted on
# exactly this data, the value is an in-sample figure, not an estimate of
# performance on unseen data.
sklearn.metrics.accuracy_score(y_true=y, y_pred=y_hat)

1.0

In [68]:
# Predict the target for the encoded test features.
pred = model.predict(X_hat_test)

In [69]:
# One row per test id, holding the predicted Depression value.
submission = pd.Series(pred, index=test_data["id"], name="Depression").to_frame()

In [70]:
# Inspect the submission frame before writing it out.
submission

Unnamed: 0_level_0,Depression
id,Unnamed: 1_level_1
140700,0
140701,0
140702,0
140703,1
140704,0
...,...
234495,0
234496,1
234497,0
234498,1


In [71]:
# Write the predictions to sub.csv (relative to the working directory), with
# the index column labelled "id".
submission.to_csv("sub.csv", index_label="id")