In [147]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from collections import Counter


In [148]:
data = pd.read_csv("../data/playground-series-s4e11/train.csv")
test_data = pd.read_csv("../data/playground-series-s4e11/test.csv")

In [149]:
model = RandomForestClassifier(n_estimators= 500)

In [150]:
X = data.drop(["Depression", "Name", "id"], axis = 1)
y = data["Depression"]

X_test = test_data.drop(["Name", "id"], axis = 1)

In [127]:
# this actually decrease the performance widely
def concat_features(data:pd.DataFrame) -> pd.DataFrame:
    pres = [[work_pres, academic_pres][is_student]  for (academic_pres, work_pres, is_student) in zip(data["Academic Pressure"], data["Work Pressure"], data["Working Professional or Student"]=="Student")]
    data["Work/Study Presure"] = pres
    data = data.drop(["Academic Pressure", "Work Pressure"], axis=1)
    
    sat = [[work_sat, academic_sat][is_student]  for (work_sat, academic_sat, is_student) in zip(data["Study Satisfaction"], data["Job Satisfaction"], data["Working Professional or Student"]=="Student")]
    data["Work/Study Satisfaction"] = sat
    data = data.drop(["Study Satisfaction", "Job Satisfaction"], axis=1)
    return data



In [151]:
#X = concat_features(X)
#X_test = concat_features(X_test)


In [152]:
valid_cities = [city for (city,count) in Counter(X_test["City"]).items() if count > 10]

def fix_outliers(data: pd.DataFrame) -> pd.DataFrame:
    data["City"] = [city if city in valid_cities else "Unknown" for city in data["City"]]
    return data

X = fix_outliers(X)
X_test = fix_outliers(X_test)


In [153]:
np.unique(X["City"], return_counts=1)

(array(['Agra', 'Ahmedabad', 'Bangalore', 'Bhopal', 'Chennai', 'Delhi',
        'Faridabad', 'Ghaziabad', 'Hyderabad', 'Indore', 'Jaipur',
        'Kalyan', 'Kanpur', 'Kolkata', 'Lucknow', 'Ludhiana', 'Meerut',
        'Mumbai', 'Nagpur', 'Nashik', 'Patna', 'Pune', 'Rajkot',
        'Srinagar', 'Surat', 'Thane', 'Unknown', 'Vadodara', 'Varanasi',
        'Vasai-Virar', 'Visakhapatnam'], dtype=object),
 array([4684, 5613, 4123, 3475, 4044, 3593, 3268, 3620, 4496, 4872, 4328,
        6591, 4398, 5689, 4280, 5226, 5528, 4966, 4209, 3144, 5924, 5210,
        5207, 5074, 4636, 4289,   98, 4568, 4606, 5765, 5176]))

In [97]:
X[X["Working Professional or Student"]!="Student"]["Degree"].unique()

array(['BHM', 'LLB', 'BBA', 'MCA', 'MD', 'B.Pharm', 'ME', 'BSc', 'B.Arch',
       'BCA', 'BE', 'MA', 'B.Ed', 'B.Com', 'MBA', 'M.Com', 'MHM',
       'Class 12', 'BA', 'M.Ed', 'PhD', 'MSc', 'M.Tech', 'B.Tech', 'LLM',
       'MBBS', 'M.Pharm', 'UX/UI Designer', 'Nalini', 'BEd', 'Veda',
       'Degree', 'H_Pharm', 'B.Sc', 'Business Analyst', 'M.Arch',
       'LL.Com', 'Data Scientist', 'MPharm', 'L.Ed', 'P.Pharm', 'Kalyan',
       'HR Manager', 'S.Pharm', 'LLBA', 'Vrinda', 'M. Business Analyst',
       'Bhavesh', 'LLCom', '29', 'MTech', 'Plumber', '5.61', 'B.03',
       'Ritik', '5.56', 'M', 'MEd', 'B BA', 'BArch', 'Bhopal', 'P.Com',
       'B.B.Arch', 'ACA', 'M_Tech', 'Pihu', 'BB', 'Jhanvi', 'LLTech',
       'Aarav', 'Entrepreneur', 'Lata', 'S.Arch', 'HCA', '5.88',
       'LL B.Ed', 'M.S', 'Navya', 'Mahika', nan, 'K.Ed', 'B.3.79',
       'Working Professional', 'LLS', 'Doctor', 'N.Pharm', 'B B.Com',
       'BPharm', 'RCA', 'Mihir', 'Advait'], dtype=object)

In [98]:
X[X["Working Professional or Student"]=="Student"]["Degree"].unique()

array(['B.Pharm', 'BSc', 'BA', 'BCA', 'M.Tech', 'PhD', 'Class 12', 'B.Ed',
       'LLB', 'BE', 'M.Ed', 'MSc', 'BHM', 'M.Pharm', 'MCA', 'MA', 'B.Com',
       'MD', 'MBA', 'MBBS', 'M.Com', 'B.Arch', 'LLM', 'B.Tech', 'BBA',
       'ME', 'MHM', 'MPA', 'BH', 'B.Sc', 'Bhopal', 'S.Tech', '20',
       'Class 11', 'M', 'P.Com', 'BPharm', 'Unite', 'BArch', 'Badhya',
       '0', 'Vivaan', 'BPA', 'Brit', 'B', '7.06', 'Brithika', 'CGPA',
       '24', 'Jhanvi', '8.56', 'LHM', 'Marsh', 'L.Ed', 'B.Student',
       'LL B.Ed', 'Mthanya', 'Esha', 'LLEd', 'E.Tech', 'LCA'],
      dtype=object)

In [154]:
X_hat = pd.DataFrame()
for col in X:
    if X[col].nunique()==2:
        X_hat[col] = X[col] == X[col].unique()[0]
    elif X[col].dtype=="object":
        X_hat = pd.concat([X_hat, pd.get_dummies(X[col])], axis=1)
    else:
        X_hat[col]= X[col]


In [155]:
X_hat_test = pd.DataFrame()
for col in X_test:
    if X_test[col].nunique()==2:
        X_hat_test[col] = X_test[col] == X_test[col].unique()[0]
    elif X_test[col].dtype=="object":
        X_hat_test = pd.concat([X_hat_test, pd.get_dummies(X_test[col])], axis=1)
    else:
        X_hat_test[col]= X_test[col]


In [156]:
feature_in_common = list(set(X_hat.columns) & set(X_hat_test.columns))
X_hat = X_hat[feature_in_common]
X_hat_test = X_hat_test[feature_in_common]

X_hat = X_hat.loc[:, ~X_hat.columns.duplicated()]
X_hat_test = X_hat_test.loc[:, ~X_hat_test.columns.duplicated()]


print(len(feature_in_common))
print(len(X_hat.columns))
print(len(X_hat_test.columns))

161
161
161


In [None]:
model.fit(X_hat, y)

In [121]:
y_hat = model.predict(X_hat)

In [122]:
sklearn.metrics.accuracy_score(y_hat, y)

0.999864960909737

In [123]:
pred = model.predict(X_hat_test)

In [124]:
submission = pd.DataFrame({"Depression":pred}, index = test_data["id"])

In [125]:
submission

Unnamed: 0_level_0,Depression
id,Unnamed: 1_level_1
140700,0
140701,0
140702,0
140703,1
140704,0
...,...
234495,0
234496,0
234497,0
234498,1


In [126]:
submission.to_csv("sub.csv", index_label="id")