In [101]:
import kaggle
import pandas as pd
import numpy as np
import zipfile
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics 
from sklearn.neural_network import MLPClassifier

from sklearn.neighbors import KDTree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
# Default figure size (width x height) applied to every seaborn/matplotlib plot below.
width, height = 20, 8
sns.set(rc={'figure.figsize': (width, height)})

In [102]:
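# One-time data download: uncomment to fetch the competition archive with the
# Kaggle CLI and unpack it into the current working directory.
# !kaggle competitions download -c playground-series-s4e6
# with zipfile.ZipFile('playground-series-s4e6.zip', 'r') as zip_ref:
#     zip_ref.extractall()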

In [103]:
# Load the raw training data; 'train.csv' is resolved relative to the working directory.
df_train = pd.read_csv('train.csv')

def clean_data(df):
    """Return a copy of ``df`` extended with indicator (dummy) columns.

    The following columns are appended, in this order:

    * ``Target_<value>`` -- one-hot encoding of ``Target``. Skipped when the
      frame has no ``Target`` column (e.g. unlabeled data).
    * ``First_choice`` -- 1 where ``Application order`` <= 1.
    * ``Single`` -- 1 where ``Marital status`` == 1.
    * ``Application_mode_*`` -- flags for selected ``Application mode`` codes.
    * ``Course_<value>`` -- one-hot encoding of ``Course``.
    * ``Quali_*`` -- flags for selected ``Previous qualification`` codes.
    * ``M_Quali_*`` / ``F_Quali_*`` -- flags for selected codes of
      ``Mother's qualification`` / ``Father's qualification``.

    Hand-built flags are 0/1 integers; the one-hot columns keep whatever dtype
    ``pd.get_dummies`` produces.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the source columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the indicator
        columns. The input frame is left unmodified.

    Raises
    ------
    KeyError
        If a required source column (anything other than ``Target``) is missing.
    """

    def flag(frame, column, *codes):
        """Return a 0/1 integer Series marking rows whose ``column`` is one of ``codes``."""
        return frame[column].isin(codes).astype(int)

    # Work on a copy so the column assignments below never touch the caller's frame.
    df = df.copy()

    # Dummy for Target_Graduate, Target_Dropout and Target_Enrolled.
    # Unlabeled data has no 'Target' column, so this step is optional.
    if 'Target' in df.columns:
        df = df.join(pd.get_dummies(df['Target'], prefix='Target'))

    # Dummy for first choice
    df['First_choice'] = (df['Application order'] <= 1).astype(int)

    # Dummy for Marital Status Single
    df['Single'] = flag(df, 'Marital status', 1)

    # Dummy for different application modes
    application_modes = {
        'Application_mode_1st_phase': 1,
        'Application_mode_2nd_phase': 17,
        'Application_mode_Over_23 years_old': 39,
        'Application_mode_Tech_Spec': 44,
        'Application_mode_Change': 43,
    }
    for name, code in application_modes.items():
        df[name] = flag(df, 'Application mode', code)

    # Dummy for different courses
    df = df.join(pd.get_dummies(df['Course'], prefix='Course'))

    # Dummy for different qualifications.
    # 'Quali_Higher education' covers both codes 3 and 40: previously the
    # column was assigned twice and the second assignment (code 40) silently
    # discarded the flag for code 3.
    previous_qualifications = {
        'Quali_Secondary education': (1,),
        'Quali_Basic education': (19,),
        'Quali_Tech Spec course': (39,),
        'Quali_Higher education': (3, 40),
        'Quali_Other': (12,),
        'Quali_12th year of schooling': (9,),
        'Quali_Professional higher technical course': (42,),
    }
    for name, codes in previous_qualifications.items():
        df[name] = flag(df, 'Previous qualification', *codes)

    # Dummies for Mother's and Father's qualification (same codes for both parents)
    parent_qualifications = {
        'Secondary education': 1,
        'Basic education_3rd': 19,
        'Basic education_1st': 37,
        'Basic education_2nd': 38,
        'Higher Education': 3,
    }
    parents = (('M', "Mother's qualification"), ('F', "Father's qualification"))
    for prefix, column in parents:
        for label, code in parent_qualifications.items():
            df[f'{prefix}_Quali_{label}'] = flag(df, column, code)

    return df

# Build the feature-engineered frame from the raw training data.
df = df_train.pipe(clean_data)

# Persist the result; to_csv also writes the index as the first (unnamed) column by default.
df.to_csv('df_clean.csv')