In [None]:
import pandas as pd 
pd.set_option('display.max_columns', None)

In [None]:
# Directory prefixes. Each keeps its trailing slash because file names are
# appended by plain string formatting, e.g. f"{INPUT_DIR}train.csv".
INPUT_DIR = '../input/'
OUTPUT_DIR = '../output/'
IMAGE_DIR = '../image/'  # not referenced by any of the visible cells
# Column names used when assembling the submission file.
SUBMISSION_ID_COLUMN = 'PassengerId'
SUBMISSION_OUTPUT_COLUMN = 'Transported'

In [None]:
# Load the training data from INPUT_DIR; the bare `df` on the last line
# renders the frame as the cell output.
df = pd.read_csv(f"{INPUT_DIR}train.csv")
df

### EDA

In [None]:
# Lowercase the column names in place so the following cells can refer to
# them as e.g. df['transported']. cleanup() lowercases them as well.
df.columns = df.columns.str.lower()
df.head()

In [None]:
df.shape

In [None]:
# check for null
df.isna().sum()

In [None]:
# divide cabin into three columns
# df['cabin_1'] = df['cabin'].astype(str).str.split('/')[0]
# df['cabin_2'] = df['cabin'].astype(str).str.split('/')[1]
# df['cabin_3'] = df['cabin'].astype(str).str.split('/')[2]
# df
# cabin_df = df.loc[df['cabin'].notna()]
# df[['cabin_1', 'cabin_2', 'cabin_3']] = df['cabin'].astype(str).str.split('/', n=2, expand=True)
# cabin_df['cabin_2'] = cabin_df['cabin'].astype(str).str.split('/')[1]
# cabin_df['cabin_3'] = cabin_df['cabin'].astype(str).str.split('/')[2]
# df['cabin_1'].value_counts() # no null
# df['cabin_2'].loc[df['cabin_2'].notna()].astype(int).mean() # 109 null, set to mean() --> 600
# df['cabin_3'].isna().sum() # 199, all S or P, set to na
# df['cabin_1'].isna().sum() # 0 
# df['age'].value_counts()
# df['age'].isna().sum() # 182 null, set to na
# df['age'][df['age'].notna()].median() # 27.0
# df['vip'].isna().sum() # 203
# df['vip'].value_counts()
# df['vip'] = df['vip'].astype(bool)
# values = {'vip': pd.NA}
# df.fillna(value=values, inplace=True)
# df['vip'].isna().sum()
# df['roomservice'].value_counts()
# # groupby syntax: df.groupby(['col1','col2']).size(), df.groupby(['Name', 'Fruit'])['Number'].sum() 
# df.groupby(['roomservice'])['roomservice'].count() # works! 0.0 is highest so that can be value for fillna()
# df.groupby(['foodcourt'])['foodcourt'].count() # 0.0 highest use as fillna()
# df.groupby(['shoppingmall'])['shoppingmall'].count() # 0.0 highest use as fillna()
# df.groupby(['spa'])['spa'].count() # 0.0 highest use as fillna()
# df.groupby(['vrdeck'])['vrdeck'].count()  # 0.0 highest use as fillna()

In [None]:
df.dtypes

In [None]:
df['transported'].value_counts() # per the author's note: True and False occur in roughly even numbers

### Helper methods for cleanup and encoding

In [None]:
def cleanup(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, model-ready copy of a raw passenger frame.

    Steps, in order:

    1. Lowercase the column names.
    2. Drop the ``passengerid`` and ``name`` columns (if present).
    3. Split ``cabin`` on ``/`` into ``cabin_1`` (text), ``cabin_2``
       (integer, ``-1`` when missing) and ``cabin_3`` (text), then drop
       ``cabin``.
    4. Convert ``cryosleep`` and ``vip`` to plain ``bool``; missing entries
       become ``False``.
    5. Fill remaining gaps: ``'na'`` for text columns, the column median for
       ``age`` and ``0.0`` for the five spending columns.
    6. Convert ``homeplanet``, ``destination``, ``cabin_1`` and ``cabin_3``
       to ``category`` dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; column names may be in any letter case. It is not
        modified -- all work happens on a copy.

    Returns
    -------
    pd.DataFrame
        The cleaned copy.

    Raises
    ------
    KeyError
        If a column this function reads (``cabin``, ``cryosleep``, ``vip``,
        ``age`` or one of the category columns) is absent.
    """
    # Work on a copy so the caller's frame (and anything already displayed
    # from it) stays untouched.
    df = df.copy()

    # convert columns to lowercase for convenience
    df.columns = df.columns.str.lower()

    # drop columns not required
    df = df.drop(columns=["passengerid", "name"], errors="ignore")

    # Split cabin into three columns. The column is NOT passed through
    # astype(str) first: that would turn a missing cabin into the literal
    # text 'nan' and create a bogus 'nan' category in cabin_1. reindex()
    # guarantees three parts even if every cabin is missing or short.
    cabin_parts = (
        df['cabin']
        .astype(object)
        .str.split('/', n=2, expand=True)
        .reindex(columns=range(3))
    )
    df['cabin_1'] = cabin_parts[0]
    # The middle part is the numeric one; anything unparseable becomes NaN
    # here and is filled with -1 below.
    df['cabin_2'] = pd.to_numeric(cabin_parts[1], errors='coerce')
    df['cabin_3'] = cabin_parts[2]
    df = df.drop(columns='cabin')

    # Convert the flag columns to real booleans. Going through the nullable
    # 'boolean' dtype keeps missing entries missing long enough to fill them;
    # a direct astype(bool) would silently map NaN to True.
    # NOTE(review): False is assumed to be the right default for a missing
    # flag -- confirm against the value counts.
    for flag_col in ('cryosleep', 'vip'):
        df[flag_col] = df[flag_col].astype('boolean').fillna(False).astype(bool)

    # fill null values
    fill_values = {
        'homeplanet': 'na',
        'cabin_1': 'na',
        'cabin_2': -1,
        'cabin_3': 'na',
        'destination': 'na',
        'age': df['age'].median(),  # median() already skips missing values
        'roomservice': 0.0,
        'foodcourt': 0.0,
        'shoppingmall': 0.0,
        'spa': 0.0,
        'vrdeck': 0.0,
    }
    df = df.fillna(value=fill_values)
    df['cabin_2'] = df['cabin_2'].astype('int64')

    # convert objects to category
    cols_to_convert = ['homeplanet', 'destination', 'cabin_1', 'cabin_3']
    df = df.astype({col: 'category' for col in cols_to_convert})

    return df

# Clean the training frame, then list the per-column null counts of the result.
df = cleanup(df)
df.isna().sum()

In [None]:
from pandas import get_dummies

def one_hot_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode the four categorical feature columns.

    ``homeplanet``, ``destination``, ``cabin_1`` and ``cabin_3`` are each
    replaced by indicator columns named ``<column>_<value>``; every other
    column is passed through unchanged.
    """
    categorical_cols = ['homeplanet', 'destination', 'cabin_1', 'cabin_3']
    # Each column's own name doubles as the prefix of its indicator columns.
    encoded = pd.get_dummies(df, columns=categorical_cols, prefix=categorical_cols)
    return encoded
df = one_hot_encoding(df)
df

In [None]:
def write_output(df: pd.DataFrame, info: str) -> None:
    """Write ``df`` to ``<OUTPUT_DIR><info>.csv`` without the index column.

    The directory part of the target path is created first if it does not
    exist, so a missing output folder does not make the write fail.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to save.
    info : str
        File name without extension; appended directly to ``OUTPUT_DIR``.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target = Path(f"{OUTPUT_DIR}{info}.csv")
    target.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(target, index=False)

### Train and evaluate

In [None]:
from random import randint
from sklearn.model_selection import train_test_split

# Separate the features from the target column.
X = df.drop(columns=['transported'])
y = df['transported']

# Fixed seed: a randomly drawn random_state would give a different 90/10
# split on every run, so scores could not be compared between runs or
# reproduced after Restart Kernel -> Run All.
SPLIT_RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, shuffle=True, random_state=SPLIT_RANDOM_STATE
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

# Candidate estimators, written as constructor-call strings that the training
# loop below turns into objects with eval(). Uncomment an entry to run it.
# The text before "(" is the key used to look up that model's parameter grid
# in model_grid_values.
models = [
    # "DecisionTreeClassifier(max_depth=3, random_state=randint(1, 100))",
    # "DecisionTreeClassifier(max_depth=10, random_state=randint(1, 100))",
    "RandomForestClassifier()",
    # "SGDClassifier(max_iter=1000, tol=1e-3, random_state=randint(1, 100))"
]

# Parameter grids passed to GridSearchCV, keyed by estimator class name.
# For RandomForestClassifier each grid currently holds a single value; the
# trailing comment on each line keeps a wider candidate range.
# NOTE(review): presumably the single values were picked from an earlier
# search over those ranges -- confirm.
model_grid_values = {
    "RandomForestClassifier": {'n_estimators': [130],     # 'n_estimators': [90, 100, 115, 130], 
                               'criterion': ['entropy'],  # 'criterion': ['gini', 'entropy'],
                               'max_depth': [11],         # 'max_depth': range(2, 20, 1), 
                               'min_samples_leaf': [4],   # 'min_samples_leaf': range(1, 10, 1),
                               'min_samples_split': [9],  # 'min_samples_split': range(2, 10, 1),
                               'max_features': ['log2']   # 'max_features': ['sqrt', 'log2']
                              },
    "SGDClassifier": {'penalty': ['l1', 'l2'] , 
                      'class_weight': [{1:0.5, 0:0.5}, {1:0.4, 0:0.6}, {1:0.6, 0:0.4}, {1:0.7, 0:0.3}]}
}

# Load the submission test data once. The raw frame is kept untouched because
# its PassengerId column is needed for the submission file; cleanup() and
# one_hot_encoding() are applied to a separate copy instead of re-reading
# the CSV.
final_test_raw = pd.read_csv(f"{INPUT_DIR}test.csv")
final_test_df = final_test_raw.copy()
final_test_input = cleanup(final_test_df)
final_test_input = one_hot_encoding(final_test_input)
# One-hot encoding is done separately for train and test, so a category seen
# in only one of the two files yields different columns. Align the test
# features with the training columns (same set, same order); indicator
# columns absent from the test data are filled with 0.
final_test_input = final_test_input.reindex(columns=X_train.columns, fill_value=0)

for model in models:
    # The class name (text before the first "(") labels the printed output,
    # selects the parameter grid and prefixes the submission file name.
    model_name = model.split("(")[0]
    print(f"********** {model_name} **********")

    # NOTE: eval() is tolerable here only because `models` holds hard-coded
    # literals from this notebook; never feed it externally supplied strings.
    estimator = eval(model)
    # Models without an entry in model_grid_values fall back to an empty
    # grid (a single candidate using the constructor's own parameters)
    # instead of raising KeyError.
    grid_values = model_grid_values.get(model_name, {})
    clf = GridSearchCV(estimator, param_grid=grid_values, scoring='f1', n_jobs=-1)
    clf.fit(X_train, y_train)
    print(f"{model_name} best parameters: {clf.best_params_}")
    y_pred = clf.predict(X_test)

    # Cross validation on the training portion. `clf` is the GridSearchCV
    # wrapper, so the grid search is re-run inside each of the 3 folds.
    print(f"{model_name} train cross_val_score accuracy:")
    print(f"{cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')}")

    # accuracy with test portion
    print(f"{model_name} test accuracy: {accuracy_score(y_test, y_pred)}")

    # Precision and Recall
    # f1_score = 2 * (precision * recall) / (precision + recall)
    test_f1 = f1_score(y_test, y_pred)  # computed once; reused in the file name below
    print(f"{model_name} test precision: {precision_score(y_test, y_pred)}")
    print(f"{model_name} test recall: {recall_score(y_test, y_pred)}")
    print(f"{model_name} test f1_score: {test_f1}")

    # Confusion matrix from out-of-fold predictions on the training portion
    y_cf = cross_val_predict(clf, X_train, y_train, cv=3)
    print(f"{model_name} train confusion_matrix: {confusion_matrix(y_train, y_cf)}")

    # Run on submission test data and save result
    final_test_prediction = clf.predict(final_test_input)
    final_test_prediction = pd.DataFrame(final_test_prediction, columns=[SUBMISSION_OUTPUT_COLUMN])
    output_df = pd.concat(
        [final_test_raw[SUBMISSION_ID_COLUMN], final_test_prediction[SUBMISSION_OUTPUT_COLUMN]],
        axis=1,
    )
    write_output(output_df, f"{model_name}_{test_f1}")
