In [12]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xg 
import lightgbm as lgbm 
from lightgbm import LGBMClassifier

In [11]:
def base_models(data):
    """Fit a set of baseline classifiers and tabulate their test-set metrics.

    Parameters
    ----------
    data : sequence
        Either ``(X, y)``, which is split 80/20 with ``random_state=0``, or an
        already split ``(X_train, X_test, y_train, y_test)``.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``f1``, ``auc``, ``accuracy``,
        ``precision`` and ``recall``, sorted in ascending order of ``f1``.

    Raises
    ------
    ValueError
        If ``data`` has neither 2 nor 4 elements.

    Notes
    -----
    The metric functions are called with their scikit-learn defaults, and the
    AUC uses the second column of ``predict_proba``, so a binary target is
    expected. A classification report is printed for every model.
    """
    # Names and estimators are kept together so that the result labels can
    # never drift out of step with the list of models.
    models = {
        'DecisionTree': DecisionTreeClassifier(),
        'LogisticRegression': LogisticRegression(),
        'RandomForest': RandomForestClassifier(),
        'GradientBoosting': GradientBoostingClassifier(),
        # xgboost is imported as ``xg``; a bare ``XGBClassifier`` is undefined.
        'XGBoost': xg.XGBClassifier(),
        'LightGBM': LGBMClassifier(),
    }

    if len(data) == 2:
        X, y = data
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    elif len(data) == 4:
        X_train, X_test, y_train, y_test = data
    else:
        raise ValueError(
            'data must be (X, y) or (X_train, X_test, y_train, y_test), '
            f'got {len(data)} elements'
        )

    scores = []
    for name, clf in models.items():
        clf.fit(X_train, y_train)
        y_preds = clf.predict(X_test)
        # ROC AUC is a ranking metric: score it on the positive-class
        # probability rather than on the thresholded class labels.
        y_scores = clf.predict_proba(X_test)[:, 1]

        scores.append({
            'f1': f1_score(y_test, y_preds),
            'auc': roc_auc_score(y_test, y_scores),
            'accuracy': accuracy_score(y_test, y_preds),
            'precision': precision_score(y_test, y_preds),
            'recall': recall_score(y_test, y_preds),
        })

        print(f'--- {name} ---')
        print('Classification Report')
        print(classification_report(y_test, y_preds))

    # Build the table once, after every model has been evaluated.
    results = pd.DataFrame(scores, index=list(models))

    return results.sort_values(by='f1')

In [13]:
def plot_roc_curve(y_true, y_preds, model_name):
    """Plot the ROC curve of one model against the no-skill diagonal.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, passed to ``roc_curve`` as ``y_true``.
    y_preds : array-like
        Scores passed to ``roc_curve`` as ``y_score``. Presumably these should
        be probabilities or decision values rather than hard class labels,
        otherwise the curve has a single interior point (verify at call sites).
    model_name : str
        Legend label for the model's curve; also used in the figure title.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Use the labels passed in, not a global ``y_test`` that may be missing
    # or may belong to a different split.
    fpr, tpr, _ = roc_curve(y_true, y_preds)

    plt.figure(figsize=(10, 10))
    plt.plot([0, 1], [0, 1], linestyle='--', label='no skill')
    plt.plot(fpr, tpr, marker='.', label=model_name)
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(f'ROC curve: {model_name}')
    plt.legend()
    plt.show()

In [3]:
# Load the train and test CSVs of the tabular-playground-series-apr-2021 dataset.
# The paths are relative, so they resolve against the notebook's working directory.
train_data = pd.read_csv('Datasets/tabular-playground-series-apr-2021/train.csv')
test_data = pd.read_csv('Datasets/tabular-playground-series-apr-2021/test.csv')

In [4]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [5]:
# Preview the first rows of the test data.
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C


In [6]:
# Show the column dtypes and non-null counts of the training data.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   PassengerId  100000 non-null  int64  
 1   Survived     100000 non-null  int64  
 2   Pclass       100000 non-null  int64  
 3   Name         100000 non-null  object 
 4   Sex          100000 non-null  object 
 5   Age          96708 non-null   float64
 6   SibSp        100000 non-null  int64  
 7   Parch        100000 non-null  int64  
 8   Ticket       95377 non-null   object 
 9   Fare         99866 non-null   float64
 10  Cabin        32134 non-null   object 
 11  Embarked     99750 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 9.2+ MB


In [7]:
# Percentage of missing values per column in the training data.
train_data.isnull().sum()*100/len(train_data)

PassengerId     0.000
Survived        0.000
Pclass          0.000
Name            0.000
Sex             0.000
Age             3.292
SibSp           0.000
Parch           0.000
Ticket          4.623
Fare            0.134
Cabin          67.866
Embarked        0.250
dtype: float64

In [8]:
# Percentage of missing values per column in the test data.
test_data.isnull().sum()*100/len(test_data)

PassengerId     0.000
Pclass          0.000
Name            0.000
Sex             0.000
Age             3.487
SibSp           0.000
Parch           0.000
Ticket          5.181
Fare            0.133
Cabin          70.831
Embarked        0.277
dtype: float64