In [1]:
import pandas as pd
import numpy as np
import logging
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer   
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def load_csv(path: str):
    """Read the train and test splits stored as CSV files inside ``path``.

    Parameters
    ----------
    path : str
        Directory containing ``train.csv`` and ``test.csv``.

    Returns
    -------
    tuple
        ``(df_train, df_test)`` as two pandas DataFrames, in that order.

    Raises
    ------
    Exception
        Any error raised by ``pd.read_csv`` (for example ``FileNotFoundError``
        when one of the files is missing) is reported and then re-raised.
    """
    try:
        df_train = pd.read_csv(os.path.join(path, 'train.csv'))
        df_test = pd.read_csv(os.path.join(path, 'test.csv'))
    except Exception as e:
        # Report the failure, then propagate it. Swallowing the exception made
        # the function return None, which only surfaced later as an unrelated
        # "cannot unpack NoneType" error at the call site.
        print(f"Error during reading csv {e}")
        raise

    return (df_train, df_test)


In [3]:
def create_pipeline(df_train, df_test):
    """Fit the preprocessing on the training frame and apply it to both splits.

    Numerical columns are mean-imputed and min-max scaled; categorical columns
    are imputed with the most frequent value and one-hot encoded. The
    preprocessor is fitted on ``df_train`` only and then used to transform
    both frames, so no statistics are learned from ``df_test``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that contain the columns ``Sex``, ``Embarked``, ``Pclass``,
        ``Age``, ``SibSp``, ``Parch``, ``Fare`` and ``Survived``.

    Returns
    -------
    tuple
        ``(train, test)`` DataFrames holding the transformed features
        (positionally named columns) followed by a ``label`` column copied
        from ``Survived``.
    """
    categorical_features = ['Sex', 'Embarked', 'Pclass']
    numerical_features = ['Age', 'SibSp', 'Parch', 'Fare']

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    numerical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler())
    ])

    # Combine transformers into a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop',  # Drop any other columns not specified
        # OneHotEncoder emits sparse matrices; force a dense result so that
        # pd.DataFrame(...) below always receives a regular 2-D array.
        sparse_threshold=0
    )

    # Select the inputs by name rather than by position: the transformers only
    # ever read these columns, and a positional slice would silently discard a
    # feature that happened to be the first column of the frame.
    feature_columns = numerical_features + categorical_features
    x_train = df_train[feature_columns]
    x_test = df_test[feature_columns]

    # Fit on the training split only, then reuse the fitted statistics.
    preprocessor.fit(x_train)

    train = pd.DataFrame(preprocessor.transform(x_train))
    # .values strips the index so labels are assigned positionally.
    train['label'] = df_train.Survived.values

    test = pd.DataFrame(preprocessor.transform(x_test))
    test['label'] = df_test.Survived.values

    return (train, test)

In [15]:
# Directory (relative to this notebook) holding train.csv and test.csv.
data_path = "../data/raw"
df_train, df_test = load_csv(data_path)


In [5]:
# Preprocess both splits; the transformers are fitted on df_train only.
train, test = create_pipeline(df_train, df_test)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [7]:
# Candidate models to compare. Every estimator with a stochastic component is
# seeded so that a Restart & Run All reproduces the same scores.
# NOTE(review): n_estimators=2 is very small for the ensemble models; the GBDT
# run recorded in this notebook reports precision 0.0 - confirm it is intended.
svc = SVC(kernel="sigmoid", gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
# random_state added: the tree and the liblinear solver are otherwise unseeded,
# unlike the remaining models below.
dtc = DecisionTreeClassifier(max_depth=5, random_state=2)
lrc = LogisticRegression(solver='liblinear', penalty='l1', random_state=2)
rfc = RandomForestClassifier(n_estimators=2, random_state=2)
abc = AdaBoostClassifier(n_estimators=2, random_state=2)
bc = BaggingClassifier(n_estimators=2, random_state=2)
etc = ExtraTreesClassifier(n_estimators=2, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=2, random_state=2)
xgb = XGBClassifier(n_estimators=2, random_state=2)

In [8]:
# Display name -> estimator instance, in the order they are evaluated.
clfs = dict(
    SVC=svc,
    KNN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    Adaboost=abc,
    Bgc=bc,
    ETC=etc,
    GBDT=gbdt,
    xgb=xgb,
)

In [9]:
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(clfs, X_train, y_train, X_test, y_test):
    """Fit a single classifier and score it on the held-out split.

    Parameters
    ----------
    clfs : estimator
        One estimator exposing ``fit`` and ``predict`` (despite the plural
        name, this is a single model, not a collection).
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Evaluation features passed to ``predict`` and the labels the
        predictions are compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed on the test split.
    """
    clfs.fit(X_train, y_train)
    y_pred = clfs.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # A model that predicts no positive sample has an undefined precision.
    # zero_division=0 reports it as 0.0 (the same value as the default) without
    # emitting an UndefinedMetricWarning for every such model.
    precision = precision_score(y_test, y_pred, zero_division=0)
    return accuracy, precision

In [10]:
# Split each preprocessed frame into a feature matrix (every column except the
# trailing 'label') and the label vector.
y_train = train["label"].values
y_test = test["label"].values
X_train = train.iloc[:, :-1].values
X_test = test.iloc[:, :-1].values

In [11]:
accuracy_scores = []
precision_scores = []

# Train and score every candidate model, keeping the metrics in dict order.
# The loop variable is deliberately NOT called `clfs`: reusing that name rebinds
# the dictionary to the last estimator, so re-running this cell would fail on
# `.items()`.
for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print()
    print("For: ", name)
    print("Accuracy: ", current_accuracy)
    print("Precision: ", current_precision)

    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)


For:  SVC
Accuracy:  0.6536312849162011
Precision:  0.5857142857142857

For:  KNN
Accuracy:  0.8044692737430168
Precision:  0.782608695652174

For:  NB
Accuracy:  0.7653631284916201
Precision:  0.6951219512195121

For:  DT
Accuracy:  0.7988826815642458
Precision:  0.8275862068965517

For:  LR
Accuracy:  0.7988826815642458
Precision:  0.7714285714285715

For:  RF
Accuracy:  0.7821229050279329
Precision:  0.7868852459016393

For:  Adaboost
Accuracy:  0.7821229050279329
Precision:  0.7536231884057971

For:  Bgc
Accuracy:  0.776536312849162
Precision:  0.7833333333333333

For:  ETC
Accuracy:  0.8100558659217877
Precision:  0.803030303030303

For:  GBDT
Accuracy:  0.5865921787709497
Precision:  0.0

For:  xgb
Accuracy:  0.8100558659217877
Precision:  0.8846153846153846


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [12]:
# Display the preprocessed training frame returned by create_pipeline
# (transformed feature columns followed by the 'label' column).
train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,label
0,0.566474,0.000,0.000000,0.055628,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
1,0.283740,0.000,0.000000,0.025374,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0
2,0.396833,0.000,0.000000,0.015469,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
3,0.321438,0.125,0.000000,0.015330,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
4,0.070118,0.500,0.333333,0.061045,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
707,0.258608,0.000,0.000000,0.014932,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1
708,0.365404,0.000,0.000000,0.060508,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0
709,0.509927,0.250,0.000000,0.027538,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0
710,0.170646,0.125,0.333333,0.234224,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
