In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sns
import pandas_profiling
import plotly.express as px
import plotly.graph_objects as go
import sklearn.metrics as metrics

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_curve,auc, confusion_matrix, classification_report

In [None]:
# Read the training and test CSV files, then preview the training frame.
TRAIN_CSV = "/kaggle/input/titanic/train.csv"
TEST_CSV = "/kaggle/input/titanic/test.csv"

train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)
train_df.head()

In [None]:
# Column labels of the training frame.
train_df.columns

In [None]:
# dtype of each training column.
train_df.dtypes

In [None]:
# Number of missing values per training column.
train_df.isna().sum()

In [None]:
# Number of distinct values per training column.
train_df.nunique()

In [None]:
# Number of non-null entries per training column. `count` has to be called:
# the bare attribute only displays the bound method, not the counts.
train_df.count()

In [None]:
# Visualize where the NaNs are: draw missingno's matrix plot for the
# training frame.

msno.matrix(train_df)

In [None]:
# Inspect features: build a pandas_profiling report for the training frame.
# The report object is only stored here; it is displayed by the next cell.

profile = pandas_profiling.ProfileReport(train_df)

In [None]:
# Display the profiling report as this cell's output.
profile

In [None]:
# Create new features on both frames:
#   'relatives'       - SibSp + Parch
#   'travelled_alone' - 'Yes' when relatives == 0, 'No' when relatives > 0

data = [train_df, test_df]
for frame in data:
    kin = frame['SibSp'].add(frame['Parch'])
    frame['relatives'] = kin
    frame.loc[kin == 0, 'travelled_alone'] = 'Yes'
    frame.loc[kin > 0, 'travelled_alone'] = 'No'

train_df['travelled_alone'].value_counts()

In [None]:
# 'PassengerId' carries no survival signal, so remove it from the training
# frame (the test frame is left untouched by this cell).

train_df = train_df.drop(columns=['PassengerId'])

In [None]:
# Engineer new feature, 'Deck', from 'Cabin': the first run of letters in the
# cabin string, restricted to the keys of `deck`; anything else becomes "U".

import re

deck = {"A": "A", "B": "B", "C": "C", "D": "D", "E": "E", "F": "F", "G": "G", "U": "U"}
_cabin_letters = re.compile("([a-zA-Z]+)")


def _leading_letters(cabin):
    """Return the first run of ASCII letters found in `cabin`."""
    return _cabin_letters.search(cabin).group()


data = [train_df, test_df]
for frame in data:
    # Missing cabins become "U0" so the letter search always has a match.
    cabins = frame['Cabin'].fillna("U0")
    frame['Cabin'] = cabins
    frame['Deck'] = cabins.map(_leading_letters).map(deck).fillna("U")

# Drop cabin feature
train_df = train_df.drop(columns=['Cabin'])
test_df = test_df.drop(columns=['Cabin'])



In [None]:
# Fill NaN values in 'Age' with random whole numbers drawn from
# [mean - std, mean + std), where mean and std are taken from the training
# ages only.

data = [train_df, test_df]

# Compute the statistics once, before anything is imputed, so that both
# frames are filled from the same range and the filled-in training values do
# not feed back into the statistics used for the test frame.
age_mean = train_df["Age"].mean()
age_std = train_df["Age"].std()
age_low, age_high = int(age_mean - age_std), int(age_mean + age_std)

# Fixed seed so the imputed ages are identical on every full re-run.
age_rng = np.random.default_rng(42)

for dataset in data:
    missing_age = dataset["Age"].isnull()
    # One random age per missing entry (upper bound is exclusive).
    rand_age = age_rng.integers(age_low, age_high, size=int(missing_age.sum()))
    dataset.loc[missing_age, "Age"] = rand_age
    # Cast each frame's OWN column; assigning train_df's column here would
    # overwrite the test ages with training ages through index alignment.
    dataset["Age"] = dataset["Age"].astype(int)

In [None]:
# Replace missing 'Embarked' entries in both frames with `common_value`.
common_value = 'S'
data = [train_df, test_df]

for frame in data:
    port = frame['Embarked']
    frame['Embarked'] = port.where(port.notna(), common_value)

In [None]:
# Missing fares become 0, then the whole 'Fare' column is truncated to int.
data = [train_df, test_df]

for frame in data:
    frame['Fare'] = frame['Fare'].fillna(0).astype(int)

In [None]:
# Cast the training 'Fare' column to int. The loop in the preceding cell
# already applies the same cast to both frames, so this repeats it.
train_df['Fare'] = train_df['Fare'].astype(int)

In [None]:
# Cast the test 'Fare' column to int. The earlier fill-and-cast loop already
# applies the same cast to both frames, so this repeats it.
test_df['Fare'] = test_df['Fare'].astype(int)

In [None]:
# Begin the process of extracting titles: capture the run of letters that
# follows a space and is terminated by a '.' in each name.
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot; in a normal string literal '\.' is an invalid escape sequence.

train_titles = train_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
type(train_titles)

In [None]:
# Inspect how often each extracted title occurs in the training names.

train_titles.value_counts()

In [None]:
# Engineer new feature 'Title': extract the title from 'Name', fold uncommon
# titles into "Rare" and spelling variants into their common form, then drop
# 'Name' from both frames.

data = [train_df, test_df]

# Integer codes for the titles. Currently unused: 'Title' is kept as a string
# column (it is one-hot encoded later together with the other object columns).
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

# Raw string so that '\.' is passed to the regex engine as an escaped dot.
title_pattern = r' ([A-Za-z]+)\.'

rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
               'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# One lookup table for every substitution; no key is also a replacement value,
# so applying them in a single pass matches applying them one after another.
title_replacements = {rare: 'Rare' for rare in rare_titles}
title_replacements.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in data:
    extracted = dataset['Name'].str.extract(title_pattern, expand=False)
    # Names without a recognisable title are labelled "NA".
    dataset['Title'] = extracted.replace(title_replacements).fillna("NA")

train_df = train_df.drop(['Name'], axis=1)
test_df = test_df.drop(['Name'], axis=1)

In [None]:
# Remove the 'Ticket' column from both frames.
train_df, test_df = [frame.drop(columns=['Ticket']) for frame in (train_df, test_df)]

In [None]:
# Engineer new feature, 'Age_Class': the product of 'Age' and 'Pclass'.

data = [train_df, test_df]
for frame in data:
    frame['Age_Class'] = frame['Age'].mul(frame['Pclass'])



In [None]:
# Engineer new feature, "Fare_Per_Person": the fare divided by the size of
# the travelling party (the passenger plus their relatives), truncated to int.

# Build the frame list here rather than relying on a `data` variable left
# behind by an earlier cell, so the cell works regardless of execution order.
data = [train_df, test_df]

for dataset in data:
    party_size = dataset['relatives'] + 1
    dataset['Fare_Per_Person'] = (dataset['Fare'] / party_size).astype(int)

In [None]:
# Assign 'Age' to categories: replace the numeric age with a text label.
# Intervals are right-inclusive:
#   <=2, (2,12], (12,18], (18,24], (24,45], (45,64], >64

age_edges = [-np.inf, 2, 12, 18, 24, 45, 64, np.inf]
age_labels = ["Children", "Teens", "Youngsters", "Young Adults",
              "Adults", "Middle Age", "Senior"]

data = [train_df, test_df]
for frame in data:
    whole_years = frame['Age'].astype(int)
    binned = pd.cut(whole_years, bins=age_edges, labels=age_labels)
    # Store plain strings rather than a categorical column.
    frame['Age'] = binned.astype(str)

# inspect how age is distributed
train_df['Age'].value_counts()

In [None]:
# Assign 'Fare' to categories: replace the numeric fare with a text label.
# Intervals are right-inclusive:
#   <=7.91, (7.91,14.454], (14.454,31], (31,99], (99,250], >250

fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]
fare_labels = ["Extremely Low", "Very Low", "Low",
               "High", "Very High", "Extremely High"]

data = [train_df, test_df]

for frame in data:
    binned = pd.cut(frame['Fare'], bins=fare_edges, labels=fare_labels)
    # Store plain strings rather than a categorical column.
    frame['Fare'] = binned.astype(str)

In [None]:
# Print the summary of the training frame (columns, non-null counts, dtypes).
train_df.info()

In [None]:
# Turn the numeric passenger class into a text label ("Class1".."Class3").
pclass_names = {'1': "Class1", '2': "Class2", '3': "Class3"}
data = [train_df, test_df]

for frame in data:
    frame['Pclass'] = frame['Pclass'].astype(str).replace(pclass_names)

In [None]:
# Collect the names of the integer/float columns of the training frame;
# these are the candidates for scaling.

numeric_kinds = ['int64', 'float64', 'int32']
train_numerical_features = train_df.select_dtypes(include=numeric_kinds).columns.tolist()
train_numerical_features

In [None]:
# Exclude the target from the columns to be scaled. It is removed by name
# rather than by position, so the result does not depend on column order and
# re-running this cell cannot strip a real feature from the list.
train_numerical_features = [col for col in train_numerical_features if col != 'Survived']
train_numerical_features

In [None]:
# Feature scaling: standardize the numerical training features.
ss_scaler = StandardScaler()
# Work on an explicit copy: wrapping train_df in pd.DataFrame(...) does not
# copy it, so the scaled values could otherwise show up in train_df as well.
train_df_ss = train_df.copy()
train_df_ss[train_numerical_features] = ss_scaler.fit_transform(train_df_ss[train_numerical_features])

In [None]:
# Collect the names of the integer/float columns of the test frame.
test_numeric_kinds = ['int64', 'float64', 'int32']
test_numerical_features = test_df.select_dtypes(include=test_numeric_kinds).columns.tolist()
test_numerical_features

In [None]:
# Exclude the identifier from the columns to be scaled. It is removed by name
# rather than by position, so the result does not depend on column order and
# re-running this cell cannot strip a real feature from the list.
test_numerical_features = [col for col in test_numerical_features if col != 'PassengerId']
test_numerical_features

In [None]:
# Feature scaling for the test set.
#
# The test features are transformed with the scaler that was fitted on the
# training features. Fitting a second scaler on the test set would put the two
# sets on different scales, and the models are trained on the training scale.

# Name kept for any later cell that refers to it; it is the train-fitted scaler.
test_ss_scaler = ss_scaler

# The scaler expects the same features, in the same order, as it was fitted on.
assert test_numerical_features == train_numerical_features, \
    "train and test numerical feature lists differ"

# Explicit copy so that scaling does not write through to test_df.
test_df_ss = test_df.copy()
test_df_ss[test_numerical_features] = test_ss_scaler.transform(test_df_ss[test_numerical_features])

In [None]:
# One-Hot encoding (training set): every object-typed column is replaced by
# its dummy columns, named "<column>_<value>" and appended after the
# remaining columns in the order the source columns are listed.

encode_col_list = train_df.select_dtypes(include=['object']).columns.tolist()
train_df_ss = pd.get_dummies(train_df_ss, columns=encode_col_list)

In [None]:
# One-Hot encoding (test set): every object-typed column is replaced by its
# dummy columns, named "<column>_<value>" and appended after the remaining
# columns in the order the source columns are listed.

test_encode_col_list = test_df.select_dtypes(include=['object']).columns.tolist()
test_df_ss = pd.get_dummies(test_df_ss, columns=test_encode_col_list)

In [None]:
# Split the encoded frames into the model inputs and the target.
X_train = train_df_ss.drop("Survived", axis=1)
y_train = train_df_ss["Survived"]

# The dummy columns of the two frames were generated independently, so the
# test matrix is aligned to the training columns: same columns, same order.
# A category that never occurs in the test set becomes an all-zero column,
# and a test-only category is dropped.
X_test = (
    test_df_ss
    .drop("PassengerId", axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# Short aliases: X is the training feature matrix, y the target variable.
X, y = X_train, y_train

In [None]:
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42) #, stratify=y)

In [None]:
# Shapes of the training features, training target and test features.
X_train.shape, y_train.shape, X_test.shape

In [None]:
# Classify using a Random Forest (RF)
# RandomForestClassifier is already imported in the notebook's import cell.

# Fixed seed: this forest produces the submission, so it must come out the
# same on every full re-run.
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)

# Accuracy on the very rows the forest was fitted on. This is an optimistic
# number; it is kept because the model comparison table reads RF_acc.
y_pred = RF.predict(X_train)
RF_acc = metrics.accuracy_score(y_train, y_pred)

print("Accuracy:", RF_acc)

# 5-fold cross-validated accuracy as an estimate on unseen passengers.
# cross_val_score fits clones of RF, so the fitted forest above is unchanged.
RF_cv_acc = cross_val_score(RF, X_train, y_train, cv=5, scoring="accuracy").mean()

print("Cross-validated accuracy:", RF_cv_acc)

In [None]:
# Assess RF accuracy


In [None]:
# Classify using Naive Bayes (NB): fit a Gaussian NB model and report its
# accuracy on the training set.

from sklearn.naive_bayes import GaussianNB

GNB = GaussianNB().fit(X_train, y_train)
y_pred = GNB.predict(X_train)
GNB_acc = accuracy_score(y_train, y_pred)

print("Accuracy:", GNB_acc)


In [None]:
# Classify using Logistic Regression: fit with default settings and report
# the accuracy on the training set.
from sklearn.linear_model import LogisticRegression

LogReg = LogisticRegression().fit(X_train, y_train)
y_pred = LogReg.predict(X_train)
LogReg_acc = accuracy_score(y_train, y_pred)

print("Accuracy:", LogReg_acc)


In [None]:
# Classify using Linear Discriminant Analysis (LDA) and report the accuracy
# on the training set.
#
# The class is imported under its own name: importing it as `LDA` and then
# assigning the instance to `LDA` replaces the class, so running the cell a
# second time fails because the instance is not callable.
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

LDA = LinearDiscriminantAnalysis(n_components=1)
LDA.fit(X_train, y_train)

y_pred = LDA.predict(X_train)
LDA_acc = metrics.accuracy_score(y_train, y_pred)

print("Accuracy:", LDA_acc)

In [None]:
# Classify using XG Boost (XGB): fit with default settings and report the
# accuracy on the training set.
from xgboost import XGBClassifier

XGB = XGBClassifier()
XGB.fit(X_train, y_train)

y_pred = XGB.predict(X_train)
XGB_acc = accuracy_score(y_train, y_pred)

print("Accuracy:", XGB_acc)

In [None]:
# Tune XGB parameters using brute force

#import warnings
#warnings.filterwarnings('ignore')

#from datetime import datetime
#from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
#from sklearn.metrics import roc_auc_score
#from sklearn.model_selection import StratifiedKFold

In [None]:
# Time our brute force parameter tuning

#    if not start_time:
#        start_time = datetime.now()
#        return start_time
#    elif start_time:
#        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
#        tmin, tsec = divmod(temp_sec, 60)
#        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

In [None]:
# A parameter grid for XGBoost (bruteforce)

#params = {
#        'min_child_weight': [1, 5, 10],
#        'gamma': [0.5, 1, 1.5, 2, 5],
#        'subsample': [0.6, 0.8, 1.0],
#        'colsample_bytree': [0.6, 0.8, 1.0],
#        'max_depth': [3, 4, 5]
 #       }

In [None]:
#xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
#                    silent=True, nthread=1)

In [None]:
#folds = 5
#param_comb = 5

#skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1)

#random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1 )


#start_time = timer(None)
#random_search.fit(X, y)
#timer(start_time) 

In [None]:
#print('\n All results:')
#print(random_search.cv_results_)
#print('\n Best estimator:')
#print(random_search.best_estimator_)
#print('\n Best normalized gini score for %d-fold search with %d parameter combinations:' % (folds, param_comb))
#print(random_search.best_score_ * 2 - 1)
#print('\n Best hyperparameters:')
#print(random_search.best_params_)
#results = pd.DataFrame(random_search.cv_results_)
#results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

In [None]:
#XGB = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#              colsample_bynode=1, colsample_bytree=1.0, gamma=1, gpu_id=-1,
#              importance_type='gain', interaction_constraints='',
#              learning_rate=0.02, max_delta_step=0, max_depth=5,
#              min_child_weight=5, monotone_constraints='()',
#              n_estimators=600, n_jobs=1, nthread=1, num_parallel_tree=1,
#              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
#              silent=True, subsample=1.0, tree_method='exact',
#              validate_parameters=1, verbosity=None)

#XGB.fit(X_train, y_train)
#y_pred = XGB.predict(X_train)
#XGB_acc = metrics.accuracy_score(y_train, y_pred)
#print("Accuracy:",XGB_acc)

In [None]:
# Compare the models: one row per classifier with its accuracy score,
# highest score first.
score_rows = [
    ('Logistic Regression', LogReg_acc),
    ('Random Forest', RF_acc),
    ('Naive Bayes', GNB_acc),
    ('XGBoost', XGB_acc),
    ('LDA', LDA_acc),
]
models = pd.DataFrame(score_rows, columns=['Model', 'Score'])
models.sort_values(by='Score', ascending=False)

In [None]:
# Predict the test-set labels with the fitted random forest.
submission_preds = RF.predict(X_test)

In [None]:
# Pair every test PassengerId with its predicted label and write the result
# to 'submission.csv' without the index column.
submission = test_df[["PassengerId"]].assign(Survived=submission_preds)
submission.to_csv('submission.csv', index=False)

In [None]:
#        TO 
#                BE 
#                        CONTINUED......