## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import re

%matplotlib inline

### Dataset and Exploratory Data Analysis (EDA)

In [None]:
# Load the train and test CSVs and stack them so that feature engineering is
# applied to both sets at once.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# ignore_index=True gives the combined frame a unique 0..n-1 index. Without it
# the test rows reuse the train rows' labels (each CSV is read with its own
# default index), which makes label-based selection ambiguous.
data = pd.concat([train_df, test_df], ignore_index=True)

# Preview a few rows instead of printing the whole frame.
data.head()

In [None]:
# Column dtypes and non-null counts of the combined train+test frame.
data.info()

In [None]:
# Summary statistics of the numeric columns.
data.describe()

In [None]:
# Columns used as model inputs, grouped by type.
# 'Alone' and 'Title' do not exist yet; they are engineered in the cells below.
numeric_columns = [
    'Pclass', 'Age', 'SibSp',
    'Parch', 'Alone', 'Fare',
]
categorical_columns = [
    'Title', 'Sex', 'Embarked',
]
features = [*numeric_columns, *categorical_columns]

In [None]:
# Relatives travelling with the passenger: SibSp + Parch (presumably this does
# not count the passenger themself, since nothing is added for them here).
family_size = data["SibSp"] + data["Parch"]

# Alone = 1 only when there are no accompanying relatives. The previous
# threshold (> 1) also flagged passengers with exactly one relative as alone.
data['Alone'] = np.where(family_size > 0, 0, 1)

# family_size is kept as a local Series, so there is no helper column to drop
# afterwards (the old positional form drop([...], 1) is rejected by pandas >= 2.0).
data['Alone'].value_counts()

In [None]:
def _title_from_name(full_name):
    """Return the text between the first comma and the next period, stripped of whitespace."""
    after_comma = full_name.split(',')[1]
    before_period = after_comma.split('.')[0]
    return before_period.strip()


data['Title'] = data['Name'].apply(_title_from_name)
data['Title']

In [None]:
# data['Surname'] = data.Name.apply(lambda x: re.split(',', x)[0].strip())
# data['Surname']

In [None]:
# Summary statistics again, now including the engineered numeric column 'Alone'.
data.describe()

In [None]:
# Row counts per Pclass value, split by Survived.
sb.countplot(x='Pclass', hue='Survived', data=data)

In [None]:
# Row counts per Sex value, split by Survived.
sb.countplot(x='Sex', hue='Survived', data=data)

In [None]:
# Row counts per distinct Age value, split by Survived.
# NOTE(review): countplot draws one bar group per distinct value; if Age is
# continuous a binned histogram would likely be easier to read — confirm.
sb.countplot(x='Age', hue='Survived', data=data)

In [None]:
# Row counts per SibSp value, split by Survived.
sb.countplot(x='SibSp', hue='Survived', data=data)

In [None]:
# Row counts per Parch value, split by Survived.
sb.countplot(x='Parch', hue='Survived', data=data)

In [None]:
# Row counts per distinct Fare value, split by Survived.
# NOTE(review): countplot draws one bar group per distinct value; if Fare is
# continuous a binned histogram would likely be easier to read — confirm.
sb.countplot(x='Fare', hue='Survived', data=data)

In [None]:
# Row counts per Embarked value, split by Survived.
sb.countplot(x='Embarked', hue='Survived', data=data)

In [None]:
# Row counts per extracted Title, split by Survived.
sb.countplot(x='Title', hue='Survived', data=data)

In [None]:
# Row counts for the engineered Alone flag, split by Survived.
sb.countplot(x='Alone', hue='Survived', data=data)

In [None]:
# sb.countplot(x='Surname', hue='Survived', data=data)

In [None]:
# Keep the passenger ids aside: the column is not in `features`, so it is
# removed from `data` during pre-processing, but it is needed again when the
# submission CSV is built.
p_id = data['PassengerId']
p_id

In [None]:
# Keep the target column aside before `data` is reduced to the feature columns.
# Later cells use survived[:891] as the training labels.
survived = data['Survived']
survived

In [None]:
# Column overview after adding 'Alone' and 'Title'.
data.info()

### Pre-processing of Dataset

In [None]:
# Keep only the model inputs. The columns to remove are collected first and
# dropped in a single call via the `columns=` keyword; the positional-axis form
# drop([col], 1) is no longer accepted by pandas >= 2.0.
unused_columns = [col for col in data.columns if col not in features]
data = data.drop(columns=unused_columns)

data.info()

In [None]:
# Missing-value count per remaining column.
data.isna().sum()

In [None]:
# Fill missing Age/Fare with the rounded mean of the training rows only (the
# first 891 rows, the same split used for fitting the models below), so the
# fill value is not influenced by the rows that are predicted later.
# NOTE: astype(int) drops the fractional part of every value in the column.
for num_col in ('Age', 'Fare'):
    train_mean = data[num_col].iloc[:891].mean().round()
    data[num_col] = data[num_col].fillna(train_mean).astype(int)

In [None]:
# Frequency of each Embarked value, with missing values counted as well.
data.Embarked.value_counts(dropna=False)

In [None]:
# Fill missing Embarked values with the hard-coded "S"
# (presumably the most frequent value in the counts shown above — confirm).
data["Embarked"]= data["Embarked"].fillna("S")

In [None]:
# Re-check the Embarked frequencies after filling the missing values.
data.Embarked.value_counts(dropna=False)

In [None]:
# Missing-value count per column after imputation.
data.isna().sum()

In [None]:
# Column dtypes and non-null counts after imputation.
data.info()

In [None]:
from sklearn import preprocessing

# One-hot encode the categorical columns, visiting them in the frame's current
# column order so the dummy columns are appended in that same order.
columns_to_encode = [name for name in data.columns if name in categorical_columns]
for cat_col in columns_to_encode:
    one_hot = pd.get_dummies(data[[cat_col]])
    # Append the dummies, then remove the original text column.
    data = pd.concat([data, one_hot], axis=1).drop(columns=[cat_col])


In [None]:
# Column overview after one-hot encoding.
data.info()

In [None]:
# Remove the dummy columns of the four listed titles in one call. `columns=`
# replaces the positional axis argument (drop([col], 1)), which pandas >= 2.0
# rejects.
# NOTE(review): this keeps only the dummies of the remaining titles — confirm
# that dropping these four is intended.
drop_title_cols = ['Title_Master', 'Title_Miss', 'Title_Mr', 'Title_Mrs']
data = data.drop(columns=drop_title_cols)

In [None]:
# Column overview after removing the selected title dummies.
data.info()

In [None]:
# Last rows of the prepared feature frame.
data.tail()

In [None]:
# Normalization of Data
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only (the first 891 rows,
# the same split used for fitting the models below) and apply them to every
# row, so the rows that are predicted later do not influence the transformation.
# Note: from here on `data` is a NumPy array, not a DataFrame.
scaler = StandardScaler().fit(data[:891])
data = scaler.transform(data)

# Preview the first rows rather than the whole array.
data[:5]

### Model

#### Logistic Regression

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

def best_hyperparams(model, grid_dict, train_data, test_data):
    """Grid-search `model` over `grid_dict` and print the cross-validated accuracy of every candidate.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    grid_dict : dict
        Parameter grid handed to ``GridSearchCV`` as ``param_grid``.
    train_data : array-like
        Feature matrix the search is fitted on.
    test_data : array-like
        Despite the name, these are the target labels for ``train_data``
        (they are passed as ``y`` to ``GridSearchCV.fit``). The name is kept
        because callers pass it by keyword.

    Returns
    -------
    GridSearchCV
        The fitted search, so callers can read ``best_params_`` or
        ``best_estimator_`` instead of copying values from the printout.
        Earlier versions returned nothing; a call left as the last expression
        of a cell will therefore also display the returned object.
    """
    # 10-fold stratified CV repeated 3 times; the fixed seed keeps the folds reproducible.
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    # error_score=0 records a score of 0 for parameter combinations whose fit raises.
    grid_search = GridSearchCV(estimator=model, param_grid=grid_dict, n_jobs=-1, cv=cv,
                               scoring='accuracy', error_score=0)
    grid_result = grid_search.fit(train_data, test_data)

    # summarize results
    print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
    results = grid_result.cv_results_
    for mean, stdev, param in zip(results['mean_test_score'],
                                  results['std_test_score'],
                                  results['params']):
        print(f"{mean:f} ({stdev:f}) with: {param!r}")

    return grid_result

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with C=1 and solver output switched on.
log_reg = LogisticRegression(verbose=1, C=1)

# Show the estimator's attributes as configured.
vars(log_reg)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
log_reg.fit(data[:891], survived[:891])

In [None]:
# Scored on the same rows the model was fitted on, so this is a training
# score, not a held-out estimate.
log_reg.score(data[:891], survived[:891])

In [None]:
# Candidate values for the logistic-regression grid search.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

lr_dict = dict(solver=solvers, penalty=penalty, C=c_values)

best_hyperparams(
    model=log_reg,
    grid_dict=lr_dict,
    train_data=data[:891],
    test_data=survived[:891],
)

In [None]:
# Predict for the rows from position 891 onward and cast the labels to int.
# This uses `log_reg` as fitted above; the grid search's best parameters are
# not applied to it.
pred = log_reg.predict(data[891:]).astype(int)

In [None]:
# Display the logistic-regression predictions.
pred

#### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbours classifier with the library's default settings.
model_knn = KNeighborsClassifier()

# Show the estimator's attributes as configured.
vars(model_knn)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
model_knn.fit(data[:891], survived[:891])


In [None]:
# Scored on the same rows the model was fitted on (training score).
model_knn.score(data[:891], survived[:891])

In [None]:
# Candidate values for the KNN grid search: odd neighbour counts from 1 to 19,
# two weighting schemes and three distance metrics.
n_neighbors = range(1, 21, 2)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']

knn_dict = dict(n_neighbors=n_neighbors, weights=weights, metric=metric)

best_hyperparams(
    model=model_knn,
    grid_dict=knn_dict,
    train_data=data[:891],
    test_data=survived[:891],
)

#### SVM

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with C=50; all other settings are library defaults.
model_svc = SVC(C=50)

# Show the estimator's attributes as configured.
vars(model_svc)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
model_svc.fit(data[:891], survived[:891])

In [None]:
# Scored on the same rows the model was fitted on (training score).
model_svc.score(data[:891], survived[:891])

In [None]:
# Candidate values for the SVC grid search.
C = [50, 10, 1.0, 0.1, 0.01]
gamma = ['scale']
kernel = ['poly', 'rbf', 'sigmoid']

svc_dict = dict(kernel=kernel, C=C, gamma=gamma)

best_hyperparams(
    model=model_svc,
    grid_dict=svc_dict,
    train_data=data[:891],
    test_data=survived[:891],
)

#### Decision Trees

In [None]:
# Single decision tree (this is a plain DecisionTreeClassifier, not a bagged ensemble).
from sklearn.tree import DecisionTreeClassifier

# splitter='random' chooses splits at random, so the seed is fixed to make the
# fitted tree — and the predictions derived from it — reproducible across runs.
model_dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=25,
    min_samples_leaf=3,
    min_samples_split=5,
    splitter='random',
    random_state=0,
)

# Show the estimator's attributes as configured.
vars(model_dt)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
model_dt.fit(data[:891], survived[:891])

In [None]:
# Scored on the same rows the model was fitted on (training score).
model_dt.score(data[:891], survived[:891])

In [None]:
# data.info()

In [None]:
# from sklearn import tree

# fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (100, 100), dpi=50)
# features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Family_Size']
# labels = ['0', '1']
# #create the tree plot
# a = tree.plot_tree(model_dt, feature_names = features, class_names = labels, filled = True, fontsize=20)

# #show the plot
# fig.savefig('tree.png')

In [None]:
# from sklearn.tree import export_text
# feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked', 'Family_Size']

# tree_rules = export_text(model_dt,
#                         feature_names = list(feature_names))

# print(tree_rules)

In [None]:
# from sklearn.model_selection import GridSearchCV

# tuned_parameters = [{'max_depth': [1,2,3,4,5], 
#                      'min_samples_split': [2,4,6,8,10]}]

# scores = ['recall', 'precision', 'f1']

# for score in scores:
    
#     print()
#     print(f"Tuning hyperparameters for {score}")
#     print()
    
#     clf = GridSearchCV(
#         DecisionTreeClassifier(), tuned_parameters,
#         scoring = f'{score}_macro'
#     )
#     clf.fit(data[:891], survived[:891])
    
#     print("Best parameters set found on development set:")
#     print()
#     print(clf.best_params_)
#     print()
#     print("Grid scores on development set:")
#     means = clf.cv_results_["mean_test_score"]
#     stds = clf.cv_results_["std_test_score"]
#     for mean, std, params in zip(means, stds,
#                                  clf.cv_results_['params']):
#         print(f"{mean:0.3f} (+/-{std*2:0.03f}) for {params}")

In [None]:
# n_estimators = [10, 100, 1000]

# grid = dict(n_estimators=n_estimators)
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# grid_search = GridSearchCV(estimator=model_dt, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
# grid_result = grid_search.fit(data[:891], survived[:891])

# # summarize results
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))


In [None]:
# Decision-tree predictions for the rows from position 891 onward, cast to int.
pred_dt = model_dt.predict(data[891:]).astype(int)
pred_dt

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 10 trees, depth capped at 18, with a fixed seed.
model_rf = RandomForestClassifier(n_estimators=10, max_depth=18, random_state=0)

# Show the estimator's attributes as configured.
vars(model_rf)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
model_rf.fit(data[:891], survived[:891])

In [None]:
# Scored on the same rows the model was fitted on (training score).
model_rf.score(data[:891], survived[:891])

In [None]:
# Candidate values for the random-forest grid search.
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

rf_dict = dict(n_estimators=n_estimators, max_features=max_features)

best_hyperparams(
    model=model_rf,
    grid_dict=rf_dict,
    train_data=data[:891],
    test_data=survived[:891],
)

In [None]:
# Random-forest predictions for the rows from position 891 onward, cast to int.
# This uses `model_rf` as fitted above; the grid search's best parameters are
# not applied to it.
pred_rf = model_rf.predict(data[891:]).astype(int)
pred_rf

#### Stochastic Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with progress output switched on. The seed is fixed so the
# fitted model — which produces the submission file below — is reproducible
# across runs. `subsample` is left at its default, so no row subsampling is
# configured here.
model_sgbc = GradientBoostingClassifier(verbose=1, random_state=0)

# Show the estimator's attributes as configured.
vars(model_sgbc)

In [None]:
# Fit on the first 891 rows of `data` with the matching `survived` labels.
model_sgbc.fit(data[:891], survived[:891])

In [None]:
# Scored on the same rows the model was fitted on (training score).
model_sgbc.score(data[:891], survived[:891])

In [None]:
# n_estimators = [10, 100, 1000]
# learning_rate = [0.001, 0.01, 0.1]
# subsample = [0.5, 0.7, 1.0]
# max_depth = [3, 7, 9]

# sgbc_dict = {'n_estimators' : n_estimators, 'learning_rate' : learning_rate, 'subsample': subsample, 'max_depth': max_depth}

# best_hyperparams(model=model_sgbc, grid_dict=sgbc_dict, train_data=data[:891], test_data=survived[:891])

In [None]:
# Gradient-boosting predictions for the rows from position 891 onward, cast to
# int. These are the values written to the submission file.
pred_sgbc = model_sgbc.predict(data[891:]).astype(int)
pred_sgbc

## Generate CSV

In [None]:
# Pair the passenger ids from position 891 onward with the gradient-boosting predictions.
prediction = pd.DataFrame({'PassengerId': p_id[891:], 'Survived': pred_sgbc})

In [None]:
# Write the submission file without the DataFrame index column.
prediction.to_csv('final-sgbc.csv', index=False)