In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the training and test splits from CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first rows to sanity-check how the columns were parsed.
train.head()

In [None]:
# Report the (rows, columns) dimensions of the training frame.
print('Shape is ', train.shape)

In [None]:
# Number of missing values per column.
train.isnull().sum()

# each feature has some missing values so we need to work on that.

In [None]:
# Data type pandas inferred for each column.
train.dtypes

# Object columns need transformation ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Name', 'VIP']

In [None]:
# Skewness of every float64 column (positive values indicate a long right tail).
train.select_dtypes('float64').skew()

#  These columns are right-skewed: ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
#  apply np.log to those columns.

In [None]:
# Count the rows in each class of the target to check whether it is balanced.
train['Transported'].value_counts()

# Yes, It is almost balanced as we can see.
# But we need to transform this column from bool to int.

In [None]:
# Pairwise correlations of the numeric/boolean columns only.
# Calling train.corr() on the whole frame raises in pandas >= 2.0 because of the
# string columns; selecting the dtypes first works on every pandas version and
# still keeps the boolean target in the matrix.
correlations = train.select_dtypes(include=['number', 'bool']).corr()

# Poor correlations between the columns and the target variable.
# We need to work on the feature engineering a little bit.

In [None]:
# One histogram per column that DataFrame.hist can plot, on a 13x10 inch figure.
train.hist(figsize=(13, 10))

In [None]:
# Density plot per column, each in its own subplot of a 3x3 grid with independent x axes.
train.plot(kind='density', subplots=True, layout=(3,3), sharex=False, figsize=(13, 10))

# What the hell is going on with those columns !!

In [None]:
# Box plot per column, each in its own subplot of a 3x3 grid, to expose outliers.
train.plot(kind='box', subplots=True, layout=(3,3), sharex=False, figsize=(13, 10))

# There is something really weird with the numerical columns except Age column

In [None]:
# Encode the target: True -> 1, False -> 0, stored as an integer column.
mapping = {True: 1, False: 0}
train['Transported'] = train['Transported'].map(mapping).astype(int)

In [None]:
# Plot the correlation matrix as a heat-map.
# Ticks and labels are derived from `correlations` itself: the previous
# hard-coded np.arange(0, 7) and the labels taken from a separate
# train.select_dtypes(...) call only lined up when that selection happened to
# have exactly the same columns, in the same order, as the matrix.
labels = correlations.columns.values
ticks = np.arange(len(labels))

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)  # fixed colour scale over [-1, 1]
fig.colorbar(cax)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(labels, rotation=90)  # rotate so long column names do not overlap
ax.set_yticklabels(labels)

# Poor correlation between the columns and the target column.

# Now it's time to go through the data preprocessing step

## Creating some new features as part of feature engineering.

In [None]:
# Create three new features by splitting "Cabin" on "/" through the string
# accessor; expand=True returns the pieces as separate columns.
cabin_parts = train["Cabin"].str.split("/", expand=True)
train[["Deck", "Num", "Side"]] = cabin_parts
train[["Cabin", "Deck", "Num", "Side"]]

In [None]:
# Create two new features by splitting "PassengerId" on "_" through the string
# accessor; expand=True returns the pieces as separate columns.
passenger_id_parts = train["PassengerId"].str.split("_", expand=True)
train[["Group", "Group_Size"]] = passenger_id_parts
train[['PassengerId', 'Group', 'Group_Size']]

In [None]:
# New feature: 1 when the passenger is the only member of their travel group.
# The previous `train['Group_Size'] == 1` was always False, because the split
# of PassengerId leaves 'Group_Size' as a string (e.g. "01"), so 'Solo' was a
# constant 0.
# NOTE(review): 'Group_Size' appears to be the passenger's number within the
# group (PassengerId looks like gggg_pp), not a head-count - confirm against the
# data description. The head-count is therefore taken from how many rows share
# the same 'Group'.
group_head_count = train['Group'].map(train['Group'].value_counts())
train['Solo'] = (group_head_count == 1).astype(int)

### Handling the missing values and encoding the categorical variables.

In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Categorical columns with a small, known set of values ------------------
# column -> (value-to-code mapping, code used for missing/unmapped values).
# Fix: the HomePlanet key was spelled 'Europe'; the value in the data is
# 'Europa' (NOTE(review): confirm with train['HomePlanet'].unique()), so those
# passengers were mapped to NaN and then silently filled in as Earth (2).
fixed_encodings = {
    'HomePlanet': ({'Europa': 1, 'Earth': 2, 'Mars': 3}, 2),
    'CryoSleep': ({True: 1, False: 0}, 0),
    'VIP': ({True: 1, False: 0}, 0),
    'Destination': ({'TRAPPIST-1e': 1, 'PSO J318.5-22': 2, '55 Cancri e': 3}, 1),
}
for col, (mapping, default_code) in fixed_encodings.items():
    train[col] = train[col].map(mapping).fillna(default_code).astype(int)

# --- Spending columns: fill missing values with the rounded column mean -----
for col in ['ShoppingMall', 'VRDeck', 'FoodCourt', 'Spa', 'RoomService']:
    filling_num = round(train[col].mean())
    train[col] = train[col].fillna(filling_num)

# --- Age: fill with a fixed value --------------------------------------------
# NOTE(review): 24 is hard-coded; presumably the mode of Age - confirm.
# (The previously computed but unused train['Age'].mode() was removed.)
train['Age'] = train['Age'].fillna(24)

# --- Numeric identifiers stored as strings -----------------------------------
# Convert directly instead of label-encoding: LabelEncoder ranks strings
# lexicographically ('10' < '2'), and ranks fitted on one frame do not mean the
# same thing on another. Missing values get the column's most frequent value.
for col in ['Group', 'Group_Size', 'Num']:
    as_number = pd.to_numeric(train[col], errors='coerce')
    train[col] = as_number.fillna(as_number.mode().iloc[0]).astype(int)

# --- High-cardinality / letter-coded columns: label-encode -------------------
# Fix: fill BEFORE encoding and use the scalar mode. The old code called
# fillna(Series.mode()) after transform(), which was a no-op (the encoded array
# has no NaN and a Series argument aligns on the index), so missing values were
# encoded as their own class or, depending on the scikit-learn version, raised.
# NOTE(review): codes produced here come from an encoder fitted on this frame
# only; they are not comparable with codes fitted on a different frame.
for col in ['Cabin', 'Deck', 'Side']:
    filled = train[col].fillna(train[col].mode().iloc[0])
    train[col] = LabelEncoder().fit_transform(filled).astype(int)

# 'Solo' is already a 0/1 flag; just make sure it is stored as int.
train['Solo'] = train['Solo'].astype(int)


In [None]:
# Box plot per column, one subplot each on a 6x6 grid with independent x axes.
train.plot(kind='box', subplots=True, sharex=False, legend=True, figsize=(20, 20), layout=(6, 6))

### Reducing the skewness by applying the np.log1p function twice to the skewed columns mentioned above

In [None]:
# Skewness of the float64 columns at this point in the notebook.
train.select_dtypes('float64').skew()

In [None]:
# Drop the identifier and free-text columns so they are not used as features.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell
# re-runnable: a second execution used to raise KeyError because the columns
# were already gone.
train = train.drop(columns=['PassengerId', 'Name'], errors='ignore')

In [None]:
# skewed_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# for col in skewed_cols:
#     train[col] = train[col].apply(np.log1p)
#     train[col] = train[col].apply(np.log1p)

In [None]:
# Separate the target column from the feature matrix.
y = train['Transported']
X = train.drop(columns='Transported')

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score, KFold

# 10-fold shuffled cross-validation of a default AdaBoost model, scored by ROC AUC.
# The estimator is now seeded as well as the splitter, so the printed score is
# reproducible across kernel restarts.
kfold = KFold(10, random_state=0, shuffle=True)
model = AdaBoostClassifier(random_state=0)

score = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc')
print(score.mean())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

# Hold-out evaluation of a 30-tree random forest, followed by the same
# cross-validated ROC AUC used for the previous model.
# random_state is set on the forest so the confusion matrix and the CV score
# are reproducible; previously only the data split was seeded.
model = RandomForestClassifier(n_estimators=30, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
model.fit(X_train, y_train)
pred = model.predict(X_test)
matrix = confusion_matrix(y_test, pred)
print(matrix)

# cross_val_score clones the estimator, so the fit above does not leak into it.
score = cross_val_score(model, X, y, cv=kfold, scoring='roc_auc')
print(score.mean())

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 for the hold-out predictions.
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# models = [
#     ('Knn', KNeighborsClassifier(15)),
#     ('svc', SVC(probability=True)),
#     ('Dtc', DecisionTreeClassifier()),
#     ('Rdf', RandomForestClassifier()),
# 	('adb', AdaBoostClassifier()),
#     ('grdb', GradientBoostingClassifier()),
#     ('Gaus', GaussianNB()),
#     ('LDA', LinearDiscriminantAnalysis()),
#     ('QDA', QuadraticDiscriminantAnalysis()),
#     ('LogR', LogisticRegression(max_iter=1000)),
#     ('MLP', MLPClassifier(max_iter=300))
# ]
# # evaluate each model in turn
# results = []
# names = []
# scoring = 'accuracy'
# for name, model in models:
#     kfold = KFold(n_splits=10, random_state=0, shuffle=True)
#     cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
# # boxplot algorithm comparison
# fig = plt.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# plt.boxplot(results)
# ax.set_xticklabels(names)
# plt.show()

## After comparing those algorithms we found that the best are:
1. RandomForestClassifier
2. AdaBoostClassifier
3. GradientBoostingClassifier
4. LinearDiscriminantAnalysis

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split
# Stacked ensemble: three tree-based base learners whose predictions are
# combined by a linear discriminant analysis meta-learner.
# Every stochastic base learner is seeded so the model that is later evaluated,
# persisted and used for the submission can be reproduced.
base_estm = [('GRD', GradientBoostingClassifier(random_state=0)),
             ('RDFC', RandomForestClassifier(random_state=0)),
             ('ADA', AdaBoostClassifier(random_state=0))]
level_estm = LinearDiscriminantAnalysis()

stacking_models = StackingClassifier(estimators=base_estm, final_estimator=level_estm)

# Rebuild the feature matrix and target from the current state of `train`.
X = train.drop('Transported', axis=1)
y = train['Transported']
# X = X[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CryoSleep', 'Deck', 'Cabin', 'Group']]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
# stacking_models.fit(X, y)

# print(stacking_models.score(X_test, y_test))
# score = stacking_models.score(X_test, y_test)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
# create feature union
# Feature union: 3 principal components side by side with the 11 features
# ranked best by the chi-squared score.
features = [
    ('pca', PCA(n_components=3)),
    ('select_best', SelectKBest(score_func=chi2, k=11)),
]
feature_union = FeatureUnion(features)

# Pipeline: feature union followed by the stacked ensemble.
estimators = [
    ('feature_union', feature_union),
    ('Stacking', stacking_models),
]
model = Pipeline(estimators)

# Evaluate the pipeline with 10-fold shuffled cross-validation (default scorer).
kfold = KFold(n_splits=10, random_state=7, shuffle=True)
results = cross_val_score(model, X, y, cv=kfold)
print(results.mean())

In [None]:
# Random 8 pca 16 k 300 n_estimators.
# GRDBC 12 pca 15 k  Without pipelines is better.
# ADAB 11 pca 15 k 
# LDA 12 pca 5 k

In [None]:
# Create two new features by splitting "PassengerId" on "_" through the string
# accessor; expand=True returns the pieces as separate columns.
passenger_id_parts = test["PassengerId"].str.split("_", expand=True)
test[["Group", "Group_Size"]] = passenger_id_parts
# test[['PassengerId', 'Group', 'Group_Size']]

In [None]:
# New feature: 1 when the passenger is the only member of their travel group.
# The previous `test['Group_Size'] == 1` was always False, because the split of
# PassengerId leaves 'Group_Size' as a string (e.g. "01"), so 'Solo' was a
# constant 0.
# NOTE(review): 'Group_Size' appears to be the passenger's number within the
# group (PassengerId looks like gggg_pp), not a head-count - confirm against the
# data description. The head-count is therefore taken from how many rows share
# the same 'Group'.
group_head_count = test['Group'].map(test['Group'].value_counts())
test['Solo'] = (group_head_count == 1).astype(int)

In [None]:
# Create three new features by splitting "Cabin" on "/" through the string
# accessor; expand=True returns the pieces as separate columns.
cabin_parts = test["Cabin"].str.split("/", expand=True)
test[["Deck", "Num", "Side"]] = cabin_parts
# test[["Cabin", "Deck", "Num", "Side"]]

In [None]:
from sklearn.preprocessing import LabelEncoder

# --- Categorical columns with a small, known set of values ------------------
# column -> (value-to-code mapping, code used for missing/unmapped values).
# Fix: the HomePlanet key was spelled 'Europe'; the value in the data is
# 'Europa' (NOTE(review): confirm with test['HomePlanet'].unique()), so those
# passengers were mapped to NaN and then silently filled in as Earth (2).
fixed_encodings = {
    'HomePlanet': ({'Europa': 1, 'Earth': 2, 'Mars': 3}, 2),
    'CryoSleep': ({True: 1, False: 0}, 0),
    'VIP': ({True: 1, False: 0}, 0),
    'Destination': ({'TRAPPIST-1e': 1, 'PSO J318.5-22': 2, '55 Cancri e': 3}, 1),
}
for col, (mapping, default_code) in fixed_encodings.items():
    test[col] = test[col].map(mapping).fillna(default_code).astype(int)

# --- Spending columns: fill missing values with the rounded column mean -----
# Fix: VRDeck used train['VRDeck'].mean() while every sibling column used the
# test frame's own mean; all five columns are now handled the same way.
for col in ['ShoppingMall', 'VRDeck', 'FoodCourt', 'Spa', 'RoomService']:
    filling_num = round(test[col].mean())
    test[col] = test[col].fillna(filling_num)

# --- Age: fill with a fixed value --------------------------------------------
# NOTE(review): 24 is hard-coded; presumably the mode of Age - confirm.
# (The previously computed but unused test['Age'].mode() was removed.)
test['Age'] = test['Age'].fillna(24)

# --- Numeric identifiers stored as strings -----------------------------------
# Convert directly instead of label-encoding: LabelEncoder ranks strings
# lexicographically ('10' < '2'), and ranks fitted on one frame do not mean the
# same thing on another. Missing values get the column's most frequent value.
for col in ['Group', 'Group_Size', 'Num']:
    as_number = pd.to_numeric(test[col], errors='coerce')
    test[col] = as_number.fillna(as_number.mode().iloc[0]).astype(int)

# --- High-cardinality / letter-coded columns: label-encode -------------------
# Fix: fill BEFORE encoding and use the scalar mode. The old code called
# fillna(Series.mode()) after transform(), which was a no-op (the encoded array
# has no NaN and a Series argument aligns on the index), so missing values were
# encoded as their own class or, depending on the scikit-learn version, raised.
# NOTE(review): these encoders are fitted on the test frame only, so the codes
# (especially for 'Cabin') are not guaranteed to match the ones the model saw
# during training - consider reusing the training encoders.
for col in ['Cabin', 'Deck', 'Side']:
    filled = test[col].fillna(test[col].mode().iloc[0])
    test[col] = LabelEncoder().fit_transform(filled).astype(int)

# 'Solo' is already a 0/1 flag; just make sure it is stored as int.
test['Solo'] = test['Solo'].astype(int)

# Dropping the Name column
test = test.drop('Name', axis=1)

In [None]:
# Keep the ids for the submission file and remove them from the feature frame.
PassengerId = test.pop('PassengerId')

In [None]:
# Fit the final pipeline on all training data and predict the test set.
# The engineered columns are added to `test` in a different order than to
# `train` (Group/Group_Size/Solo before Deck/Num/Side), so the test frame is
# re-indexed to the training column order. Without this, recent scikit-learn
# versions reject the frame for mismatched feature names and older versions
# silently feed each feature into the wrong position.
model.fit(X, y)
predictions = model.predict(test[X.columns])

In [None]:
# Generate the submission file: one row per test passenger, with the predicted
# label converted back to a boolean.
GBCsubmission = pd.DataFrame({
    'PassengerId': PassengerId,
    'Transported': predictions,
}).astype({'Transported': bool})
GBCsubmission.to_csv("best_score.csv", index=False)

In [None]:
from joblib import load, dump

# Persist the fitted pipeline to disk with joblib.
file_name = 'Titanic_Spaceship_Best_Stacking_models.sav'
dump(value=model, filename=file_name)

In [None]:
# Reload the persisted pipeline and check that it still scores the data.
# NOTE: joblib.load unpickles the file, so only load files you created yourself.
loaded_model = load(file_name)
# Scored on (X, y), the same data the model was fitted on, so this is a
# round-trip sanity check rather than an estimate of generalisation.
result = loaded_model.score(X, y)
print("the score of the loaded model is ", result)