In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer 
import warnings
warnings.simplefilter('ignore')
from sklearn.preprocessing import StandardScaler , MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import  f1_score, auc, precision_recall_curve, recall_score , precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
pd.options.display.max_rows = 4000

In [None]:
# Read the labelled training data and the unlabelled test data from CSV files
# located in the current working directory.
df_base_titanic = pd.read_csv('train.csv')
df_base_test =pd.read_csv('test.csv')
# Preview the first rows of the training data.
df_base_titanic.head()

In [None]:
# Preview the first rows of the test data.
df_base_test.head()

### Exploring the dataset

In [None]:
# Report the size of the training data as "<rows> X <columns>".
n_rows, n_cols = df_base_titanic.shape
print(n_rows, 'X', n_cols)

The dataset has 8693 rows and 14 columns.

In [None]:
# Column names, dtypes and non-null counts of the training data.
df_base_titanic.info()

We notice several categorical features, mainly referring to services, which can be transformed into dummy variables in order to analyze how having used a service affects whether or not a passenger was transported.

In [None]:
# Summary statistics of the numeric columns of the training data.
df_base_titanic.describe()

Some features only show values in the last quartile.
It would also be interesting to repeat this analysis for the categorical features once they are encoded as dummies.

### Analyze how much of the base is missing values

In [None]:
# Number of missing values per column of the training data.
df_base_titanic.isna().sum()

In [None]:
# Improving this insight: how much of each feature is missing, as a count and
# as a percentage of all rows.
nan_count = df_base_titanic.isna().sum()

df_nan = pd.DataFrame({
    "NaN count": nan_count,
    "NaN ratio": (100 * nan_count / df_base_titanic.shape[0]).round(2),
}).rename_axis("Column")

# Sort while the ratio is still numeric; sorting the formatted "x.xx%" strings
# would order them alphabetically instead of by size.
df_nan = df_nan.sort_values("NaN ratio", ascending=False)
df_nan["NaN ratio"] = df_nan["NaN ratio"].astype(str) + '%'
df_nan.astype(object).T

Each feature has about 2.5% of its values missing (NaN), which is a low percentage.

Only the PassengerId and Transported features do not have any NaN values.

### Analyze if the base is unbalanced

In [None]:
# Relative frequency of each value of the target column.
df_base_titanic['Transported'].value_counts(normalize = True)

The percentage of transported and not transported is almost equal. So we don't have an imbalance problem here

In [None]:
# Encode the target as integers: 1 where Transported equals True, 0 otherwise.
is_transported = df_base_titanic['Transported'] == True
df_base_titanic['Transported'] = np.where(is_transported, 1, 0)

### Analyze features and handle missing data

#1 HomePlanet

In [None]:
# Left: share of each HomePlanet value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(20,6))
fig.suptitle('Origen Distribution', size = 20, weight='bold')

sizes = df_base_titanic['HomePlanet'].value_counts(sort=False)

# Take the labels from the counts' own index so that every wedge is named after
# the category it measures; unique() is not guaranteed to follow the same order.
labels = sizes.index
colors = ['#099FFF', '#CC00FF', '#13CA91']
explode = (0.05,0.05,0.05)

ax[0].pie(sizes, colors=colors, explode=explode, startangle=90, labels=labels,
       autopct='%1.2f%%', pctdistance=0.6,textprops={'fontsize':12})
sns.countplot(x='HomePlanet', data=df_base_titanic, hue='Transported', ax=ax[1])

As verified above, each feature has few missing values (approximately 2.5%), so for categorical variables we will fill the missing entries with the most frequent value.
For this we will use scikit-learn's SimpleImputer class with the parameter strategy='most_frequent'.

In [None]:
# Imputer that replaces NaN with the most frequent value of the column it is
# fitted on; it is re-fitted for every feature handled below.
most_imputer = SimpleImputer(missing_values = np.nan,
                            strategy = 'most_frequent')

In [None]:
# Learn the most frequent HomePlanet from the training data only and fill the
# test set with that same value, so both sets are imputed consistently.
df_base_titanic.HomePlanet = most_imputer.fit_transform(df_base_titanic[['HomePlanet']]).ravel()
df_base_test.HomePlanet = most_imputer.transform(df_base_test[['HomePlanet']]).ravel()

#2 CryoSleep

In [None]:
# Left: share of each CryoSleep value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(20,6))
fig.suptitle('CryoSleep Distribution', size = 20, weight='bold')

sizes = df_base_titanic['CryoSleep'].value_counts()

# value_counts() orders by frequency while unique() orders by first appearance;
# reading the labels from the counts' index keeps wedges and labels paired.
labels = sizes.index
colors = ['#099FFF', '#CC00FF']

ax[0].pie(sizes, colors=colors,  startangle=90, labels=labels,
       autopct='%1.2f%%', pctdistance=0.6,textprops={'fontsize':12})
sns.countplot(x='CryoSleep', data=df_base_titanic, hue='Transported', ax=ax[1])

In [None]:
# Learn the most frequent CryoSleep value from the training data only and fill
# the test set with that same value.
df_base_titanic.CryoSleep = most_imputer.fit_transform(df_base_titanic[['CryoSleep']]).ravel()
df_base_test.CryoSleep = most_imputer.transform(df_base_test[['CryoSleep']]).ravel()

In [None]:
# Encode CryoSleep as 1 (equals True) or 0 (anything else) in both datasets.
for frame in (df_base_titanic, df_base_test):
    frame['CryoSleep'] = np.where(frame['CryoSleep'] == True, 1, 0)


#3 Cabin

We need to handle the Cabin feature. The documentation states that the cabin number takes the form deck/num/side, where side can be P for port or S for starboard. Let's create new features from this information.

In [None]:
# Inspect the raw Cabin values.
df_base_titanic['Cabin']

In [None]:
# We can't assume a cabin for passengers without cabin information, so the
# missing values are replaced by the placeholder 0 (no rows are removed here).
df_base_titanic['Cabin'] = df_base_titanic['Cabin'].fillna(0)
df_base_test['Cabin'] = df_base_test['Cabin'].fillna(0)

In [None]:
# Cast Cabin to string in both datasets so the .str accessor can be used on it.
for frame in (df_base_titanic, df_base_test):
    frame['Cabin'] = frame['Cabin'].astype(str)

In [None]:
# Check dtypes and non-null counts after the Cabin conversion.
df_base_titanic.info()

In [None]:
# Split every "deck/num/side" cabin string into three columns; values that do
# not match the pattern produce NaN in all three.
cabin_pattern = "(.*?)/(.*?)/(.)"

df_cabines = df_base_titanic["Cabin"].str.extract(cabin_pattern)
df_cabines1 = df_base_test["Cabin"].str.extract(cabin_pattern)

df_cabines.columns = ["Deck", "Num", "Side"]
df_cabines1.columns = ["Deck", "Num", "Side"]

In [None]:
# Attach the passenger key and keep only rows whose cabin could be parsed.
df_cabines = df_cabines.assign(PassengerId=df_base_titanic["PassengerId"]).dropna()
df_cabines.head(4)

In [None]:
# Same for the test set: attach the passenger key and drop unparsed cabins.
df_cabines1 = df_cabines1.assign(PassengerId=df_base_test["PassengerId"]).dropna()

In [None]:
# Left-join the parsed cabin columns onto the test data (passengers without a
# parsed cabin get NaN) and discard the raw Cabin column.
df_base_test = (
    df_base_test
    .merge(df_cabines1, how='left')
    .drop(['Cabin'], axis=1)
)

In [None]:
# Left-join the parsed cabin columns onto the training data (passengers without
# a parsed cabin get NaN) and discard the raw Cabin column.
df_base_titanic = (
    df_base_titanic
    .merge(df_cabines, how='left')
    .drop(['Cabin'], axis=1)
)
df_base_titanic

In [None]:
# Left: share of each Deck value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(22,6))
fig.suptitle('Deck Distribution', size = 20, weight='bold')

sizes = df_base_titanic['Deck'].value_counts()

# value_counts() orders by frequency while unique() orders by first appearance;
# reading the labels from the counts' index keeps wedges and labels paired.
labels = sizes.index

ax[0].pie(sizes,  startangle=90, labels=labels,
       autopct='%1.1f%%', pctdistance=0.5,textprops={'fontsize':10})
sns.countplot(x='Deck', data=df_base_titanic, hue='Transported', ax=ax[1])

In [None]:
# Left: share of each Side value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(20,6))
fig.suptitle('Side Distribution', size = 20, weight='bold')

sizes = df_base_titanic['Side'].value_counts()

# value_counts() orders by frequency while unique() orders by first appearance;
# reading the labels from the counts' index keeps wedges and labels paired.
labels = sizes.index

ax[0].pie(sizes,  startangle=90, labels=labels,
       autopct='%1.2f%%', pctdistance=0.6,textprops={'fontsize':12})
sns.countplot(x='Side', data=df_base_titanic, hue='Transported', ax=ax[1])

#4 Destination

In [None]:
# Number of passengers per destination.
df_base_titanic.Destination.value_counts()

In [None]:
# Left: share of each Destination value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(20,6))
fig.suptitle('Destination Distribution', size = 20, weight='bold')

sizes = df_base_titanic['Destination'].value_counts()

# value_counts() orders by frequency while unique() orders by first appearance;
# reading the labels from the counts' index keeps wedges and labels paired.
labels = sizes.index

ax[0].pie(sizes,  startangle=90, labels=labels,
       autopct='%1.2f%%', pctdistance=0.6,textprops={'fontsize':12})
sns.countplot(x='Destination', data=df_base_titanic, hue='Transported', ax=ax[1])

In [None]:
# Learn the most frequent Destination from the training data only and fill the
# test set with that same value.
df_base_titanic.Destination = most_imputer.fit_transform(df_base_titanic[['Destination']]).ravel()
df_base_test.Destination = most_imputer.transform(df_base_test[['Destination']]).ravel()

#5 VIP

In [None]:
# Left: share of each VIP value. Right: counts split by the target.
fig, ax = plt.subplots(1, 2, figsize=(20,6))
fig.suptitle('VIP Distribution', size = 20, weight='bold')

sizes = df_base_titanic['VIP'].value_counts()

# value_counts() orders by frequency while unique() orders by first appearance;
# reading the labels from the counts' index keeps wedges and labels paired.
labels = sizes.index
colors = ['#099FFF',  '#13CA91' ]
explode = (0.05,0.05)

ax[0].pie(sizes, colors=colors, explode=explode, startangle=90, labels=labels,
       autopct='%1.2f%%', pctdistance=0.6,textprops={'fontsize':12})
sns.countplot(x='VIP', data=df_base_titanic, hue='Transported', ax=ax[1])

1 - The vast majority of passengers were not in the VIP category.
2 - There is little difference in the probability of being transported between passengers who are VIPs and those who are not.

In [None]:
# Learn the most frequent VIP value from the training data only and fill the
# test set with that same value.
df_base_titanic.VIP = most_imputer.fit_transform(df_base_titanic[['VIP']]).ravel()
df_base_test.VIP = most_imputer.transform(df_base_test[['VIP']]).ravel()

In [None]:
# Encode VIP as 1 (equals True) or 0 (anything else) in both datasets.
for frame in (df_base_titanic, df_base_test):
    frame['VIP'] = np.where(frame['VIP'] == True, 1, 0)

#6 PassengerId

In [None]:
def create_group_id(passenger_id):
    """Return the travel-group part of a passenger id.

    A passenger id is expected to look like ``"<group>_<number>"``; the group
    is the part before the underscore and the part after it is the
    passenger's number within that group.

    Parameters
    ----------
    passenger_id : str
        Passenger id containing an underscore separator.

    Returns
    -------
    str
        The group part, still as a string (callers cast it to int).
    """
    # Index 0 is the group; index 1 would be the number inside the group.
    group_id = passenger_id.split("_")[0]
    return group_id

In [None]:
# Derive an integer group_id feature from PassengerId for the training data.
df_base_titanic["group_id"] = (
    df_base_titanic["PassengerId"].apply(create_group_id).astype(int)
)

In [None]:
# Derive an integer group_id feature from PassengerId for the test data.
df_base_test["group_id"] = (
    df_base_test["PassengerId"].apply(create_group_id).astype(int)
)

### handling numeric variables with fit_transform

In [None]:
# Applying SimpleImputer to fill the NaN in numeric variables.
# The imputer computes one most-frequent value per column. It is fitted on the
# training data only and those fitted values are reused for the test set, so
# both datasets are filled with the same statistics.
numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

df_base_titanic[numeric_cols] = most_imputer.fit_transform(df_base_titanic[numeric_cols])
df_base_test[numeric_cols] = most_imputer.transform(df_base_test[numeric_cols])

In [None]:
# One-hot encode the categorical features of both datasets.
categorical_cols = ['HomePlanet' , 'Destination' , 'Deck' , 'Side']
df_base_titanic = pd.get_dummies(df_base_titanic , columns = categorical_cols)
df_base_test = pd.get_dummies(df_base_test , columns = categorical_cols)

# Each call only creates dummies for the categories present in its own data.
# Align the test columns with the training columns (everything except the
# target): a category missing from the test set becomes an all-zero column and
# one unseen during training is dropped, so the model gets the features it was
# fitted on, in the same order.
feature_cols = [col for col in df_base_titanic.columns if col != 'Transported']
df_base_test = df_base_test.reindex(columns=feature_cols, fill_value=0)


In [None]:
# Preview the training data after one-hot encoding.
df_base_titanic.head()

In [None]:
# Name is not used as a feature; remove it from both datasets.
df_base_test = df_base_test.drop(columns=['Name'])
df_base_titanic = df_base_titanic.drop(columns=['Name'])

In [None]:
# Correlation heatmap of the features. The frame still holds text columns at
# this point (PassengerId is a string id), and a correlation cannot be computed
# for those, so only numeric and boolean columns are used.
corr = df_base_titanic.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(corr,
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values)

In [None]:
# PassengerId and Num are not used as training features.
df_base_titanic = df_base_titanic.drop(columns=['PassengerId', 'Num'])

In [None]:
# Num is not used as a feature; PassengerId stays for the submission file.
df_base_test = df_base_test.drop(columns=['Num'])

### Train and Test Division 

In [None]:
# NOTE: train_test_split and StandardScaler are already imported in the first
# cell; these repeated imports are redundant.
from sklearn.model_selection import train_test_split
# import warnings
# Silence all warnings from here on.
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler

In [None]:
# Hold out half of the labelled data for evaluation; fixed seed for a
# reproducible split.
df_train, df_test = train_test_split(df_base_titanic, test_size=0.5, random_state=0)

In [None]:
# Separate the target column from the features in both splits.
y_train = df_train['Transported']
X_train = df_train.drop('Transported', axis=1)

y_test = df_test['Transported']
X_test = df_test.drop('Transported', axis=1)

In [None]:
# Scale the numeric features to the [0, 1] range.
# The scaler is fitted on the training split only. The same fitted
# transformation is then applied to the hold-out split and to the submission
# data: the models are trained on scaled values, so every dataset they predict
# on must be on that same scale.
numeric_features = ['Age' , 'RoomService' , 'FoodCourt' , 'ShoppingMall' , 'Spa', 'VRDeck']

scaler = MinMaxScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])
df_base_test[numeric_features] = scaler.transform(df_base_test[numeric_features])

Testing some models

In [None]:
# Decision tree with Gini impurity; a fixed seed makes the fitted tree (and the
# scores reported below) reproducible between runs.
clf = DecisionTreeClassifier(criterion="gini", random_state=0)

In [None]:
# Fit the decision tree on the training split.
model1 = clf.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation on the training split; show the mean score.
allScores = cross_val_score(clf, X_train, y_train , cv=10)
allScores.mean() 

In [None]:
 y_pred = clf.predict(X_test)
print('f1_score' ,round(f1_score(y_test, y_pred, average="macro"),4))
print('precision_score',round(precision_score(y_test, y_pred, average="macro"),4))
print('recall_score', round(recall_score(y_test, y_pred, average="macro"),4)) 

In [None]:
# Table of the decision tree's feature importances, largest first.
data = pd.DataFrame(clf.feature_importances_, index=list(X_train.columns)).reset_index()
importancia = data.rename(columns={'index': 'features', 0: 'importance%'})

importancia.sort_values(by='importance%', ascending=False)

In [None]:
#RandomForestClassifier

In [None]:
# Random forest limited to depth 24; a fixed seed makes the fitted forest (and
# the submission built from it) reproducible between runs.
clf_2 = RandomForestClassifier(max_depth=24, random_state=0)

In [None]:
# Fit the random forest on the training split.
model2 = clf_2.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation on the training split; show the mean score.
allScores = cross_val_score(clf_2, X_train, y_train , cv=10)
allScores.mean() 

In [None]:
# Evaluate the random forest on the hold-out split with macro-averaged metrics.
y_pred2 = clf_2.predict(X_test)
for label, metric in (('f1_score', f1_score),
                      ('precision_score', precision_score),
                      ('recall_score', recall_score)):
    print(label, round(metric(y_test, y_pred2, average="macro"), 4))

In [None]:
#KNeighborsClassifier() 

In [None]:
# k-nearest-neighbours classifier with the library's default settings.
clf_3 = KNeighborsClassifier() 

In [None]:
# Fit the k-nearest-neighbours classifier on the training split.
model3 = clf_3.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation on the training split; show the mean score.
allScores = cross_val_score(clf_3, X_train, y_train , cv=10)
allScores.mean()

In [None]:
# Evaluate k-nearest-neighbours on the hold-out split with macro-averaged metrics.
y_pred3 = clf_3.predict(X_test)
for label, metric in (('f1_score', f1_score),
                      ('precision_score', precision_score),
                      ('recall_score', recall_score)):
    print(label, round(metric(y_test, y_pred3, average="macro"), 4))

In [None]:
#LogisticRegression

In [None]:
# Logistic regression with a raised iteration cap and a fixed seed.
clf_4 = LogisticRegression(max_iter=5000,  random_state=123)

In [None]:
# Fit the logistic regression on the training split.
model4 = clf_4.fit(X_train, y_train)

In [None]:
# 10-fold cross-validation on the training split.
allScores = cross_val_score(clf_4, X_train, y_train , cv=10)
allScores.mean() # take the mean of the scores

In [None]:
# Evaluate logistic regression on the hold-out split with macro-averaged metrics.
y_pred4 = clf_4.predict(X_test)
for label, metric in (('f1_score', f1_score),
                      ('precision_score', precision_score),
                      ('recall_score', recall_score)):
    print(label, round(metric(y_test, y_pred4, average="macro"), 4))

In [None]:
# Best Model : Random Forest

In [None]:
#Submission

In [None]:
# Work on a copy so df_base_test keeps PassengerId for the submission file.
df_base_test1 = df_base_test.copy()

In [None]:
# PassengerId is an identifier, not a model feature.
df_base_test1 = df_base_test1.drop(columns=['PassengerId'])

In [None]:
# Predict the target for the test data with the fitted random forest (model2).
predictions = model2.predict(df_base_test1)

In [None]:
# Pair every test passenger id with its predicted label.
output = pd.DataFrame({'PassengerId': df_base_test.PassengerId, 
                       'Transported': predictions})

In [None]:
# Convert the 0/1 predictions back to booleans (True where the value is 1).
predicted_transported = output['Transported'] == 1
output['Transported'] = np.where(predicted_transported, True, False)

In [None]:
# Write the submission file without the DataFrame index.
output.to_csv('submission.csv', index=False)