In [None]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style

# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Load the training and test sets
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Quick look at the training frame: schema, summary statistics, first rows
train_df.info()
print(train_df.describe())
print(train_df.head(10))

# Missing values per column: absolute count and percentage of rows,
# both sorted from most to least affected
null_counts = train_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
percent_1 = null_counts / len(train_df) * 100
percent_2 = percent_1.round(1).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
print(missing_data.head(12))

In [None]:
# Some visualizations: share of each Survived value, one bar chart per sex
for sex, bar_color, plot_title in (
    ('male', '#F8BA00', 'Male Survived'),
    ('female', '#FA0000', 'Female Survived'),
):
    survived_share = train_df.loc[train_df.Sex == sex, 'Survived'].value_counts(normalize=True)
    survived_share.plot(kind='bar', alpha=0.5, color=bar_color)
    plt.title(plot_title)
    plt.show()

# Age histograms of survivors vs. non-survivors, one panel per sex.
# The histograms are drawn with Matplotlib's Axes.hist instead of
# seaborn.distplot: distplot is deprecated, and the blanket
# warnings.filterwarnings("ignore") at the top of the notebook hides that
# deprecation warning. alpha=0.4 and raw counts match what
# distplot(kde=False) produced.
survived = 'survived'
not_survived = 'not survived'
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))
women = train_df[train_df['Sex'] == 'female']
men = train_df[train_df['Sex'] == 'male']

for ax, group, panel_title in ((axes[0], women, 'Female'), (axes[1], men, 'Male')):
    survivor_ages = group.loc[group['Survived'] == 1, 'Age'].dropna()
    victim_ages = group.loc[group['Survived'] == 0, 'Age'].dropna()
    # 18 bins for survivors and 40 bins for non-survivors
    ax.hist(survivor_ages, bins=18, alpha=0.4, label=survived)
    ax.hist(victim_ages, bins=40, alpha=0.4, label=not_survived)
    ax.set_xlabel('Age')
    ax.set_ylabel('Count')
    ax.legend()
    ax.set_title(panel_title)
plt.show()

# Age histograms on a facet grid: one row per Pclass, one column per Survived value
hist_style = dict(alpha=.5, bins=20, color='green')
grid = sns.FacetGrid(data=train_df, row='Pclass', col='Survived', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', **hist_style)
grid.add_legend()
plt.show()

# Kernel density estimate of Age for each passenger class, overlaid on one axes
for i in (1, 2, 3):
    class_ages = train_df.loc[train_df.Pclass == i, 'Age']
    class_ages.plot(kind='kde', alpha=0.8, label=str(i))
plt.title("Class vs Age")
plt.legend()
plt.show()

# Relative frequency of each embarkation port and of each passenger class
for column, palette, plot_title in (
    ('Embarked', ['#FA0000', '#F8BA00', '#1AB385'], 'Embarked Distributed'),
    ('Pclass', ['#852828', '#503226', '#34495E'], 'Passenger Class'),
):
    shares = train_df[column].value_counts(normalize=True)
    shares.plot(kind='bar', alpha=0.7, color=palette)
    plt.title(plot_title)
    plt.show()

# Correlation heatmap of the numeric features.
# Only numeric columns are passed to corr(): at this point the frame still
# contains text columns (e.g. Sex holds 'male'/'female'), and pandas >= 2.0
# raises on non-numeric data instead of silently skipping it.
numeric_features = train_df.select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), annot=True)
plt.title('Correlation between features')
plt.show()

In [None]:
# Drop unnecessary features.
# PassengerId is removed from the training frame only; the test frame keeps it here.
train_df = train_df.drop(columns=['PassengerId', 'Cabin', 'Ticket'])
test_df = test_df.drop(columns=['Cabin', 'Ticket'])

In [None]:
# Fill the null values of the Age feature with random integer ages drawn
# uniformly from [mean - std, mean + std), then cast the column to int.
data = [train_df, test_df]

# Seeded generator so that Restart & Run All reproduces the same imputed ages.
AGE_IMPUTE_SEED = 42
age_rng = np.random.default_rng(AGE_IMPUTE_SEED)

# Both statistics come from the training ages and are computed once, before
# any value is filled in, so neither the test set nor already-imputed values
# influence the sampling range.
mean = train_df["Age"].mean()
std = train_df["Age"].std()
age_low, age_high = int(mean - std), int(mean + std)

for dataset in data:
    null_mask = dataset["Age"].isnull()
    is_null = int(null_mask.sum())
    rand_age = age_rng.integers(age_low, age_high, size=is_null)
    age_slice = dataset["Age"].copy()
    age_slice[null_mask] = rand_age
    # Cast this frame's own ages; assigning train_df["Age"] here would
    # overwrite the test set's ages with the training set's.
    dataset["Age"] = age_slice.astype(int)

In [None]:
# Fill the null values of Embarked with 'S'
for dataset in data:
    dataset.loc[dataset['Embarked'].isnull(), 'Embarked'] = 'S'

In [None]:
# Convert the Fare column to int: missing fares become 0, all others are truncated.
# NOTE(review): this truncation runs before the Fare grouping step, whose cut
# points (7.91, 14.454) are fractional -- confirm that binning truncated fares is intended.
for dataset in data:
    dataset['Fare'] = dataset['Fare'].fillna(0).astype(int)

In [None]:
# Convert the Name column into an integer title code stored in P_Name.
titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# Uncommon titles that are collapsed into the single "Rare" category
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
# Spelling variants folded into their common title
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}

for dataset in data:
    # Extract the title: the run of letters that follows a space and ends with a dot.
    # Raw string, so `\.` is passed to the regex engine as written rather than
    # being treated as an (invalid) Python string escape.
    extracted = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    # Replace titles with a more common title or with "Rare"
    normalised = extracted.replace(rare_titles, 'Rare').replace(title_aliases)
    # Convert titles into numbers; anything not in `titles` becomes 0.
    # The final cast keeps the column integer-typed even when some titles were unmapped.
    dataset['P_Name'] = normalised.map(titles).fillna(0).astype(int)

In [None]:
# Drop the Name column from both frames
train_df, test_df = (frame.drop(columns=['Name']) for frame in (train_df, test_df))

In [None]:
# Convert the Sex column to numeric codes (male -> 0, female -> 1)
genders = dict(male=0, female=1)
# Rebuild the list so it points at the current train/test frames
data = [train_df, test_df]
for dataset in data:
    sex_codes = dataset.Sex.map(genders)
    dataset['Sex'] = sex_codes

In [None]:
# Convert the Embarked column to numeric codes (S -> 0, C -> 1, Q -> 2)
values = dict(S=0, C=1, Q=2)
data = [train_df, test_df]
for dataset in data:
    port_codes = dataset.Embarked.map(values)
    dataset['Embarked'] = port_codes

In [None]:
# Grouping the Age column into ordinal bands 0..6.
# Right-closed intervals: <=11 -> 0, (11, 18] -> 1, (18, 22] -> 2, (22, 27] -> 3,
# (27, 33] -> 4, (33, 40] -> 5, everything above 40 -> 6.
# Ages above 66 carry the same label (6) as the 41-66 band, so a single
# open-ended top band covers both.
age_edges = [-np.inf, 11, 18, 22, 27, 33, 40, np.inf]
for dataset in data:
    age_as_int = dataset['Age'].astype(int)
    dataset['Age'] = pd.cut(age_as_int, bins=age_edges, labels=False).astype(int)

In [None]:
# Grouping the Fare column into ordinal bands 0..5.
# Right-closed intervals: <=7.91 -> 0, (7.91, 14.454] -> 1, (14.454, 31] -> 2,
# (31, 99] -> 3, (99, 250] -> 4, above 250 -> 5.
data = [train_df, test_df]
fare_edges = [-np.inf, 7.91, 14.454, 31, 99, 250, np.inf]

for dataset in data:
    fare_band = pd.cut(dataset['Fare'], bins=fare_edges, labels=False)
    dataset['Fare'] = fare_band.astype(int)

# Inspect the transformed training frame
train_df.info()
print(train_df.head(10))

In [None]:
# Prepare the data: split the training frame into features and target,
# and remove the identifier column from the test features.
Y_train = train_df["Survived"]
X_train = train_df.drop(columns=["Survived"])
X_test = test_df.drop(columns=["PassengerId"]).copy()

In [None]:
# ------------ Feature Importances ---------------
# Fit a random forest on the training data and rank the features by the
# forest's importance scores. The forest is seeded so that the ranking and
# the plot are the same on every run.
FEATURE_IMPORTANCE_SEED = 42

FI = RandomForestClassifier(random_state=FEATURE_IMPORTANCE_SEED)
FI = FI.fit(X_train, Y_train)
importances = FI.feature_importances_
# Spread of each feature's importance across the individual trees (used as error bars)
std = np.std([tree.feature_importances_ for tree in FI.estimators_], axis=0)
# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]
n_features = X_train.shape[1]
ranked_columns = X_train.columns[indices]

print("Feature ranking:")
for column_name, score in zip(ranked_columns, importances[indices]):
    print("%s: (%f)" % (column_name, score))

# Plot the feature importances
plt.figure(1, figsize=(14, 13))
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices], color="g", yerr=std[indices], align="center")
plt.xticks(range(n_features), ranked_columns, rotation=90)
plt.xlim([-1, n_features])
plt.show()