In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# **Data preparation**

In [None]:
# Load the Titanic train/test CSV files and preview the first training rows.
trainingData = pd.read_csv("/kaggle/input/titanic/train.csv")
testingData = pd.read_csv("/kaggle/input/titanic/test.csv")
trainingData.head()



# **Understanding the data**

In [None]:
# Overall share of passengers in the training set who survived, in percent.
print(trainingData['Survived'].sum() / len(trainingData['Survived']) * 100)

I want to know how survival is distributed between male and female passengers.

In [None]:
# Survival flags for each sex, selected with a single row/column .loc lookup.
men = trainingData.loc[trainingData['Sex'] == 'male', 'Survived']
women = trainingData.loc[trainingData['Sex'] == 'female', 'Survived']

# Share of survivors within each group, in percent.
MenPercentageSurvived = men.sum() / len(men) * 100
WomenPercentageSurvived = women.sum() / len(women) * 100

print("% of men who survived:", MenPercentageSurvived)
print("% of women who survived:", WomenPercentageSurvived)

# Bar height is the mean of Survived (i.e. the survival rate) per sex.
sns.barplot(data=trainingData, x='Sex', y='Survived')

From the two cells above, I conclude that women had a higher survival rate than men.

In [None]:
# Summary statistics of the training frame (describe() covers numeric columns by default).
trainingData.describe()

Here I found that the maximum age is far above the 75th percentile of the age data, which suggests there are outliers in the Age column.

In [None]:
# Column dtypes and non-null counts; a count below the row total means missing values.
trainingData.info()


From the info output I found that there are null values, so I had better fill them rather than build a model on data containing nulls.

The non-null counts show that certain columns have missing values.

In [None]:
# Iterating a DataFrame yields its column labels, one per line here.
for col in trainingData:
    print(col)

Here I listed the features (columns) of the data; next I check which of them contain null values.

# **Preprocessing: How I detected and handled the missing values**

In [None]:
# Labels of the columns that hold at least one null value ...
missingColumns = trainingData.columns[trainingData.isna().any()]
# ... followed by the number of nulls in every column (displayed as the cell output).
trainingData.isna().sum()

In [None]:
# Display the labels of the training columns that contain null values.
missingColumns

Here I get the columns with missing values, which I will either fill using a statistical function or drop.

Since Cabin is missing 687 of its 891 values (about 77%), I will drop the column.


In [None]:
# Remove the mostly-empty Cabin column.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# makes a second run of this cell a no-op rather than a KeyError, so the cell is
# safe to re-run.
trainingData = trainingData.drop(columns='Cabin', errors='ignore')

In [None]:
# Write the median age into the rows where Age is missing.
trainingData.loc[trainingData['Age'].isna(), 'Age'] = trainingData['Age'].median()

Here I filled the missing values of the Age column with the median, following the conclusion from the "Understanding the data" section: there are outliers, so the median is a better choice than the mean.

Because the embarkation ports are stored as characters, which caused an error, I convert their values to numbers so that I can fill the missing values using a statistical method.

In [None]:
# Encode the embarkation port as a number: S -> 0, Q -> 1, C -> 2.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the trainingData['Embarked'] selection: that
# chained in-place form is deprecated and leaves the frame untouched once
# pandas Copy-on-Write is active.
# infer_objects() converts the replaced values to a numeric dtype where
# possible, without relying on replace() to downcast implicitly; missing ports
# stay NaN.
trainingData['Embarked'] = (
    trainingData['Embarked']
    .replace({'S': 0, 'Q': 1, 'C': 2})
    .infer_objects()
)

In [None]:
# Fill the missing embarkation codes with the mean of the existing codes.
# NOTE(review): the codes are arbitrary labels, so their mean is generally not
# one of the valid codes — consider the most frequent port (mode) instead.
trainingData['Embarked'] = trainingData['Embarked'].fillna(
    value=trainingData['Embarked'].mean()
)



Here I found the mean to be the most suitable.

In [None]:
# Count the nulls per column again to verify the training frame after imputation.
trainingData.isnull().sum()

I make sure that there are no null values left in my data.

# **Model Building: I want to get the relationships between my features**

In [None]:
# Encode sex as a number: female -> 0, male -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the trainingData['Sex'] selection, because that
# chained in-place form is deprecated and does not modify the frame under
# pandas Copy-on-Write. infer_objects() converts the replaced values to a
# numeric dtype where possible.
trainingData['Sex'] = (
    trainingData['Sex']
    .replace({'female': 0, 'male': 1})
    .infer_objects()
)
trainingData.head()

Now all the data that I will use in the correlation matrix is numeric.

In [None]:
# Correlate the numeric columns only. The frame still holds text columns, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0 (older versions
# dropped such columns silently). select_dtypes gives the same result on every
# pandas version.
corrMatrix = trainingData.select_dtypes(include='number').corr()

# Annotated heatmap of the pairwise correlations.
plt.figure(figsize=(15, 7))
sns.heatmap(corrMatrix, linewidths=3, annot=True)
plt.show()

I used the correlation matrix to choose the variables most strongly related to survival and to eliminate weak ones (low correlation values). In the heatmap, the Survived row shows the strongest correlations with **Pclass, Sex, Fare**, so I pick these features to predict with in my model.

In [None]:
# Survival rate per passenger class, split by sex (hue 0 = female, 1 = male).
sns.barplot(data=trainingData, x='Pclass', y='Survived', hue='Sex')


Here, women in class 1 had a higher survival rate than women in the other classes.

# **Understanding the testing data**

In [None]:
# Preview the first rows of the test frame.
testingData.head()

In [None]:
# Summary statistics of the test frame (describe() covers numeric columns by default).
testingData.describe()


In [None]:
# Column dtypes and non-null counts of the test frame, to spot missing values.
testingData.info()

As there are missing values in the testing data, I will fill them.

# **Handling the missing values that appeared in the investigation of the testing data**

In [None]:
# Encode sex in the test frame as a number: female -> 0, male -> 1.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the testingData['Sex'] selection, because that
# chained in-place form is deprecated and does not modify the frame under
# pandas Copy-on-Write. infer_objects() converts the replaced values to a
# numeric dtype where possible.
testingData['Sex'] = (
    testingData['Sex']
    .replace({'female': 0, 'male': 1})
    .infer_objects()
)
testingData.head()

In [None]:
# Labels of the test columns that hold at least one null value ...
missingTestingColumns = testingData.columns[testingData.isna().any()]
# ... followed by the number of nulls in every column (displayed as the cell output).
testingData.isna().sum()

As with the training data, I handle the missing values in the testing data.

In [None]:
# Fill the missing test ages with the median test age.
# Assigning the result back (the same form used for the training frame) replaces
# fillna(..., inplace=True) on a column selection, which is deprecated and has
# no effect on the frame under pandas Copy-on-Write.
testingData['Age'] = testingData['Age'].fillna(testingData['Age'].median())

In [None]:
# Refresh the list of test columns that still contain nulls ...
missingTestingColumns = testingData.columns[testingData.isna().any()]
# ... and display the per-column null counts.
testingData.isna().sum()

In [None]:
# Fill the missing test fares with the median test fare.
# Assigning the result back replaces fillna(..., inplace=True) on a column
# selection, which is deprecated and has no effect on the frame under pandas
# Copy-on-Write.
testingData['Fare'] = testingData['Fare'].fillna(testingData['Fare'].median())

In [None]:
# Refresh the list of test columns that still contain nulls ...
missingTestingColumns = testingData.columns[testingData.isna().any()]
# ... and display the per-column null counts.
testingData.isna().sum()

In [None]:
# Remove the Cabin column from the test frame as well.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell safe
# to re-run: a second run is a no-op rather than a KeyError.
testingData = testingData.drop(columns='Cabin', errors='ignore')

In [None]:
# Refresh the list of test columns that still contain nulls ...
missingTestingColumns = testingData.columns[testingData.isna().any()]
# ... and display the per-column null counts.
testingData.isna().sum()

In [None]:

# Preview the test frame after the encoding and imputation steps.
testingData.head()

# **Model Evaluation**

In the model evaluation section I want to find the best classifier among different models using k-fold cross-validation (which also helps guard against overfitting).

In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Predictors picked from the correlation heatmap.
features = ["Pclass", "Sex", "Fare"]

# Design matrices for both splits plus the training target.
X_train = trainingData.loc[:, features]
Y_train = trainingData.loc[:, 'Survived']
X_test = testingData.loc[:, features]

# Sanity-check the dimensions of each piece.
print("X_train shape: ", X_train.shape)
print("Y_train shape: ", Y_train.shape)
print("X_test shape: ", X_test.shape)



In [None]:
# First ten rows of the training feature matrix.
X_train.head(10)

In [None]:
# First ten values of the training target.
Y_train.head(10)

Here I look at a sample of X_train and Y_train.

In [None]:
from sklearn.model_selection import KFold, cross_val_score

# Ten shuffled folds with a fixed seed, so the same splitter can be reused for each model.
k_fold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=0,
)

I partition the original dataset into k equal-sized folds, choosing k = 10.

I then use cross-validation to evaluate 3 different classification models and see which has the highest mean score.

In [None]:
# k-nearest-neighbours (k = 13), scored by accuracy on the shared folds.
scoring = 'accuracy'
clf = KNeighborsClassifier(n_neighbors=13)
score = cross_val_score(
    estimator=clf, X=X_train, y=Y_train,
    scoring=scoring, cv=k_fold, n_jobs=1,
)
print(score)
# Mean accuracy over the folds, as a percentage with two decimals.
round(score.mean() * 100, 2)


In [None]:
# Random forest with 13 trees, scored by accuracy on the shared folds.
# random_state is fixed so the forest (and therefore the score used to compare
# the models) is the same on every run; without it each run builds different
# trees and can change which model looks best.
clf = RandomForestClassifier(n_estimators=13, random_state=0)
scoring = 'accuracy'
score = cross_val_score(clf, X_train, Y_train, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)
# Mean accuracy over the folds, as a percentage with two decimals.
round(np.mean(score)*100, 2)


In [None]:
# Support-vector classifier with default settings, scored by accuracy on the shared folds.
scoring = 'accuracy'
clf = SVC()
score = cross_val_score(
    estimator=clf, X=X_train, y=Y_train,
    scoring=scoring, cv=k_fold, n_jobs=1,
)
print(score)
# Mean accuracy over the folds, as a percentage with two decimals.
round(score.mean() * 100, 2)

From the three model cells above, I choose the one with the highest score, which is the random forest.

# **Parameter Tuning**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Base estimator whose hyper-parameters are tuned; seeded for repeatable fits.
model = RandomForestClassifier(random_state=42)

# Candidate values per hyper-parameter (5 * 5 * 5 * 4 = 500 combinations).
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperF = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
}

# Exhaustive search with 3-fold cross-validation on all available cores;
# fit() returns the fitted search object itself.
gridF = GridSearchCV(estimator=model, param_grid=hyperF, cv=3, verbose=1, n_jobs=-1)
bestF = gridF.fit(X_train, Y_train)

In [None]:
# Show the hyper-parameter combination that won the grid search.
# get_params() would only echo the GridSearchCV constructor arguments (including
# the untuned base estimator), not the values selected by the search.
bestF.best_params_

# **Data prediction**

I take the best parameters for the random forest classifier from GridSearchCV.

In [None]:

# Build the final forest from the combination selected by the grid search.
# The previous hard-coded arguments were the library defaults (max_depth=None is
# not even part of the searched grid), so the tuning result was never used; they
# also included `min_impurity_split` and `max_features='auto'`, which newer
# scikit-learn releases no longer accept.
model = RandomForestClassifier(random_state=42, n_jobs=-1, **bestF.best_params_)
model.fit(X_train, Y_train)
predictions = model.predict(X_test)

# Write the submission file: one predicted Survived value per test PassengerId.
output = pd.DataFrame({'PassengerId': testingData.PassengerId, 'Survived': predictions})
output.to_csv('AhmedSubmission.csv', index=False)