#### Let's begin:

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization
import os 

# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


### Acquire data:

In [2]:
def read_data(data_dir="/kaggle/input/titanic"):
    """Load the Titanic train and test CSV files.

    Parameters
    ----------
    data_dir : str or path-like, default "/kaggle/input/titanic"
        Directory that contains ``train.csv`` and ``test.csv``. The default
        keeps the original Kaggle location, so ``read_data()`` behaves as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_data, test_data)`` in that order.
    """
    train_data = pd.read_csv(os.path.join(data_dir, "train.csv"))
    print("Train data imported successfully!!")
    print("-"*50)
    test_data = pd.read_csv(os.path.join(data_dir, "test.csv"))
    print("Test data imported successfully!!")
    return train_data , test_data

In [3]:
# Load both CSV files once; these two frames are used by the cells below.
train_data , test_data = read_data()

Train data imported successfully!!
--------------------------------------------------
Test data imported successfully!!


#### We are going to get the **true solution** for testing on unseen data.

**Important note:** We will not depend on it for model selection; it is used only for learning purposes.

In [4]:
# Optional, for learning only: builds `true_solution`, which `print_scores`
# uses to report the test score.

import re
import warnings
import io
import requests

warnings.filterwarnings("ignore")

url="https://github.com/thisisjasonjafari/my-datascientise-handcode/raw/master/005-datavisualization/titanic.csv"
# Fail fast on network problems instead of hanging or parsing an error page as CSV.
response = requests.get(url, timeout=30)
response.raise_for_status()
test_data_with_labels = pd.read_csv(io.StringIO(response.content.decode('utf-8')))

# Strip double quotes from the names in both frames so they can be compared.
# A vectorized column assignment is used instead of element-wise chained
# indexing (`df[col][i] = ...`), which may write to a temporary copy.
test_data_with_labels['name'] = test_data_with_labels['name'].str.replace('"', '', regex=False)
test_data['Name'] = test_data['Name'].str.replace('"', '', regex=False)

# Look up each test passenger by name in the labelled file.
# `.values[-1]` takes the last match when a name occurs more than once.
survived = [
    int(test_data_with_labels.loc[test_data_with_labels['name'] == name, 'survived'].values[-1])
    for name in test_data['Name']
]

# Reuse the sample submission as a template and overwrite its labels.
# Absolute path, consistent with the other reads from /kaggle/input.
true_solution = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
true_solution['Survived'] = survived
true_solution.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


##### In this notebook we will not focus on data analysis, **just on modeling.**

In [5]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#### Filling Blanks and Missing Data:

In [6]:
# Count missing values per column in the training frame.
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Count missing values per column in the test frame.
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

##### It's important to fill the missing values of the Age, Embarked, and Fare features:

In [8]:
# Fill the missing Embarked values with the most frequent port.
# `.mode()` is used rather than `.max()`: on strings, max() returns the
# alphabetically last label, which is not guaranteed to be the most common one.
train_data['Embarked'] = train_data.Embarked.fillna(train_data.Embarked.mode()[0])
# Fill the missing test Fare with the mean of the known test fares
# (mean() skips NaN by default).
test_data['Fare'] = test_data.Fare.fillna(test_data.Fare.mean())

##### We will guess the missing ages from Pclass and Sex:

In [9]:
# Zero-filled 2x3 grid that will hold the guessed ages:
# rows are indexed by the Sex code, columns by Pclass - 1.
guess_ages = np.zeros(shape=(2, 3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

Now we iterate over Sex (0 or 1) and Pclass (1, 2, 3) to calculate guessed values of Age for the six combinations.

In [10]:
combine = [train_data , test_data]

# Converting Sex categories (male and female) to 0 and 1.
# Guarded so that re-running this cell does not map the already-encoded
# column to NaN (which would make the astype(int) below fail).
for dataset in combine:
    if not pd.api.types.is_numeric_dtype(dataset['Sex']):
        dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)

# Filling missed age feature:
# each dataset is imputed with the median age of its own Sex/Pclass groups.
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            in_group = (dataset['Sex'] == i) & (dataset['Pclass'] == j+1)
            age_guess = dataset.loc[in_group, 'Age'].median()  # median() skips NaN

            if pd.isna(age_guess):
                # No known age in this Sex/Pclass group: fall back to the
                # dataset-wide median instead of crashing on int(nan).
                age_guess = dataset['Age'].median()

            # Round the median age to the nearest .5
            guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5

            # Filling only this group's missing ages does not affect the
            # medians of the other groups, so guess and fill share one loop.
            dataset.loc[in_group & dataset['Age'].isnull(), 'Age'] = guess_ages[i,j]

    # Ages are stored as whole numbers (fractional parts are truncated).
    dataset['Age'] = dataset['Age'].astype(int)

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35,0,0,373450,8.05,,S


In [11]:
# Re-check the missing-value counts of the training frame after imputation.
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [12]:
# Re-check the missing-value counts of the test frame after imputation.
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

##### There is no important missing data anymore!

### Modeling:

In [13]:
# ========================================================= #
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
# ========================================================= #
from sklearn.tree import DecisionTreeClassifier
# ========================================================= #
from colorama import Fore

In [14]:
# First we will select the features that we will use:

features = ["Pclass" , "Sex" , "Age" , "SibSp" , "Parch" , "Fare" , "Embarked"]

# Categorical to indicator variables:
X_train = pd.get_dummies(train_data[features])
Y_train = train_data["Survived"]
# The test frame is encoded separately, so align its columns to the training
# columns: a category missing from the test set becomes an all-zero column and
# the column order always matches what the model was fitted on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X_train.columns, fill_value=0)

X_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,3,0,22,1,0,7.25,0,0,1
1,1,1,38,1,0,71.2833,1,0,0
2,3,1,26,0,0,7.925,0,0,1
3,1,1,35,1,0,53.1,0,0,1
4,3,0,35,0,0,8.05,0,0,1


In [15]:
def print_scores(model ,X_train , Y_train,predictions , cv_splites=10, true_labels=None):
    """Print train, cross-validation and (when labels are available) test accuracy.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``score``; it is also passed to ``cross_val_score``.
    X_train, Y_train : training features and target.
    predictions : array-like
        Predictions for the test set, compared against the true test labels.
    cv_splites : int, default 10
        Passed as ``cv`` to ``cross_val_score``.
    true_labels : array-like, optional
        True test labels. When omitted, the global ``true_solution["Survived"]``
        is used if that frame exists; otherwise the test score is skipped
        instead of raising a NameError.

    Returns
    -------
    None
        The scores are printed only.
    """
    print(Fore.BLUE , "The mean accuracy score of the train data is %.5f" % model.score(X_train, Y_train))
    CV_scores = cross_val_score(model, X_train, Y_train, cv=cv_splites)
    print(Fore.BLACK ,"The individual cross-validation scores are: \n",CV_scores)
    print(Fore.BLACK ,"The minimum cross-validation score is %.3f" % min(CV_scores))
    print(Fore.BLACK ,"The maximum cross-validation score is %.3f" % max(CV_scores))
    print(Fore.YELLOW ,"The mean  cross-validation   score is %.5f ± %0.2f" % (CV_scores.mean(), CV_scores.std() * 2))

    if true_labels is None:
        # The cell that builds `true_solution` is optional, so look it up lazily.
        reference = globals().get("true_solution")
        if reference is not None:
            true_labels = reference["Survived"]

    if true_labels is None:
        print(Fore.RED ,"The test (i.e. leaderboard)  score is unavailable (true labels were not loaded)")
    else:
        print(Fore.RED ,"The test (i.e. leaderboard)  score is %.5f (this score is unknown)" % accuracy_score(true_labels,predictions))

    # Restore the default text colour so later output is not printed in red.
    print(Fore.RESET, end="")

In [16]:
# Depth-1 tree that considers 2 features per split; fit() returns the estimator itself.
model = DecisionTreeClassifier(max_depth=1, max_features=2, random_state=7).fit(X_train, Y_train)
predictions = model.predict(X_test)
print_scores(model=model, X_train=X_train, Y_train=Y_train, predictions=predictions)

[34m The mean accuracy score of the train data is 0.66779
[30m The individual cross-validation scores are: 
 [0.64444444 0.58426966 0.58426966 0.62921348 0.70786517 0.70786517
 0.75280899 0.6741573  0.68539326 0.70786517]
[30m The minimum cross-validation score is 0.584
[30m The maximum cross-validation score is 0.753
[33m The mean  cross-validation   score is 0.66782 ± 0.11
[31m The test (i.e. leaderboard)  score is 0.60287 (this score is unknown)


In [17]:
# Tree with default (unconstrained) settings; fit() returns the estimator itself.
model = DecisionTreeClassifier(random_state=7).fit(X_train, Y_train)
predictions = model.predict(X_test)
print_scores(model=model, X_train=X_train, Y_train=Y_train, predictions=predictions)

[34m The mean accuracy score of the train data is 0.97980
[30m The individual cross-validation scores are: 
 [0.74444444 0.7752809  0.71910112 0.75280899 0.83146067 0.80898876
 0.80898876 0.75280899 0.80898876 0.78651685]
[30m The minimum cross-validation score is 0.719
[30m The maximum cross-validation score is 0.831
[33m The mean  cross-validation   score is 0.77894 ± 0.07
[31m The test (i.e. leaderboard)  score is 0.71770 (this score is unknown)


In [18]:
# Depth-3 tree that considers 4 features per split; fit() returns the estimator itself.
model = DecisionTreeClassifier(max_depth=3, max_features=4, random_state=7).fit(X_train, Y_train)
predictions = model.predict(X_test)
print_scores(model=model, X_train=X_train, Y_train=Y_train, predictions=predictions)

[34m The mean accuracy score of the train data is 0.82379
[30m The individual cross-validation scores are: 
 [0.82222222 0.84269663 0.7752809  0.85393258 0.84269663 0.79775281
 0.80898876 0.78651685 0.85393258 0.83146067]
[30m The minimum cross-validation score is 0.775
[30m The maximum cross-validation score is 0.854
[33m The mean  cross-validation   score is 0.82155 ± 0.05
[31m The test (i.e. leaderboard)  score is 0.78947 (this score is unknown)


- This score is much better because the training score (82.4%) is close to the cross-validation score (82.2%).
- The spread of the individual cross-validation scores became narrower (they now range from 77.5% to 85.4%).
- This leads to a higher leaderboard result (78.9%). In practice we do not know that score in advance, so we want to find the best cross-validation score without overfitting or underfitting.

In [19]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 80 depth-5 trees; fit() returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=80,
    max_depth=5,
    max_features=8,
    min_samples_split=3,
    random_state=7,
).fit(X_train, Y_train)
predictions = model.predict(X_test)
print_scores(model=model, X_train=X_train, Y_train=Y_train, predictions=predictions)

[34m The mean accuracy score of the train data is 0.85859
[30m The individual cross-validation scores are: 
 [0.76666667 0.85393258 0.75280899 0.91011236 0.88764045 0.80898876
 0.80898876 0.78651685 0.87640449 0.84269663]
[30m The minimum cross-validation score is 0.753
[30m The maximum cross-validation score is 0.910
[33m The mean  cross-validation   score is 0.82948 ± 0.10
[31m The test (i.e. leaderboard)  score is 0.79426 (this score is unknown)


#### Conclusions: