# Titanic Prediction
This notebook predicts passenger survival on the Titanic.
## Decision Tree
As a newcomer to Kaggle, I followed https://www.kaggle.com/nedaamiri/titanic-decision-tree.

## Import Packages

In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, fname)
    for root, _, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_files:
    print(path)

## Train File

In [59]:
# Load the training set; the path is relative to the notebook's working directory.
df_train = pd.read_csv('../input/titanic/train.csv')

In [60]:
# Preview the first rows of the training data.
df_train.head()

In [61]:
df_train.info()    # summary of the DataFrame: column dtypes and non-null counts

In [62]:
df_train.describe()    # summary statistics of the DataFrame's columns

In [63]:
# (number of rows, number of columns)
df_train.shape

## Data Analysis

In [64]:
df_train['Survived'].value_counts()     # number of passengers per 'Survived' value (survivors vs. deaths)

In [65]:
sns.countplot(data=df_train, x='Survived')     # bar chart of the number of survivors and deaths

In [66]:
# Correlation heatmap of the numeric columns.
# df_train still contains text columns at this point (e.g. 'Sex' holds 'male'/'female'),
# and pandas >= 2.0 raises on non-numeric data unless numeric_only=True is passed.
# Older pandas versions silently skipped such columns, so the plotted matrix is the same.
sns.heatmap(df_train.corr(numeric_only=True), annot=True, cmap='winter')
# df.corr() gives the pairwise correlation between columns
# cmap selects the colour map of the image

## Data Preparation
Handle the NaN values.

In [67]:
# Share of missing values in each column, in percent.
missing_counts = df_train.isnull().sum()
missing_counts / len(df_train) * 100

In [68]:
# Cabin is assumed here to have no bearing on survival, so the column is removed.
df_train = df_train.drop(columns='Cabin')

In [69]:
# Replace missing ages with the mean age.
# The result is assigned back to the column instead of calling
# df_train['Age'].fillna(..., inplace=True): that form mutates an intermediate Series,
# which triggers a chained-assignment FutureWarning in pandas 2.x and leaves df_train
# unchanged once Copy-on-Write is active.
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].mean())

In [70]:
# Re-compute the per-column share of missing values, in percent.
missing_counts = df_train.isnull().sum()
missing_counts / len(df_train) * 100

In [71]:
# Re-check column dtypes and non-null counts after dropping Cabin and filling Age.
df_train.info()

In [72]:
def missing_percent(df):
    """Return the percentage of missing values per column.

    Only columns that contain at least one missing value are kept, and the
    result is sorted in ascending order of percentage.
    """
    total_rows = len(df)
    missing_counts = df.isnull().sum()
    pct = 100 * (missing_counts / total_rows)
    has_missing = pct > 0
    return pct[has_missing].sort_values()

In [73]:
# Percentage of missing values for the columns that still contain any.
nan_percent = missing_percent(df_train)

In [74]:
# Display the remaining missing-value percentages.
nan_percent

In [75]:
# Remove the columns that are not used as model features.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Fare', 'Embarked']
df_train = df_train.drop(columns=unused_columns)

In [76]:
# Show the frame with only the remaining columns.
df_train

In [77]:
# Encode sex as an integer: male -> 1, female -> 0.
# A new integer column is built instead of writing ints into the string column
# through .loc, which leaves a mixed object-dtype column (and is rejected by a
# dedicated string dtype). Values that are already encoded pass through unchanged,
# so re-running the cell is harmless; astype(int) fails loudly on anything else.
sex_codes = {'male': 1, 'female': 0}
df_train['Sex'] = df_train['Sex'].map(lambda s: sex_codes.get(s, s)).astype(int)

In [78]:
# Re-check which columns still contain missing values after the column drops and encoding.
nan_percent = missing_percent(df_train)
nan_percent

In [79]:
# Absolute number of missing values per column.
df_train.isnull().sum()

## Features and Labels

In [80]:
# Separate the target column ('Survived') from the feature columns.
y = df_train['Survived']
X = df_train.drop(columns='Survived')

**Split the data into training and validation sets**

In [81]:
from sklearn.model_selection import train_test_split
# Hold out 10% of the rows for validation; the fixed random_state makes the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=101)

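**Scale the features**
Is this really necessary? Decision trees split on per-feature thresholds, so they are not sensitive to feature scaling; the model below is trained on the unscaled data.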

In [82]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only, then apply it to both splits.
# NOTE: the models in this notebook are fitted on the unscaled X_train / X_val;
# scaled_X_train and scaled_X_val are not used afterwards.
scaler = StandardScaler().fit(X_train)
scaled_X_train, scaled_X_val = scaler.transform(X_train), scaler.transform(X_val)

## Model Train 

In [83]:
from sklearn.tree import DecisionTreeClassifier

In [84]:
# Decision tree with default hyperparameters.
# random_state is fixed because scikit-learn permutes the features at every split,
# so ties between equally good splits could otherwise resolve differently on each run.
model = DecisionTreeClassifier(random_state=101)

In [85]:
# Train on the unscaled training split.
model.fit(X_train, y_train)

In [87]:
# Predict labels for the validation split.
y_pred = model.predict(X_val)

## Evaluate the Model

In [90]:
from sklearn.metrics import classification_report, confusion_matrix

In [92]:
confusion_matrix(y_val, y_pred)   # confusion matrix (rows: true labels, columns: predicted labels)

In [95]:
# Per-class precision, recall and F1 on the validation split.
print(classification_report(y_val, y_pred))

In [97]:
model.feature_importances_    # importance of each feature in the fitted decision tree

In [98]:
# Tabulate the importances with the feature names as row labels.
pd.DataFrame({'feature importance': model.feature_importances_}, index=X_train.columns)

## Visualize the Tree

In [99]:
from sklearn.tree import plot_tree

In [102]:
# Draw the full (unpruned) tree on a large, high-resolution figure.
plt.figure(figsize=(15, 8), dpi=150)
plot_tree(model);  # trailing ';' suppresses the textual repr of the returned value

## Understanding Hyperparams

In [104]:
# help(DecisionTreeClassifier)

### max_depth

In [105]:
# Limit the tree depth to 2. random_state is fixed so that ties between
# equally good splits resolve the same way on every run.
pruned_tree = DecisionTreeClassifier(max_depth=2, random_state=101)
pruned_tree.fit(X_train, y_train)

In [106]:
def report_model(model, X=None, y=None):
    """Print a classification report for ``model`` and plot its tree.

    Parameters
    ----------
    model : fitted tree estimator
        Must support ``predict`` and be accepted by ``plot_tree``.
    X : DataFrame, optional
        Features to evaluate on. Defaults to the notebook-level ``X_val``.
    y : array-like, optional
        True labels for ``X``. Defaults to the notebook-level ``y_val``.
    """
    # Fall back to the validation split so existing calls report_model(tree) keep working.
    if X is None:
        X = X_val
    if y is None:
        y = y_val
    model_preds = model.predict(X)
    print(classification_report(y, model_preds))
    print('\n')
    plt.figure(figsize=(12, 8), dpi=150)
    # Pass the column names as a plain list: parameter validation in some
    # scikit-learn releases rejects a pandas Index for feature_names.
    plot_tree(model, filled=True, feature_names=list(X.columns))

In [107]:
# Evaluate and plot the depth-limited tree.
report_model(pruned_tree)

### max_leaf_nodes

In [110]:
# Limit the tree to at most 3 leaf nodes. random_state is fixed so that ties
# between equally good splits resolve the same way on every run.
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=3, random_state=101)
pruned_tree.fit(X_train, y_train)

In [111]:
# Evaluate and plot the leaf-limited tree.
report_model(pruned_tree)

### Criterion

In [112]:
# Use entropy as the split criterion. random_state is fixed so that ties
# between equally good splits resolve the same way on every run.
entropy_tree = DecisionTreeClassifier(criterion='entropy', random_state=101)
entropy_tree.fit(X_train, y_train)

In [113]:
# Evaluate and plot the entropy-based tree.
report_model(entropy_tree)

## Handle the Submission File