In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve


  import pandas.util.testing as tm


In [0]:
train_url = 'https://raw.githubusercontent.com/kant1310/machine-learning-from-scratch/master/titanic_competition/data/train.csv'
test_url = 'https://raw.githubusercontent.com/kant1310/machine-learning-from-scratch/master/titanic_competition/data/test.csv'
train_set = pd.read_csv(train_url)
test_set = pd.read_csv(test_url)

In [87]:
train_set.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [0]:
train_set.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
train_set['Age'].fillna(train_set.Age.median(), inplace=True)
train_set['Embarked'].fillna(train_set.Embarked.mode()[0], inplace=True)

In [89]:
train_set.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [0]:
test_set.drop(['Cabin', 'Name', 'Ticket'], axis=1, inplace=True)
test_set['Age'].fillna(train_set.Age.median(), inplace=True)
test_set['Fare'].fillna(train_set.Fare.median(), inplace=True)

In [91]:
test_set.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [0]:
def binning(data):
  df = data.copy(deep=True)
  age_bins = [0,12,20,40,120]
  age_labels = ['Child', 'Teenager', 'Adult', 'Elder']
  fare_bins = [0,8,15,32,600]
  fare_labels = ['Low', 'Median', 'Avarage', 'High']
  df['Age_bins'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)
  df['Fare_bins'] = pd.cut(df['Fare'], bins=fare_bins, labels=fare_labels)
  df.drop(['Age', 'Fare'], axis=1, inplace=True)
  return df

In [0]:
train_df = binning(train_set)
test_df = binning(test_set)

In [143]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,SibSp,Parch,Embarked,Age_bins,Fare_bins
0,1,0,3,male,1,0,S,Adult,Low
1,2,1,1,female,1,0,C,Adult,High
2,3,1,3,female,0,0,S,Adult,Low
3,4,1,1,female,1,0,S,Adult,High
4,5,0,3,male,0,0,S,Adult,Median


In [0]:
X = train_df.drop(['PassengerId', 'Survived'], axis=1)
y = train_df.Survived

In [145]:
X.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked,Age_bins,Fare_bins
0,3,male,1,0,S,Adult,Low
1,1,female,1,0,C,Adult,High
2,3,female,0,0,S,Adult,Low
3,1,female,1,0,S,Adult,High
4,3,male,0,0,S,Adult,Median


In [0]:
X = pd.get_dummies(X, columns=['Sex', 'Age_bins', 'Fare_bins', 'Embarked'], prefix=['Sex_Type_', 'Age_Type_', 'Fare_Type_', 'Embarked_Type_'])

In [0]:
np.random.seed(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [130]:
np.random.seed(0)
log = LogisticRegression(max_iter=1000)
log.fit(X_train, y_train)
print(log.score(X_test, y_test))
np.random.seed(0)
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
print(forest.score(X_test, y_test))

0.8100558659217877
0.8603351955307262


In [175]:
np.random.seed(0)
for i in range(10, 500, 50):
  forest = RandomForestClassifier(n_estimators=i)
  forest.fit(X_train, y_train)
  print(f'Trying model with n_estimators = {i}...Accuracy = {forest.score(X_test, y_test)}')

Trying model with n_estimators = 10...Accuracy = 0.8715083798882681
Trying model with n_estimators = 60...Accuracy = 0.8491620111731844
Trying model with n_estimators = 110...Accuracy = 0.8659217877094972
Trying model with n_estimators = 160...Accuracy = 0.8603351955307262
Trying model with n_estimators = 210...Accuracy = 0.8659217877094972
Trying model with n_estimators = 260...Accuracy = 0.8659217877094972
Trying model with n_estimators = 310...Accuracy = 0.8603351955307262
Trying model with n_estimators = 360...Accuracy = 0.8603351955307262
Trying model with n_estimators = 410...Accuracy = 0.8603351955307262
Trying model with n_estimators = 460...Accuracy = 0.8603351955307262


In [190]:
np.random.seed(0)
forest = RandomForestClassifier(n_estimators=10)
forest.fit(X_train, y_train)
forest.score(X_test, y_test)

0.8715083798882681

In [195]:
cross_score = cross_val_score(forest, X, y, cv=5)
cross_score

array([0.79888268, 0.80337079, 0.84831461, 0.83146067, 0.83146067])

In [136]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,SibSp,Parch,Embarked,Age_bins,Fare_bins
0,892,3,male,0,0,Q,Adult,Low
1,893,3,female,1,0,S,Elder,Low
2,894,2,male,0,0,Q,Elder,Median
3,895,3,male,0,0,S,Adult,Median
4,896,3,female,1,1,S,Adult,Median


In [0]:
test_X = test_df.drop(['PassengerId'], axis=1)
test_X = pd.get_dummies(test_X, columns=['Sex', 'Embarked', 'Age_bins', 'Fare_bins'], prefix=['Sex_Type', 'Embarked_Type', 'Age_Type', 'Fare_Type'])

In [156]:
test_X.head()

Unnamed: 0,Pclass,SibSp,Parch,Sex_Type_female,Sex_Type_male,Embarked_Type_C,Embarked_Type_Q,Embarked_Type_S,Age_Type_Child,Age_Type_Teenager,Age_Type_Adult,Age_Type_Elder,Fare_Type_Low,Fare_Type_Median,Fare_Type_Avarage,Fare_Type_High
0,3,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0
1,3,1,0,1,0,0,0,1,0,0,0,1,1,0,0,0
2,2,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0
3,3,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0
4,3,1,1,1,0,0,0,1,0,0,1,0,0,1,0,0


In [0]:
 
survived_preds = forest.predict(test_X)

In [0]:
submit_df = pd.DataFrame(test_df['PassengerId'], columns=['PassengerId'])
submit_df['Survived'] = survived_preds

In [198]:
submit_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [29]:
from google.colab import drive
drive.mount('/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /drive


In [0]:
submit_df.to_csv('/drive/My Drive/Colab Notebooks/titanic_submission.csv', index=False)