In [54]:
import pandas as pd
import numpy as np

# Read both splits from the working directory and tag every row with the split
# it came from, so the two frames can be cleaned as one and separated again
# later via the "Set" column.
train = pd.read_csv("train.csv").assign(Set="train")
test = pd.read_csv("test.csv").assign(Set="test")

# Stack the splits under a fresh 0..n-1 index and preview the first rows.
combined = pd.concat((train, test), ignore_index=True)
display(combined.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Set
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train


In [55]:
# Derive a coarse Title feature from the honorific that directly precedes a
# period in each name (e.g. "Braund, Mr. Owen Harris" -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape, and
# expand=False makes str.extract return a Series rather than a one-column frame.
combined['Title'] = combined['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Fold the rarer honorifics into four groups (Mr, Mrs, Miss, Master).
# Any extracted title that is not a key of this mapping becomes NaN after .map().
title_reduction = {'Mr': 'Mr', 'Mrs': 'Mrs', 'Miss': 'Miss',
                   'Master': 'Master', 'Don': 'Mr', 'Rev': 'Mr',
                   'Dr': 'Mr', 'Mme': 'Miss', 'Ms': 'Miss',
                   'Major': 'Mr', 'Lady': 'Mrs', 'Sir': 'Mr',
                   'Mlle': 'Miss', 'Col': 'Mr', 'Capt': 'Mr',
                   'Countess': 'Mrs', 'Jonkheer': 'Mr',
                   'Dona': 'Mrs'}
combined['Title'] = combined['Title'].map(title_reduction)
display(combined.head())

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Set,Title
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,train,Mr
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,train,Mrs
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,train,Miss
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,train,Mrs
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,train,Mr


In [56]:
# Fill each missing Fare with the mean Fare of that passenger's own Pclass.
# Using a per-class transform (instead of looking up the class of the first
# missing row) keeps the cell safe to re-run once no Fare is missing and stays
# correct if missing fares ever span more than one class.
class_mean_fare = combined.groupby('Pclass')['Fare'].transform('mean')
combined['Fare'] = combined['Fare'].fillna(class_mean_fare)

# Rounded mean Age per title group; these values are used by the next cell to
# fill missing ages. "Miss" is split on Parch (non-zero vs. zero).
is_miss = combined['Title'] == 'Miss'
has_parch = combined['Parch'] != 0
avg_girl_age = round(combined.loc[is_miss & has_parch, 'Age'].mean())
avg_lady_age = round(combined.loc[is_miss & (combined['Parch'] == 0), 'Age'].mean())
avg_boy_age = round(combined.loc[combined['Title'] == 'Master', 'Age'].mean())
avg_men_age = round(combined.loc[combined['Title'] == 'Mr', 'Age'].mean())
avg_women_age = round(combined.loc[combined['Title'] == 'Mrs', 'Age'].mean())

In [57]:
# Fill missing ages with the rounded group means computed in the previous cell.
# Each rule pairs a row selector with the age to write into that group's
# rows whose Age is still null.
passenger_title = combined['Title']
has_parch = combined['Parch'] != 0
age_fill_rules = [
    ((passenger_title == 'Miss') & has_parch, avg_girl_age),
    ((passenger_title == "Miss") & ~has_parch, avg_lady_age),
    (passenger_title == 'Master', avg_boy_age),
    (passenger_title == 'Mr', avg_men_age),
    (passenger_title == 'Mrs', avg_women_age),
]
for group_rows, fill_age in age_fill_rules:
    # Re-evaluate the null mask on every pass so only still-missing ages are set.
    combined.loc[group_rows & combined['Age'].isnull(), 'Age'] = fill_age

In [58]:
# Rows with no Embarked value receive the most frequent Embarked value found
# among Pclass == 1 passengers.
first_class_port = combined.loc[combined['Pclass'] == 1, 'Embarked'].mode()[0]
embarked_missing = combined['Embarked'].isnull()
combined.loc[embarked_missing, 'Embarked'] = first_class_port

In [59]:
# Drop the text columns that are not used as features and replace the two
# remaining categorical columns with integer codes.
sex_codes = {'female': 0, 'male': 1}
port_codes = {'C': 0, 'Q': 1, 'S': 2}
combined = (
    combined
    .drop(columns=['Name', 'Cabin', 'Title', 'Ticket'])
    .assign(
        Sex=lambda frame: frame['Sex'].map(sex_codes),
        Embarked=lambda frame: frame['Embarked'].map(port_codes),
    )
)

In [60]:
# Separate the cleaned frame back into its two splits using the "Set" tag.
# The tag is removed from both; the test split also loses the Survived column.
from_train = combined['Set'] == 'train'
from_test = combined['Set'] == 'test'
train = combined[from_train].drop(columns='Set')
test = combined[from_test].drop(columns=['Survived', 'Set'])
display(train.head(), test.head())

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,7.25,2
1,2,1.0,1,0,38.0,1,0,71.2833,0
2,3,1.0,3,0,26.0,0,0,7.925,2
3,4,1.0,1,0,35.0,1,0,53.1,2
4,5,0.0,3,1,35.0,0,0,8.05,2


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
891,892,3,1,34.5,0,0,7.8292,1
892,893,3,0,47.0,1,0,7.0,2
893,894,2,1,62.0,0,0,9.6875,1
894,895,3,1,27.0,0,0,8.6625,2
895,896,3,0,22.0,1,1,12.2875,2


In [61]:
# Target vector: the Survived column converted to an int8 NumPy array.
y_train = train['Survived'].to_numpy(dtype=np.int8)
# Feature matrix: every remaining column of the training split.
# NOTE(review): PassengerId is still among these columns, so the model is fed
# a row identifier as a feature — consider dropping it here and from the frame
# passed to predict().
X_train = train.drop(columns='Survived')

In [63]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with a fixed seed so repeated runs give the same model.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# This score is measured on the very rows the model was fitted on, so it is
# not an estimate of performance on unseen passengers.
train_accuracy = clf.score(X_train, y_train)
print("Accuracy on training set: {}".format(train_accuracy))


Accuracy on training set: 1.0


In [65]:
# Predict survival for the test split and write a two-column submission file
# (PassengerId, Survived) without the index.
pred = clf.predict(test)
df = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': pred})
# Write next to the input CSVs (relative to the working directory) instead of
# a hard-coded per-user Desktop path, so the notebook runs on any machine.
df.to_csv('RF_solution.csv', index=False)