In [231]:
import pandas as pd
import numpy as np
from scipy.stats import mode

# Load the training set from a CSV file in the current working directory.
df = pd.read_csv('train.csv')

In [232]:
# Preview the first five rows of the raw training frame.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [233]:

# Count missing values per column (sum down the rows).
df.isnull().sum(axis=0)



PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [234]:
# Discard the free-text / sparse columns that are not used further on.
df = df.drop(labels=['Name', 'Ticket', 'Cabin'], axis=1)

In [235]:
# Impute missing ages with the training-set mean. `age_mean` is kept as a
# notebook-level name because the test-set imputation reuses it later.
# Rows with missing values are imputed rather than dropped.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=age_mean)

In [236]:
# (rows, columns) of the frame after dropping columns and imputing Age.
df.shape

(891, 9)

In [237]:
# List the distinct values of Sex before encoding them numerically.
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [238]:
# Encode Sex as an integer feature: female -> 0, male -> 1.
gender_codes = {'female': 0, 'male': 1}
df['Gender'] = df['Sex'].map(gender_codes).astype(int)

In [239]:

# Most frequent embarkation port in the training data.
# Series.mode() ignores NaN and works on string columns, whereas
# scipy.stats.mode(...)[0][0] fails on object arrays containing NaN under
# Python 3 / recent SciPy releases.
# `mode_embarked` is reused later when imputing the test set.
mode_embarked = df['Embarked'].mode()[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)
# Encode the port as an integer feature: C -> 1, S -> 2, Q -> 3.
df['Port'] = df['Embarked'].map({'C':1, 'S':2, 'Q':3}).astype(int)

In [240]:
# The string columns are redundant once Gender and Port exist.
df = df.drop(labels=['Sex', 'Embarked'], axis=1)

In [241]:
# Snapshot the current column order as a plain Python list and show it.
cols = list(df.columns)
print(cols)

['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Gender', 'Port']


In [242]:
# Move the target column to the front, keeping the others in their order.
# Built by name from the live frame so that re-running this cell is a no-op;
# the previous positional swap toggled the first two columns on every run.
cols = ['Survived'] + [name for name in df.columns if name != 'Survived']
df = df[cols]

In [243]:
# Inspect the first ten rows of the fully numeric training frame.
df.head(n=10)


Unnamed: 0,Survived,PassengerId,Pclass,Age,SibSp,Parch,Fare,Gender,Port
0,0,1,3,22.0,1,0,7.25,1,2
1,1,2,1,38.0,1,0,71.2833,0,1
2,1,3,3,26.0,0,0,7.925,0,2
3,1,4,1,35.0,1,0,53.1,0,2
4,0,5,3,35.0,0,0,8.05,1,2
5,0,6,3,29.699118,0,0,8.4583,1,3
6,0,7,1,54.0,0,0,51.8625,1,2
7,0,8,3,2.0,3,1,21.075,1,2
8,1,9,3,27.0,0,2,11.1333,0,2
9,1,10,2,14.0,1,0,30.0708,0,1


In [244]:
# Print dtypes and non-null counts to confirm no missing values remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived       891 non-null int64
PassengerId    891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Gender         891 non-null int64
Port           891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB


In [245]:
# NumPy array form of the training frame. Because the frame mixes int64 and
# float64 columns, the array holds every column as float.
train_data = df.values

In [246]:
# Second name for the same DataFrame object (an alias, not a copy).
train_datadf = df

In [247]:
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; model_selection provides cross_val_score with the same call shape.
# The alias keeps the name `cross_validation` that later cells reference.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn < 0.18
    from sklearn import cross_validation

# Random forest with a fixed seed so repeated runs give the same model.
model = RandomForestClassifier(random_state=1,
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2)

In [248]:

# Feature columns fed to the model (PassengerId is deliberately left out).
predictors = ["Pclass", "Gender", "Age", "SibSp", "Parch", "Fare", "Port"]

# 3-fold cross-validation of the classifier on the training frame.
cv_features = train_datadf[predictors]
cv_target = train_datadf["Survived"]
scores = cross_validation.cross_val_score(model, cv_features, cv_target, cv=3)

# Report the mean score across the folds.
print(scores.mean())

0.814814814815


In [249]:
# Fit on the full training set. The target is selected by column name, as in
# the cross-validation cell, rather than by position 0 of the float-cast
# array. This keeps the labels integer and does not depend on column order.
model = model.fit(train_datadf[predictors], train_datadf["Survived"])

In [250]:
# Load the test set from a CSV file in the current working directory.
df_test = pd.read_csv('test.csv')

In [251]:
# Count missing values per column in the test set. Only the last expression
# of a cell is rendered, so the earlier discarded head() call was removed.
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [252]:
# Mean fare per passenger class, as a Series indexed by Pclass.
# groupby(...)[...].mean() always yields a Series, which Series.map() needs
# when filling missing fares. pivot_table(..., columns=[]) returns a DataFrame
# on current pandas versions and breaks that lookup.
pivoted = df_test.groupby('Pclass')['Fare'].mean()
pivoted

Pclass
1    94.280297
2    22.202104
3    12.459678
Name: Fare, dtype: float64

In [253]:
# Fill a missing fare with the mean fare of that passenger's class.
class_mean_fare = df_test['Pclass'].map(pivoted)
df_test['Fare'] = df_test['Fare'].fillna(class_mean_fare)

In [254]:

# Impute the test set with the statistics computed on the training set.
for column, fill_value in (('Age', age_mean), ('Embarked', mode_embarked)):
    df_test[column] = df_test[column].fillna(fill_value)

# Drop the same text / sparse columns that were dropped from the training frame.
df_test = df_test.drop(labels=['Name', 'Ticket', 'Cabin'], axis=1)


In [255]:
# Same integer encodings as the training frame. The astype(int) matches the
# train-side cells and raises immediately if a category is not in the mapping,
# instead of silently passing NaN on to the model.
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male':1}).astype(int)
df_test['Port'] = df_test['Embarked'].map({'C':1, 'S':2, 'Q':3}).astype(int)

In [256]:
# Remove the string columns now that their encoded versions exist.
df_test = df_test.drop(labels=['Sex', 'Embarked'], axis=1)
# NumPy array form of the test frame, used later to read PassengerId.
test_data = df_test.values

In [257]:
# Predict survival for every test passenger using the same feature columns.
test_features = df_test[predictors]
output = model.predict(test_features)

In [258]:
# Pair each PassengerId (first column of the test array) with its prediction,
# both as integers, and wrap the two-column array in a labelled frame.
passenger_ids = test_data[:, 0].astype(int)
predicted_survival = output.astype(int)
result = np.column_stack((passenger_ids, predicted_survival))
df_result = pd.DataFrame(result, columns=['PassengerId', 'Survived'])

In [259]:
# Preview the first ten predictions.
df_result.head(n=10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0


In [260]:
# Write the predictions to CSV without the row index.
df_result.to_csv(path_or_buf='titanic_result.csv', index=False)

In [261]:
# Sanity check: one row per test passenger, two columns.
df_result.shape

(418, 2)