In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


In [6]:
#load the data
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')
train_data = pd.read_csv('/kaggle/input/titanic/train.csv')

In [7]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [8]:
# check for missing values
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# choose predictors and targets
y_train = train_data.Survived
x_train = train_data.drop(['Survived'],axis=1)

In [10]:
x_train.columns.values

array(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype=object)

In [11]:
# split the data into categorical and numerical
numerical_col = x_train[['Pclass','Age','SibSp','Fare']].copy()
categorical_col = x_train[['Sex','Cabin','Embarked']]
numerical_col.head()

Unnamed: 0,Pclass,Age,SibSp,Fare
0,3,22.0,1,7.25
1,1,38.0,1,71.2833
2,3,26.0,0,7.925
3,1,35.0,1,53.1
4,3,35.0,0,8.05


In [12]:
#always see your data before working on it
categorical_col.head()

Unnamed: 0,Sex,Cabin,Embarked
0,male,,S
1,female,C85,C
2,female,,S
3,female,C123,S
4,male,,S


In [13]:
#do the same for the testing data
test_num_col = test_data[['Pclass','Age','SibSp','Fare']]
test_cat_col = test_data[['Sex','Cabin','Embarked']]

In [14]:
#impute the missing values according to the type of data
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy = 'mean')
cat_imputer = SimpleImputer(strategy = 'most_frequent')
imputed_x_num_train = pd.DataFrame(num_imputer.fit_transform(numerical_col))
imputed_x_num_test  = pd.DataFrame(num_imputer.transform(test_num_col))
imputed_x_cat_train = pd.DataFrame(cat_imputer.fit_transform(categorical_col))
imputed_x_cat_test  = pd.DataFrame(cat_imputer.transform(test_cat_col))

# rename the columns
imputed_x_num_train.columns = numerical_col.columns
imputed_x_num_test.columns  = test_num_col.columns
imputed_x_cat_train.columns = categorical_col.columns
imputed_x_cat_test.columns  = test_cat_col.columns

In [15]:
# join the numerical and categorial data
imputed_x_train = imputed_x_num_train.copy()
imputed_x_train = pd.concat([imputed_x_train ,imputed_x_cat_train] , axis =1)
imputed_x_test = imputed_x_num_test.copy()
imputed_x_test = pd.concat([imputed_x_test,imputed_x_cat_test] , axis =1 )

In [16]:
# eyeball the training data
imputed_x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Sex,Cabin,Embarked
0,3.0,22.0,1.0,7.25,male,B96 B98,S
1,1.0,38.0,1.0,71.2833,female,C85,C
2,3.0,26.0,0.0,7.925,female,B96 B98,S
3,1.0,35.0,1.0,53.1,female,C123,S
4,3.0,35.0,0.0,8.05,male,B96 B98,S


In [17]:
# find the number of unique entries in each of the categorical data 
imputed_x_train[['Sex','Cabin','Embarked']].nunique()

Sex           2
Cabin       147
Embarked      3
dtype: int64

In [18]:
# so we onehotencode Embarked and Sex 
OH_X_train = imputed_x_train[['Sex','Embarked']].copy()
OH_X_test = imputed_x_test[['Sex','Embarked']].copy()

In [19]:
#check
OH_X_train.head()

Unnamed: 0,Sex,Embarked
0,male,S
1,female,C
2,female,S
3,female,S
4,male,S


In [21]:
from sklearn.preprocessing import OneHotEncoder
OH_encoder = OneHotEncoder(handle_unknown = 'ignore',sparse = False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(OH_X_train))
OH_cols_test = pd.DataFrame(OH_encoder.fit_transform(OH_X_test))
OH_cols_train.index = OH_X_train.index
OH_cols_test.index = OH_X_test.index


In [22]:
Oh_x_train = imputed_x_train.drop(OH_X_train.columns , axis=1)
Oh_x_test = imputed_x_test.drop(OH_X_test.columns , axis =1)
OH_x_train = pd.concat([Oh_x_train, OH_cols_train],axis=1)
OH_x_test = pd.concat([Oh_x_test , OH_cols_test], axis=1)

In [25]:
#eyeball the data
OH_x_train.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,Cabin,0,1,2,3,4
0,3.0,22.0,1.0,7.25,B96 B98,0.0,1.0,0.0,0.0,1.0
1,1.0,38.0,1.0,71.2833,C85,1.0,0.0,1.0,0.0,0.0
2,3.0,26.0,0.0,7.925,B96 B98,1.0,0.0,0.0,0.0,1.0
3,1.0,35.0,1.0,53.1,C123,1.0,0.0,0.0,0.0,1.0
4,3.0,35.0,0.0,8.05,B96 B98,0.0,1.0,0.0,0.0,1.0


In [27]:
OH_X_train = OH_x_train.drop(['Cabin'],axis=1)
OH_X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,0,1,2,3,4
0,3.0,22.0,1.0,7.25,0.0,1.0,0.0,0.0,1.0
1,1.0,38.0,1.0,71.2833,1.0,0.0,1.0,0.0,0.0
2,3.0,26.0,0.0,7.925,1.0,0.0,0.0,0.0,1.0
3,1.0,35.0,1.0,53.1,1.0,0.0,0.0,0.0,1.0
4,3.0,35.0,0.0,8.05,0.0,1.0,0.0,0.0,1.0


In [29]:
OH_X_test = OH_x_test.drop(['Cabin'],axis=1)
OH_X_test.head()

Unnamed: 0,Pclass,Age,SibSp,Fare,0,1,2,3,4
0,3.0,34.5,0.0,7.8292,0.0,1.0,0.0,1.0,0.0
1,3.0,47.0,1.0,7.0,1.0,0.0,0.0,0.0,1.0
2,2.0,62.0,0.0,9.6875,0.0,1.0,0.0,1.0,0.0
3,3.0,27.0,0.0,8.6625,0.0,1.0,0.0,0.0,1.0
4,3.0,22.0,1.0,12.2875,1.0,0.0,0.0,0.0,1.0


In [28]:
#create a random forest model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(OH_X_train , y_train)

RandomForestRegressor()

In [30]:
predictions = model.predict(OH_X_test)

In [31]:
output = pd.DataFrame({'PassengerId':test_data['PassengerId'], 'Survived':predictions})
output.to_csv('Titanic_preds.csv' , index=False)