In [100]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

Load train and test data from /media/sf_as/Dropbox/files/DSScale/AnaMet/titanic

In [101]:
path2files = "/media/sf_as/Dropbox/files/DSScale/AnaMet/titanic/"
train = pd.read_csv(path2files + "train.csv")
test = pd.read_csv(path2files + "test.csv")

In [102]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [103]:
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


## Data manipulation and Feature selection

In [104]:
print len(test)

418


In [105]:
train.columns[[2,4,5,6,7,9,10,11,1]]

Index([u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare', u'Cabin', u'Embarked', u'Survived'], dtype='object')

In [106]:
train[train.columns[[2,4,5,6,7,9,10,11,1]]].head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,3,male,22,1,0,7.25,,S,0
1,1,female,38,1,0,71.2833,C85,C,1


In [107]:
train[['Sex', 'Pclass', 'Age', "SibSp", "Parch", "Fare", 'Cabin', 'Embarked', 'Survived']].head(2)

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,male,3,22,1,0,7.25,,S,0
1,female,1,38,1,0,71.2833,C85,C,1


In [108]:
train[['Sex', 'Pclass', 'Age', "SibSp", "Parch", "Fare", 'Cabin', 'Embarked']].head(2)

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked
0,male,3,22,1,0,7.25,,S
1,female,1,38,1,0,71.2833,C85,C


In [109]:
# preliminary selection of features
features_df = train[['Sex', 'Pclass', 'Age', "SibSp", "Parch", "Fare", 'Cabin', 'Embarked', 'Survived']].copy()
#features_df['Sex'] = features_df['Sex'] == 'female'
features_df = features_df.dropna()

In [110]:
features_df.head(2)

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
1,female,1,38,1,0,71.2833,C85,C,1
3,female,1,35,1,0,53.1,C123,S,1


In [111]:
## Function to map the values of a categorical feature to a numerical range
## scikit-learn uses numpy and expects all the features to be numerical
def transform_feature( df, column_name ):
    unique_values = set( df[column_name].tolist() )
    transformer_dict = {}
    for ii, value in enumerate(unique_values):
        transformer_dict[value] = ii

    def label_map(y):
        return transformer_dict[y]
    df[column_name] = df[column_name].apply( label_map )
    return df


In [112]:
### list of column names indicating which columns to transform; 
names_of_columns_to_transform = ['Sex', "Embarked"]
for column in names_of_columns_to_transform:
    features_df = transform_feature( features_df, column )
    
print( features_df.head() )
    
### Dataframe sintax to remove columns we cant make use of
features_df.drop("Cabin", axis=1, inplace=True)

print(features_df.columns.values)

    Sex  Pclass  Age  SibSp  Parch     Fare Cabin  Embarked  Survived
1     1       1   38      1      0  71.2833   C85         1         1
3     1       1   35      1      0  53.1000  C123         2         1
6     0       1   54      0      0  51.8625   E46         2         0
10    1       3    4      1      1  16.7000    G6         2         1
11    1       1   58      0      0  26.5500  C103         2         1

[5 rows x 9 columns]
['Sex' 'Pclass' 'Age' 'SibSp' 'Parch' 'Fare' 'Embarked' 'Survived']


### Repeat feature selection on the Test Set

In [113]:
test_features_df = test[['Sex', 'Pclass', 'Age', "SibSp", "Parch", "Fare", 'Embarked']].copy()
test_features_df['Age'].fillna(0, inplace=True)
test_features_df['Fare'].fillna(0, inplace=True)
test_features_df[pd.isnull(test_features_df).any(axis=1)]

0,1
"Int64Index([], dtype='int64')",Empty DataFrame


In [114]:
fare_avg = test_features_df.groupby(['Pclass'])['Fare'].mean()
fare_avg.head()

Pclass
1         94.280297
2         22.202104
3         12.402523
Name: Fare, dtype: float64

In [115]:
### list of column names indicating which columns to transform; 
names_of_columns_to_transform = ['Sex', "Embarked"]
for column in names_of_columns_to_transform:
    test_features_df = transform_feature( test_features_df, column )
    
print( test_features_df.head() )

   Sex  Pclass   Age  SibSp  Parch     Fare  Embarked
0    0       3  34.5      0      0   7.8292         0
1    1       3  47.0      1      0   7.0000         1
2    0       2  62.0      0      0   9.6875         0
3    0       3  27.0      0      0   8.6625         1
4    1       3  22.0      1      1  12.2875         1

[5 rows x 7 columns]


## Model-Training, Scoring and Prediction

In [116]:
predictors = ['Sex', 'Pclass', 'Age', "SibSp", "Parch", "Fare", 'Embarked']

### Random Forest Classifier

In [117]:
# Initialize our algorithm with the default paramters
# n_estimators is the number of trees we want to make
# min_samples_split is the minimum number of rows we need to make a split
# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)
rf = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)
# Compute the accuracy score for all the cross validation folds.  (much simpler than what we did before!)
scores = cross_validation.cross_val_score(rf, features_df[predictors], features_df["Survived"], cv=3)

# Take the mean of the scores (because we have one for each fold)
print(scores.mean())

0.737704918033


In [118]:
## Train model (fit params)
rf.fit(features_df[predictors], features_df['Survived'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [119]:
## Prediction
rf_predicted_y = rf.predict(test_features_df[predictors])
# look at results
test_data = test_features_df[predictors]
test_data['rf_predicted_y'] = rf_predicted_y
test_data.head()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked,rf_predicted_y
0,0,3,34.5,0,0,7.8292,0,0
1,1,3,47.0,1,0,7.0,1,0
2,0,2,62.0,0,0,9.6875,0,0
3,0,3,27.0,0,0,8.6625,1,0
4,1,3,22.0,1,1,12.2875,1,1


### Logistic regression model

In [120]:
### Training and Accuracy Scoring
import sklearn.linear_model as lm
lr = lm.LogisticRegression()
lr.fit(features_df[predictors], features_df['Survived'])
## compute the scores (a different sintax)
cross_validation.cross_val_score(lr, features_df[predictors], features_df['Survived'])


array([ 0.7704918 ,  0.75409836,  0.70491803])

In [121]:
### Prediction
lr_predicted_y = lr.predict(test_features_df[predictors])
# look at results
test_data['lr_predicted_y'] = rf_predicted_y
test_data.head()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked,rf_predicted_y,lr_predicted_y
0,0,3,34.5,0,0,7.8292,0,0,0
1,1,3,47.0,1,0,7.0,1,0,0
2,0,2,62.0,0,0,9.6875,0,0,0
3,0,3,27.0,0,0,8.6625,1,0,0
4,1,3,22.0,1,1,12.2875,1,1,1


In [122]:
result_df = pd.DataFrame()
result_df['PassengerId'] = test['PassengerId']
result_df['Survived'] = test_data['lr_predicted_y']
result_df.to_csv(path2files + "logreg01.csv", columns=['PassengerId','Survived'], header=True,mode = 'w', index=False)

In [123]:
result_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [124]:
# Estimate the score after imputation of the missing values
X_missing = test_features_df[predictors].copy()

from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(features_df[predictors])
imp.transform(X_missing)


array([[  0.    ,   3.    ,  34.5   , ...,   0.    ,   7.8292,   0.    ],
       [  1.    ,   3.    ,  47.    , ...,   0.    ,   7.    ,   1.    ],
       [  0.    ,   2.    ,  62.    , ...,   0.    ,   9.6875,   0.    ],
       ..., 
       [  0.    ,   3.    ,  38.5   , ...,   0.    ,   7.25  ,   1.    ],
       [  0.    ,   3.    ,   0.    , ...,   0.    ,   8.05  ,   1.    ],
       [  0.    ,   3.    ,   0.    , ...,   1.    ,  22.3583,   2.    ]])

In [125]:
X_missing.head()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
0,0,3,34.5,0,0,7.8292,0
1,1,3,47.0,1,0,7.0,1
2,0,2,62.0,0,0,9.6875,0
3,0,3,27.0,0,0,8.6625,1
4,1,3,22.0,1,1,12.2875,1


In [126]:
## Prediction
rf2_predicted_y = rf.predict(X_missing)
# look at results
test_data = X_missing.copy()
test_data['rf2_predicted_y'] = rf2_predicted_y.copy()
test_data.head()

Unnamed: 0,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked,rf2_predicted_y
0,0,3,34.5,0,0,7.8292,0,0
1,1,3,47.0,1,0,7.0,1,0
2,0,2,62.0,0,0,9.6875,0,0
3,0,3,27.0,0,0,8.6625,1,0
4,1,3,22.0,1,1,12.2875,1,1


In [127]:
rf2_result_df = pd.DataFrame()
rf2_result_df['PassengerId'] = test['PassengerId'].copy()
rf2_result_df['Survived'] = test_data['rf2_predicted_y']
rf2_result_df.to_csv(path2files + "ranfor01.csv", columns=['PassengerId','Survived'], header=True,mode = 'w', index=False)