In [1]:
import xgboost as xgb

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# load the data 

In [3]:
train_df=pd.read_csv('train.csv',header=0)

In [4]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
len(train_df)

891

In [8]:
test_df=pd.read_csv('test.csv',header=0)

In [9]:
test_df.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [13]:
len(test_df)

418

In [10]:
def missing_count(dataset):
    """Count the null entries in every column of a DataFrame.

    Returns a dict keyed by column name (in column order) whose values are
    the number of rows where that column is null.
    """
    return {col:int(dataset[col].isnull().sum()) for col in dataset.columns}

In [11]:
missing_count(train_df)

{'Age': 177,
 'Cabin': 687,
 'Embarked': 2,
 'Fare': 0,
 'Name': 0,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Survived': 0,
 'Ticket': 0}

In [12]:
missing_count(test_df)

{'Age': 86,
 'Cabin': 327,
 'Embarked': 0,
 'Fare': 1,
 'Name': 0,
 'Parch': 0,
 'PassengerId': 0,
 'Pclass': 0,
 'Sex': 0,
 'SibSp': 0,
 'Ticket': 0}

Age and Cabin have many missing values: 177 and 687 of the 891 training rows, and 86 and 327 of the 418 test rows. Even after imputing the missing values, we should be careful about how much we rely on the Age and Cabin information.

Next we impute missing values using the median for numeric columns and the most common value for string columns.

In [15]:
from sklearn.base import TransformerMixin

In [16]:
class DataFrameImputer(TransformerMixin):
    """Column-wise imputer for a pandas DataFrame.

    Object-dtype columns are filled with their most frequent value; every
    other column is filled with its median.
    """
    def fit(self,X,y=None):
        """Compute one fill value per column of X and store them in self.fill.

        y is ignored. Returns self so calls can be chained.
        """
        fill_values=[]
        for c in X:
            if X[c].dtype==np.dtype('O'):
                counts=X[c].value_counts()
                # value_counts() drops NaN, so a column with no observed
                # values gives an empty result and index[0] would raise
                # IndexError. Use NaN as the fill value so such a column is
                # simply left as it is.
                fill_values.append(counts.index[0] if len(counts) else np.nan)
            else:
                fill_values.append(X[c].median())
        self.fill=pd.Series(fill_values,index=X.columns)
        return self
    def transform(self,X,y=None):
        """Return X with missing values replaced by the fill values from fit.

        y is ignored. fillna is called without inplace, so X is not modified.
        """
        return X.fillna(self.fill)

In [19]:
# Columns selected from both train_df and test_df as model inputs.
feature_columns_to_use=['Pclass','Sex','Age','Fare','Parch']
# Features that hold strings; these are integer-encoded before training.
nonnumeric_columns=['Sex']

Join the features from train and test together before imputing missing values, so that the fill values are computed from the combined data in case the two distributions are slightly different.

In [20]:
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. Row labels are kept as-is (no ignore_index), so the train
# rows come first and the test rows follow with their own labels, exactly as
# append produced.
big_X=pd.concat([train_df[feature_columns_to_use],test_df[feature_columns_to_use]])
# Fill values are fitted on, and applied to, the combined frame.
big_X_imputed=DataFrameImputer().fit_transform(big_X)

XGBoost doesn't handle categorical features automatically, so we need to change them to columns of integer values.

In [21]:
train_df[nonnumeric_columns]

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male
5,male
6,male
7,male
8,female
9,female


In [22]:
big_X_imputed['Sex']

0        male
1      female
2      female
3      female
4        male
5        male
6        male
7        male
8      female
9      female
10     female
11     female
12       male
13       male
14     female
15     female
16       male
17       male
18     female
19     female
20       male
21       male
22     female
23       male
24     female
25     female
26       male
27       male
28     female
29       male
        ...  
388      male
389      male
390      male
391    female
392      male
393      male
394      male
395    female
396      male
397    female
398      male
399      male
400    female
401      male
402    female
403      male
404      male
405      male
406      male
407      male
408    female
409    female
410    female
411    female
412    female
413      male
414    female
415      male
416      male
417      male
Name: Sex, Length: 1309, dtype: object

In [24]:
LabelEncoder().fit_transform(big_X_imputed['Sex'])

array([1, 0, 0, ..., 1, 1, 1])

In [23]:
le=LabelEncoder()

In [25]:
# Replace each string-valued feature with integer codes (for 'Sex' the output
# above shows male -> 1, female -> 0).
# fit_transform refits `le` on every pass, so after the loop the encoder only
# holds the classes of the last column processed.
for feature in nonnumeric_columns:
    big_X_imputed[feature]=le.fit_transform(big_X_imputed[feature])

In [26]:
train_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [27]:
train_df.shape

(891, 12)

In [28]:
train_df.shape[0]

891

In [29]:
big_X_imputed[0:891]

Unnamed: 0,Pclass,Sex,Age,Fare,Parch
0,3,1,22.0,7.2500,0
1,1,0,38.0,71.2833,0
2,3,0,26.0,7.9250,0
3,1,0,35.0,53.1000,0
4,3,1,35.0,8.0500,0
5,3,1,28.0,8.4583,0
6,1,1,54.0,51.8625,0
7,3,1,2.0,21.0750,1
8,3,0,27.0,11.1333,2
9,2,0,14.0,30.0708,0


In [30]:
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# NumPy array on old and new pandas alike.
big_X_imputed[0:891].values

array([[  3.    ,   1.    ,  22.    ,   7.25  ,   0.    ],
       [  1.    ,   0.    ,  38.    ,  71.2833,   0.    ],
       [  3.    ,   0.    ,  26.    ,   7.925 ,   0.    ],
       ..., 
       [  3.    ,   0.    ,  28.    ,  23.45  ,   2.    ],
       [  1.    ,   1.    ,  26.    ,  30.    ,   0.    ],
       [  3.    ,   1.    ,  32.    ,   7.75  ,   0.    ]])

In [31]:
big_X_imputed[:891]

Unnamed: 0,Pclass,Sex,Age,Fare,Parch
0,3,1,22.0,7.2500,0
1,1,0,38.0,71.2833,0
2,3,0,26.0,7.9250,0
3,1,0,35.0,53.1000,0
4,3,1,35.0,8.0500,0
5,3,1,28.0,8.4583,0
6,1,1,54.0,51.8625,0
7,3,1,2.0,21.0750,1
8,3,0,27.0,11.1333,2
9,2,0,14.0,30.0708,0


In [33]:
 big_X_imputed[train_df.shape[0]::]

Unnamed: 0,Pclass,Sex,Age,Fare,Parch
0,3,1,34.5,7.8292,0
1,3,0,47.0,7.0000,0
2,2,1,62.0,9.6875,0
3,3,1,27.0,8.6625,0
4,3,0,22.0,12.2875,1
5,3,1,14.0,9.2250,0
6,3,0,30.0,7.6292,0
7,2,1,26.0,29.0000,1
8,3,0,18.0,7.2292,0
9,3,1,21.0,24.1500,0


In [44]:
# Every row after the first train_df.shape[0] belongs to the test set.
# .values replaces as_matrix(), which was removed in pandas 1.0.
test_X = big_X_imputed[train_df.shape[0]:].values

In [43]:
# The first train_df.shape[0] rows of the combined frame are the training set.
# .values replaces as_matrix(), which was removed in pandas 1.0.
train_X=big_X_imputed[0:train_df.shape[0]].values

In [40]:
test_X

array([[  3.    ,   1.    ,  34.5   ,   7.8292,   0.    ],
       [  3.    ,   0.    ,  47.    ,   7.    ,   0.    ],
       [  2.    ,   1.    ,  62.    ,   9.6875,   0.    ],
       ..., 
       [  3.    ,   1.    ,  38.5   ,   7.25  ,   0.    ],
       [  3.    ,   1.    ,  28.    ,   8.05  ,   0.    ],
       [  3.    ,   1.    ,  28.    ,  22.3583,   1.    ]])

In [45]:
train_Y=train_df['Survived']

In [46]:
# Fit a gradient-boosted tree classifier on the training matrix:
# 300 estimators, maximum tree depth 3, learning rate 0.05.
gbm=xgb.XGBClassifier(max_depth=3,n_estimators=300,learning_rate=0.05).fit(train_X,train_Y)

In [50]:
predictions=gbm.predict(test_X)

In [51]:
# Two-column submission frame: the test passenger ids alongside the
# predicted Survived label for each of them.
submission=test_df[['PassengerId']].assign(Survived=predictions)

In [53]:
# Write the submission without the index column.
# NOTE(review): the path is relative to the kernel's working directory and
# presumably needs an existing Desktop/ folder there — confirm before re-running.
submission.to_csv("Desktop/submission_guo.csv",index=False)