In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score
import numpy as np
from collections import Counter

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
combined_data = pd.concat([train, test], axis=0)

In [3]:
combined_data.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22,,S,7.25,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26,,S,7.925,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35,,S,8.05,"Allen, Mr. William Henry",0,5,3,male,0,0,373450


In [4]:
combined_data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

In [5]:
combined_data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived
0,22,S,7.25,0,3,male,1,0
1,38,C,71.2833,0,1,female,1,1
2,26,S,7.925,0,3,female,0,1
3,35,S,53.1,0,1,female,1,1
4,35,S,8.05,0,3,male,0,0


In [6]:
labels = train['Survived']
ids = test['PassengerId']
combined_data.drop(['Survived'], axis=1, inplace=True)

In [7]:
combined_data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp
0,22,S,7.25,0,3,male,1
1,38,C,71.2833,0,1,female,1
2,26,S,7.925,0,3,female,0
3,35,S,53.1,0,1,female,1
4,35,S,8.05,0,3,male,0


In [8]:
combined_data = pd.concat([
        combined_data,
        pd.get_dummies(combined_data['Sex']),
        pd.get_dummies(combined_data['Embarked'])
    ], axis=1)

In [9]:
combined_data.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,female,male,C,Q,S
0,22,S,7.25,0,3,male,1,0,1,0,0,1
1,38,C,71.2833,0,1,female,1,1,0,1,0,0
2,26,S,7.925,0,3,female,0,1,0,0,0,1
3,35,S,53.1,0,1,female,1,1,0,0,0,1
4,35,S,8.05,0,3,male,0,0,1,0,0,1


In [10]:
combined_data.drop(['Embarked', 'Sex'], axis=1, inplace=True)
combined_data.head()

Unnamed: 0,Age,Fare,Parch,Pclass,SibSp,female,male,C,Q,S
0,22,7.25,0,3,1,0,1,0,0,1
1,38,71.2833,0,1,1,1,0,1,0,0
2,26,7.925,0,3,0,1,0,0,0,1
3,35,53.1,0,1,1,1,0,0,0,1
4,35,8.05,0,3,0,0,1,0,0,1


In [11]:
model = LogisticRegression(C=10, n_jobs=-1)

In [12]:
for colname in combined_data.columns:
    if np.any(pd.isnull(combined_data[colname])):
        print colname

Age
Fare


In [13]:
median_age = np.median(combined_data[np.logical_not(pd.isnull(combined_data['Age']))]['Age'])

In [14]:
combined_data[pd.isnull(combined_data['Age'])] = median_age

In [15]:
average_fare = np.average(combined_data[np.logical_not(pd.isnull(combined_data['Fare']))]['Fare'])

In [16]:
combined_data[pd.isnull(combined_data['Fare'])] = average_fare

In [17]:
train = combined_data[:train.shape[0]]
test = combined_data[train.shape[0]:]

In [19]:
print cross_val_score(model, train, labels, cv=10, scoring='accuracy').mean()

0.775688344115


In [20]:
for cval in range(1, 21):
    model = LogisticRegression(C=cval, n_jobs=-1)
    print cval, cross_val_score(model, train, labels, cv=10, scoring='accuracy').mean()

1 0.77342866871
2 0.775688344115
3 0.77456474861
4 0.775688344115
5 0.775688344115
6 0.775688344115
7 0.775688344115
8 0.775688344115
9 0.775688344115
10 0.775688344115
11 0.775688344115
12 0.775688344115
13 0.775688344115
14 0.775688344115
15 0.775688344115
16 0.775688344115
17 0.775688344115
18 0.775688344115
19 0.775688344115
20 0.775688344115


In [21]:
model = LogisticRegression(C=3, n_jobs=-1)
model.fit(train, labels)
preds = model.predict(test)

In [22]:
preds

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0,

In [23]:
submission = pd.DataFrame()
submission['PassengerId'] = ids
submission['Survived'] = preds
submission.to_csv('submission_logistic.csv', index=False)