In [1]:
import pandas
import numpy

In [2]:
## Load the training set; the first row of the file holds the column names.
data_frame = pandas.read_csv("train.csv", header=0)

In [3]:
def convert_categorical_input(column_name, data_frame):
    """One-hot encode a categorical column of ``data_frame`` in place.

    For every distinct value ``v`` found in ``data_frame[column_name]`` a new
    integer column named ``"<column_name>=<v>"`` is added, holding 1 on the
    rows where the original column equals ``v`` and 0 elsewhere.  The original
    column is left untouched.

    Missing values get their own indicator column (named with ``str`` of the
    missing marker, e.g. ``"Embarked=nan"``) that flags the rows where the
    value is missing.

    Parameters
    ----------
    column_name : str
        Name of the categorical column to encode.
    data_frame : pandas.DataFrame
        Frame to add the indicator columns to; it is modified in place.

    Returns
    -------
    None
    """
    categorical_column = data_frame[column_name]
    for column_value in categorical_column.unique():
        if pandas.isnull(column_value):
            # NaN never compares equal to itself, so an ``==`` test would
            # leave the missing-value indicator at 0 on every row.
            is_member = categorical_column.isnull()
        else:
            is_member = categorical_column == column_value
        data_frame[column_name + '=' + str(column_value)] = is_member.astype(int)

In [4]:
## Add 0/1 indicator columns for each categorical feature of the training set.
for categorical_name in ('Sex', 'Embarked'):
    convert_categorical_input(categorical_name, data_frame)

In [5]:
## First pass at a modelling frame: remove identifier, label and raw text columns.
processed_data = data_frame.drop(
    ['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Ticket', 'Embarked'],
    axis=1)

In [6]:
## There are two passengers whose 'Embarked' value is empty, and both SURVIVED.
## Hypothesis: they did not embark on the journey itself, so their rows are
## removed from the dataframe.
## The rows are selected by value rather than by hard-coded position so that
## re-running this cell cannot delete unrelated rows.
missing_embarked = ~data_frame['Embarked'].isin(['S', 'C', 'Q'])
## drop() creates a copy by default; inplace=True deletes from this dataframe itself.
data_frame.drop(data_frame[missing_embarked].index, inplace=True)
## The indicator column for the missing category is no longer needed.
## Guarded so the cell can be re-run after the column is already gone.
if 'Embarked=nan' in data_frame.columns:
    data_frame.drop('Embarked=nan', axis=1, inplace=True)

In [7]:
## Sanity check: list the distinct embarkation ports left in the frame.
data_frame.loc[:, 'Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [8]:
## Discard the frame built before the cleanup above; it is rebuilt from the
## cleaned data_frame in the next cell.
del processed_data

In [9]:
## Build the modelling frame from the cleaned data: remove the identifier and
## the raw text columns (their information now lives in the indicator columns).
## 'Survived' is intentionally kept here: it is split off as the target in a
## later cell, so dropping it at this point would break that cell.
processed_data = data_frame.drop(['PassengerId','Name','Sex','Ticket','Embarked', 'Cabin'], axis=1)
## Show only the first rows instead of dumping the whole frame.
processed_data.head(10)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex=male,Sex=female,Embarked=S,Embarked=C,Embarked=Q
0,0,3,22.0,1,0,7.2500,1,0,1,0,0
1,1,1,38.0,1,0,71.2833,0,1,0,1,0
2,1,3,26.0,0,0,7.9250,0,1,1,0,0
3,1,1,35.0,1,0,53.1000,0,1,1,0,0
4,0,3,35.0,0,0,8.0500,1,0,1,0,0
5,0,3,,0,0,8.4583,1,0,0,0,1
6,0,1,54.0,0,0,51.8625,1,0,1,0,0
7,0,3,2.0,3,1,21.0750,1,0,1,0,0
8,1,3,27.0,0,2,11.1333,0,1,1,0,0
9,1,2,14.0,1,0,30.0708,0,1,0,1,0


In [10]:
## Count the missing (NULL) entries in every column.
pandas.isnull(processed_data).sum()

Survived        0
Pclass          0
Age           177
SibSp           0
Parch           0
Fare            0
Sex=male        0
Sex=female      0
Embarked=S      0
Embarked=C      0
Embarked=Q      0
dtype: int64

In [11]:
## Median age over the passengers whose age is known (missing values are skipped).
processed_data['Age'].median(skipna=True)

28.0

In [12]:
## Impute missing ages with the column median (28.0 for this training set).
## The result is assigned back to the frame: calling fillna(inplace=True) on a
## column selection is chained mutation and is not guaranteed to write through
## to processed_data.
processed_data['Age'] = processed_data['Age'].fillna(processed_data['Age'].median())

In [13]:
## Re-check the per-column missing-value counts after the imputation.
processed_data.isnull().sum(axis=0)

Survived      0
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          0
Sex=male      0
Sex=female    0
Embarked=S    0
Embarked=C    0
Embarked=Q    0
dtype: int64

In [14]:
## Replace every fare with its square root.
processed_data['Fare'] = processed_data['Fare'].apply(numpy.sqrt)

In [15]:
## Missing values are filled and the categorical columns are encoded:
## the frame is finally ready for an ML algorithm. Display it for a last look.
processed_data

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex=male,Sex=female,Embarked=S,Embarked=C,Embarked=Q
0,0,3,22.0,1,0,2.692582,1,0,1,0,0
1,1,1,38.0,1,0,8.442944,0,1,0,1,0
2,1,3,26.0,0,0,2.815138,0,1,1,0,0
3,1,1,35.0,1,0,7.286975,0,1,1,0,0
4,0,3,35.0,0,0,2.837252,1,0,1,0,0
5,0,3,28.0,0,0,2.908316,1,0,0,0,1
6,0,1,54.0,0,0,7.201562,1,0,1,0,0
7,0,3,2.0,3,1,4.590752,1,0,1,0,0
8,1,3,27.0,0,2,3.336660,0,1,1,0,0
9,1,2,14.0,1,0,5.483685,0,1,0,1,0


In [16]:
## Split the label off from the features: keep an independent copy of
## 'Survived' as the target, then remove it from the feature frame.
target_value = processed_data['Survived'].copy(deep=True)
processed_data = processed_data.drop('Survived', axis=1)

In [20]:
from sklearn import linear_model, datasets

## Train a logistic-regression classifier with default settings on the
## training features; fit() hands back the estimator, which is then displayed.
logistic_regression = linear_model.LogisticRegression().fit(processed_data, target_value)
logistic_regression

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
## Load the test set and push it through the same preparation as the training set.
test_data = pandas.read_csv("test.csv", header=0)

convert_categorical_input('Sex', test_data)
convert_categorical_input('Embarked', test_data)

processed_test_data = test_data.drop(['PassengerId','Name','Sex','Ticket','Embarked', 'Cabin'], axis=1)

## The indicator columns are created in order of first appearance, which
## differs between the two files (training: Embarked=S,C,Q; test: Embarked=Q,S,C).
## The models consume the features by position, so the test columns are put in
## exactly the training order. A category absent from the test file gets an
## all-zero indicator column.
training_features = [column for column in processed_data.columns if column != 'Survived']
processed_test_data = processed_test_data.reindex(columns=training_features, fill_value=0)

processed_test_data['Fare'] = numpy.sqrt(processed_test_data['Fare'])

## Missing values per column before the age imputation.
print(processed_test_data.isnull().sum())

## Fill missing ages with 28, the median age of the training set. Assigning
## the result back avoids a chained in-place fillna on a column selection.
processed_test_data['Age'] = processed_test_data['Age'].fillna(28)

## Missing values per column after the age imputation.
print(processed_test_data.isnull().sum())

Pclass         0
Age           86
SibSp          0
Parch          0
Fare           1
Sex=male       0
Sex=female     0
Embarked=Q     0
Embarked=S     0
Embarked=C     0
dtype: int64
Pclass        0
Age           0
SibSp         0
Parch         0
Fare          1
Sex=male      0
Sex=female    0
Embarked=Q    0
Embarked=S    0
Embarked=C    0
dtype: int64


In [32]:
## Show the test rows that still contain at least one missing value.
processed_test_data.loc[pandas.isnull(processed_test_data).any(axis=1)]

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex=male,Sex=female,Embarked=Q,Embarked=S,Embarked=C
152,3,60.5,0,0,,1,0,0,1,0


In [38]:
## Compare the median and mean (square-root) fare of third-class passengers in
## the test and training sets, to pick a fill value for the missing test fare.
## print(...) with a single argument behaves the same on Python 2 and Python 3.
test_third_class = processed_test_data[processed_test_data['Pclass'] == 3]
train_third_class = processed_data[processed_data['Pclass'] == 3]

for label, third_class in (('Test data', test_third_class),
                           ('Train data', train_third_class)):
    print(label)
    print(third_class['Fare'].median())
    print(third_class['Fare'].mean())

Test data
2.80994661871
3.35048069255
Train data
2.83725219182
3.47646918724


In [39]:
## Replace the one missing fare (a third-class passenger) by 2.8, the rounded
## median square-root fare of third-class passengers.
## The result is assigned back to the frame: calling fillna(inplace=True) on a
## column selection is chained mutation and may not write through.
processed_test_data['Fare'] = processed_test_data['Fare'].fillna(2.8)

In [41]:
## Predict survival for the test passengers with the logistic-regression
## model fitted on the training features earlier.
predictions = logistic_regression.predict(processed_test_data)

In [43]:
## Pair every test passenger id with its predicted label.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
output = pandas.DataFrame(submission_columns)

In [46]:
## Write the logistic-regression submission file, without the row index.
output.to_csv(path_or_buf="submission_logistic_regression.csv", index=False)

In [48]:
from sklearn import svm

## Second model: a support-vector classifier trained on the same features.
SVM = svm.SVC(C=10, gamma=0.01)
SVM.fit(processed_data, target_value)

## Predict on the test set and write a separate submission file.
predictions = SVM.predict(processed_test_data)
svm_submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
output = pandas.DataFrame(svm_submission_columns)
output.to_csv("submission_SVM1.csv", index=False)