In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier, LinearRegression, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

passenger_ids = test["PassengerId"]

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
total = len(train)

In [5]:
for i in train:
    print (i, train[i].dtype)

PassengerId int64
Survived int64
Pclass int64
Name object
Sex object
Age float64
SibSp int64
Parch int64
Ticket object
Fare float64
Cabin object
Embarked object


In [6]:
lab_enc = preprocessing.LabelEncoder()

train = train.drop(['PassengerId', 'Name'], axis = 1)

test = test.drop(['PassengerId', 'Name'], axis = 1)

In [7]:
for i in train:
    if (train[i].dtypes == 'object'):
        train[i] = train[i].fillna('Unknown')
        train[i] = lab_enc.fit_transform(train[i])
    else:
        train[i] = lab_enc.fit_transform(train[i])
        
for i in test:
    if (test[i].dtypes == 'object'):
        test[i] = test[i].fillna('Unknown')
        test[i] = lab_enc.fit_transform(test[i])
    else:
        test[i] = lab_enc.fit_transform(test[i])

In [8]:
Y = np.array(train['Survived'])
X = np.array(train.drop(['Survived'], axis = 1))
# X = preprocessing.normalize(X)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.33, random_state = 42)

In [9]:
epochs = 10

sgd = SGDClassifier(random_state = 42, max_iter = total * epochs , loss = 'hinge', penalty = 'l2')
svc = SVC(random_state = 42, max_iter = total * epochs)
l_svc = LinearSVC(random_state = 42, max_iter = total * epochs, dual = False)
dtc = DecisionTreeClassifier(random_state = 42, presort = True)
knc = KNeighborsClassifier()
linreg = LinearRegression(normalize = True)
logreg = LogisticRegression(random_state = 42, max_iter = total * epochs)
rfc = RandomForestClassifier(n_estimators = 100, random_state = 42, warm_start = True)

In [10]:
def trainModel(model, model_name):
    model.fit(X_train, Y_train)

    acc = model.score(X_test, Y_test)

    print ("Accuracy of the {model_name} model = {acc:.4f}".format(model_name = model_name
                                                               ,acc = acc * 100))

In [11]:
models = [sgd, svc, l_svc, dtc, knc, linreg, logreg, rfc]
names = ['SGD', 'SVC', 'LinearSVC', 'DTC', 'KNC', 'LinearReg', 'LogisticReg', 'RFC']
total_models = len(models)

In [12]:
for i in range(total_models):
    trainModel(models[i], names[i])

Accuracy of the SGD model = 68.8136
Accuracy of the SVC model = 60.3390
Accuracy of the LinearSVC model = 80.6780
Accuracy of the DTC model = 76.2712
Accuracy of the KNC model = 66.7797
Accuracy of the LinearReg model = 41.3697
Accuracy of the LogisticReg model = 78.9831
Accuracy of the RFC model = 80.3390


In [13]:
def testModel(model, model_name):
    
    correct = 0
    total = 0
    
    for input_array, sur in zip(X, Y):
        input_array = input_array.reshape(1, -1)
        prediction = model.predict(input_array)
        survived = sur
        
        if (prediction == survived):
            correct += 1
        
        total += 1
        
    acc = float(correct)/total
    
    print ("Accuracy of the model {model_name} = {acc:.4f}".format(model_name = model_name,
                                                                      acc = acc * 100))

In [14]:
for i in range(total_models):
    testModel(models[i], names[i])

Accuracy of the model SGD = 69.0236
Accuracy of the model SVC = 86.5320
Accuracy of the model LinearSVC = 80.0224
Accuracy of the model DTC = 92.1437
Accuracy of the model KNC = 74.5230
Accuracy of the model LinearReg = 0.0000
Accuracy of the model LogisticReg = 79.6857
Accuracy of the model RFC = 93.3782


In [15]:
test.head()

test.shape

print (test.dtypes)

Pclass      int64
Sex         int64
Age         int64
SibSp       int64
Parch       int64
Ticket      int64
Fare        int64
Cabin       int64
Embarked    int64
dtype: object


In [32]:
model = rfc

submission = []

test = np.array(test)

for input_array, i in zip(test, passenger_ids):
    input_array = input_array.reshape(1, -1)
    prediction = model.predict(input_array)[0]
    submission.append([i, prediction])

submission = pd.DataFrame(np.array(submission).reshape(418,2), columns = ["PassengerId", "Survived"])

submission.to_csv('submission.csv', index = False)

In [33]:
print (pd.read_csv("submission.csv"))

     PassengerId  Survived
0            892         0
1            893         1
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         1
18           910         1
19           911         1
20           912         0
21           913         0
22           914         1
23           915         0
24           916         1
25           917         0
26           918         1
27           919         1
28           920         0
29           921         1
..           ...       ...
388         1280         0
389         1281         1
390         1282         0
391         1283         1
392         1284         1
3