In [167]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing, neighbors, svm
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,GradientBoostingClassifier, BaggingClassifier
from sklearn.cluster import KMeans
from scipy.stats import mode
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
import csv as csv
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report

# import warnings, random
# from collections import Counter
# from math import sqrt

In [98]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [99]:
def handle_non_numerical_data(df): 
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)

            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column] ))
    
    return df

In [154]:
# Deal with empty

# All missing Embarked -> just make them embark from most common place
if len(df.Embarked[ df.Embarked.isnull() ]) > 0:
    df.Embarked[ df.Embarked.isnull() ] = df.Embarked.dropna().mode().values

# All the ages with no data -> make the median of all Ages
median_age = df['Age'].dropna().median()
if len(df.Age[ df.Age.isnull() ]) > 0:
    df.loc[ (df.Age.isnull()), 'Age'] = median_age


# All the missing Fares -> assume median of their respective class
if len(df.Fare[ df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = df[ df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        df.loc[ (df.Fare.isnull()) & (df.Pclass == f+1 ), 'Fare'] = median_fare[f]

# # Assume room number on floor does not matter, change cabin to just be the floor
# df.Cabin[ df.Cabin.notnull()] = df.Cabin[ df.Cabin.notnull()].str[0]

        
# # All the missing Cabins -> assume median of their respective class
# if len(df.Cabin[ df.Cabin.isnull() ]) > 0:
#     median_fare = ["","",""]
#     for f in range(0,3):                                              # loop 0 to 2
#         z = df[ df.Pclass == f+1 ]['Cabin'].dropna().value_counts().index.values[0]#.median().astype(str)
#         median_fare[f] = z
#     for f in range(0,3):                                              # loop 0 to 2
#         df.loc[ (df.Cabin.isnull()) & (df.Pclass == f+1 ), 'Cabin'] = median_fare[f]



In [101]:
df.drop(['Name', 'PassengerId','Cabin','Ticket'], 1, inplace=True)
df.convert_objects(convert_numeric=True)
df.fillna(0,inplace=True)

  from ipykernel import kernelapp as app


In [102]:
df = handle_non_numerical_data(df)
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,32.204208,0.897868
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,49.693429,0.514624
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,7.9104,1.0
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,1.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,31.0,1.0
max,1.0,3.0,1.0,80.0,8.0,6.0,512.3292,2.0


In [141]:

df_test.describe()
    

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,418.0,418.0,418.0,418.0,418.0,418.0,418.0
mean,2.26555,0.363636,29.599282,0.447368,0.392344,35.560845,0.866029
std,0.841838,0.481622,12.70377,0.89676,0.981429,55.856972,0.580452
min,1.0,0.0,0.17,0.0,0.0,0.0,0.0
25%,1.0,0.0,23.0,0.0,0.0,7.8958,1.0
50%,3.0,0.0,27.0,0.0,0.0,14.4542,1.0
75%,3.0,1.0,35.75,1.0,0.0,31.471875,1.0
max,3.0,1.0,76.0,8.0,9.0,512.3292,2.0


In [103]:

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,22.0,1,0,7.25,1
1,1,1,1,38.0,1,0,71.2833,0
2,1,3,1,26.0,0,0,7.925,1
3,1,1,1,35.0,1,0,53.1,1
4,0,3,0,35.0,0,0,8.05,1


In [160]:
X = np.array(df.drop(['Survived'], 1).astype(float))
print(X.shape)
# enc = OneHotEncoder(categorical_features=np.array([True,True,False,False,False,False,True]))#[0,1,4,6])
# X = enc.fit_transform(X)


print(X.shape)
X = preprocessing.scale(X, with_mean=False)
y = np.array(df['Survived'])





X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

(891, 7)
(891, 7)


In [72]:
# clf = KMeans(n_clusters=2)
# clf.fit(X_train)

# correct = 0
# for i in range(len(X_test)):
#     predict_me = np.array(X[i].astype(float))
#     predict_me = predict_me.reshape(-1, len(predict_me))
#     prediction = clf.predict(predict_me)
#     if prediction[0] == y_test[i]:
#         correct += 1

# predict = correct/len(X_test)
# print(max(1- predict, predict))

In [161]:
df_test = pd.read_csv('data/test.csv')

PassengerId = df_test['PassengerId'].values


# Deal with empty

# All missing Embarked -> just make them embark from most common place
if len(df_test.Embarked[ df_test.Embarked.isnull() ]) > 0:
    df_test.Embarked[ df_test.Embarked.isnull() ] = df_test.Embarked.dropna().mode().values

# All the ages with no data -> make the median of all Ages
median_age = df_test['Age'].dropna().median()
if len(df_test.Age[ df_test.Age.isnull() ]) > 0:
    df_test.loc[ (df_test.Age.isnull()), 'Age'] = median_age


# All the missing Fares -> assume median of their respective class
if len(df_test.Fare[ df_test.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = df_test[ df_test.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        df_test.loc[ (df_test.Fare.isnull()) & (df_test.Pclass == f+1 ), 'Fare'] = median_fare[f]

# # Assume room number on floor does not matter, change cabin to just be the floor
# df_test.Cabin[ df_test.Cabin.notnull()] = df_test.Cabin[ df_test.Cabin.notnull()].str[0]

        
# # All the missing Cabins -> assume median of their respective class
# if len(df_test.Cabin[ df_test.Cabin.isnull() ]) > 0:
#     median_fare = ["","",""]
#     for f in range(0,3):                                              # loop 0 to 2
#         z = df_test[ df_test.Pclass == f+1 ]['Cabin'].dropna().value_counts().index.values[0]#.median().astype(str)
#         median_fare[f] = z
#     for f in range(0,3):                                              # loop 0 to 2
#         df_test.loc[ (df_test.Cabin.isnull()) & (df_test.Pclass == f+1 ), 'Cabin'] = median_fare[f]





df_test.drop(['Name', 'PassengerId','Ticket','Cabin'], 1, inplace=True)
df_test.convert_objects(convert_numeric=True)
df_test.fillna(0,inplace=True)




df_test = handle_non_numerical_data(df_test)
print(df.head())
print(df_test.head())
X_valid = np.array(df_test.astype(float))

# enc = OneHotEncoder(categorical_features=np.array([True,True,False,False,False,False,True]))#[0,1,4,6])
# X_valid = enc.fit_transform(X_valid)

X_valid = preprocessing.scale(X_valid, with_mean = False)
print(X_valid.shape)

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0         0       3    0  22.0      1      0   7.2500         1
1         1       1    1  38.0      1      0  71.2833         0
2         1       3    1  26.0      0      0   7.9250         1
3         1       1    1  35.0      1      0  53.1000         1
4         0       3    0  35.0      0      0   8.0500         1
   Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    0  34.5      0      0   7.8292         2
1       3    1  47.0      1      0   7.0000         1
2       2    0  62.0      0      0   9.6875         2
3       3    0  27.0      0      0   8.6625         1
4       3    1  22.0      1      1  12.2875         1
(418, 7)




In [74]:
# clf = BaggingClassifier( neighbors.KNeighborsClassifier(n_jobs=-1))
# clf.fit(X_train, y_train)
# train_accuracy = clf.score(X_train, y_train)
# test_accuracy = clf.score(X_test, y_test)
# print(train_accuracy, test_accuracy)

In [126]:

param_grid = [
  {'C': [1,10,100,1000], 'gamma': [0.001], 'kernel': ['rbf'],
'class_weight' : ['balanced', None]}]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

num_folds = 10
num_instances = len(y_train)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)


clf = GridSearchCV(svm.SVC(C=1), param_grid, cv=kfold, scoring='accuracy')
clf.fit(X_train, y_train)

print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

{'C': 1000, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': None}
0.775 (+/-0.093) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced'}
0.779 (+/-0.097) for {'C': 1, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': None}
0.779 (+/-0.097) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced'}
0.779 (+/-0.097) for {'C': 10, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': None}
0.785 (+/-0.097) for {'C': 100, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced'}
0.779 (+/-0.095) for {'C': 100, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': None}
0.801 (+/-0.120) for {'C': 1000, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': 'balanced'}
0.812 (+/-0.116) for {'C': 1000, 'kernel': 'rbf', 'gamma': 0.001, 'class_weight': None}
Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.84     

In [76]:

svm_kernels = ["poly", "linear", "rbf"]
accuracies = {x : [0.0,0.0, [],[]]  for x in svm_kernels}


X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

mat = []
for kernel in svm_kernels:
    clf = BaggingClassifier(svm.SVC(kernel=kernel, C = 2))
    clf.fit(X_train, y_train)
        
        
    accuracies[kernel][0] += clf.score(X_train, y_train)
    accuracies[kernel][1] += clf.score(X_test, y_test)
    
    accuracies[kernel][2] = clf.predict(X_train)
    accuracies[kernel][3] = clf.predict(X_test)

    print(kernel, ": train accuracy", accuracies[kernel][0], ", test accuracy", accuracies[kernel][1])
    mat.append(np.array(accuracies[kernel][3]))
# mat = [np.array(x[2]) for x in accuracies]

forest = RandomForestClassifier(n_estimators=200)
forest = forest.fit( X_train, y_train )


test_output = forest.predict( X_test).astype(int)
accuracy = forest.score(X_test,y_test)


weights = np.array([x[1] for x in accuracies.values()])
weights = np.append(weights,accuracy)
mat.append(test_output)
print(weights)
mat = np.matrix(mat)

mat = np.array(np.average(mat, axis = 0, weights = weights)) #basically mode
mat = np.round(mat[0])

correct = 0

for i in range(len(X_test)):
    prediction = mat[i]
    if prediction == y_test[i]:
        correct += 1
print(correct/len(X_test))

poly : train accuracy 0.85393258427 , test accuracy 0.77094972067
linear : train accuracy 0.796348314607 , test accuracy 0.782122905028
rbf : train accuracy 0.862359550562 , test accuracy 0.782122905028
[ 0.78212291  0.77094972  0.78212291  0.77094972]
0.776536312849162


In [163]:
train_accuracy = 0.0
test_accuracy = 0.0

for i in range(n):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)
        
    forest = GradientBoostingClassifier(n_estimators=200)# RandomForestClassifier(n_estimators=200,n_jobs=-1)
    forest = forest.fit( X_train, y_train )


    test_output = forest.predict( X_test).astype(int)
    correct = 0
    for i in range(len(y_test)):
        prediction = test_output[i]
        if prediction == y_test[i]:
            correct += 1

    test_predict = correct/len(y_test)
    test_accuracy += test_predict
    
    
    train_output = forest.predict( X_train).astype(int)
    correct = 0
    for i in range(len(y_train)):
        prediction = train_output[i]
        if prediction == y_train[i]:
            correct += 1

    train_predict = correct/len(y_train)
    train_accuracy += train_predict
test_accuracy /= n
train_accuracy /= n
print("train accuracy:", train_accuracy, ", test accuracy:", test_accuracy)

train accuracy: 0.9351123595505622 , test accuracy: 0.8290502793296086


In [174]:
train_accuracy = 0.0
test_accuracy = 0.0

for i in range(n):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)
        
    clf = LogisticRegression(C=.1) #svm.SVC(kernel='rbf',C=10000,gamma=0.001)
    clf.fit(X_train,y_train)


    test_output = clf.predict( X_test).astype(int)
    correct = 0
    for i in range(len(y_test)):
        prediction = test_output[i]
        if prediction == y_test[i]:
            correct += 1

    test_predict = correct/len(y_test)
    test_accuracy += test_predict
    
    
    train_output = clf.predict( X_train).astype(int)
    correct = 0
    for i in range(len(y_train)):
        prediction = train_output[i]
        if prediction == y_train[i]:
            correct += 1

    train_predict = correct/len(y_train)
    train_accuracy += train_predict
test_accuracy /= n
train_accuracy /= n
print("train accuracy:", train_accuracy, ", test accuracy:", test_accuracy)

train accuracy: 0.7964536516853936 , test accuracy: 0.7948324022346369


In [166]:
forest =  GradientBoostingClassifier(n_estimators=200)
forest = forest.fit( X, y )
output = forest.predict(X_valid)


predictions_file = open("data/titanicSVM5.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(PassengerId, output))
predictions_file.close()
print('Done.')

Done.


In [151]:
clf = svm.SVC(kernel='rbf',C=10000,gamma=0.001)
clf.fit(X,y)
output = clf.predict(X_valid)


predictions_file = open("data/titanicSVM4.csv", "w")
open_file_object = csv.writer(predictions_file)
open_file_object.writerow(["PassengerId","Survived"])
open_file_object.writerows(zip(PassengerId, output))
predictions_file.close()
print('Done.')

Done.


In [185]:

param_grid = [
  {'C': [0.001,0.01,0.1,1,10,100,1000]}]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

num_folds = 10
num_instances = len(y_train)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)


clf = GridSearchCV(LogisticRegression(n_jobs=-1), param_grid, cv=kfold, scoring='accuracy')
clf.fit(X_train, y_train)

print(clf.best_params_)
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
y_true, y_pred = y_test, clf.predict(X_test)
print(classification_report(y_true, y_pred))
print()

{'C': 1}
0.688 (+/-0.120) for {'C': 0.001}
0.788 (+/-0.074) for {'C': 0.01}
0.796 (+/-0.072) for {'C': 0.1}
0.799 (+/-0.088) for {'C': 1}
0.795 (+/-0.087) for {'C': 10}
0.798 (+/-0.089) for {'C': 100}
0.796 (+/-0.094) for {'C': 1000}
Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.80      0.85      0.82       116
          1       0.69      0.60      0.64        63

avg / total       0.76      0.77      0.76       179


