In [406]:
import numpy as np
import pandas as pd

from sklearn import preprocessing, cross_validation, neighbors, svm
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.cluster import KMeans
from scipy.stats import mode
import csv as csv




# import warnings, random
# from collections import Counter
# from math import sqrt

In [385]:
df = pd.read_csv('data/train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [386]:
def handle_non_numerical_data(df): 
    columns = df.columns.values
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]

        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)

            x = 0
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x += 1

            df[column] = list(map(convert_to_int,df[column] ))
    
    return df

In [387]:
# Deal with empty

# All missing Embarked -> just make them embark from most common place
if len(df.Embarked[ df.Embarked.isnull() ]) > 0:
    df.Embarked[ df.Embarked.isnull() ] = df.Embarked.dropna().mode().values

# All the ages with no data -> make the median of all Ages
median_age = df['Age'].dropna().median()
if len(df.Age[ df.Age.isnull() ]) > 0:
    df.loc[ (df.Age.isnull()), 'Age'] = median_age


# All the missing Fares -> assume median of their respective class
if len(df.Fare[ df.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = df[ df.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        df.loc[ (df.Fare.isnull()) & (df.Pclass == f+1 ), 'Fare'] = median_fare[f]

# Assume room number on floor does not matter, change cabin to just be the floor
df.Cabin[ df.Cabin.notnull()] = df.Cabin[ df.Cabin.notnull()].str[0]

        
# All the missing Cabins -> assume median of their respective class
if len(df.Cabin[ df.Cabin.isnull() ]) > 0:
    median_fare = ["","",""]
    for f in range(0,3):                                              # loop 0 to 2
        z = df[ df.Pclass == f+1 ]['Cabin'].dropna().value_counts().index.values[0]#.median().astype(str)
        median_fare[f] = z
    for f in range(0,3):                                              # loop 0 to 2
        df.loc[ (df.Cabin.isnull()) & (df.Pclass == f+1 ), 'Cabin'] = median_fare[f]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [388]:
df.drop(['Name', 'PassengerId'], 1, inplace=True)
df.convert_objects(convert_numeric=True)
df.fillna(0,inplace=True)

  from ipykernel import kernelapp as app


In [389]:
df = handle_non_numerical_data(df)
df.describe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,0.352413,29.361582,0.523008,0.381594,349.942761,32.204208,3.432099,1.638608
std,0.486592,0.836071,0.47799,13.019697,1.102743,0.806057,193.40926,49.693429,1.473604,0.635673
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,2.0,0.0,22.0,0.0,0.0,185.5,7.9104,4.0,1.0
50%,0.0,3.0,0.0,28.0,0.0,0.0,362.0,14.4542,4.0,2.0
75%,1.0,3.0,1.0,35.0,1.0,0.0,516.5,31.0,4.0,2.0
max,1.0,3.0,1.0,80.0,8.0,6.0,680.0,512.3292,7.0,2.0


In [390]:

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,0,22.0,1,0,615,7.25,4,2
1,1,1,1,38.0,1,0,394,71.2833,0,1
2,1,3,1,26.0,0,0,541,7.925,4,2
3,1,1,1,35.0,1,0,527,53.1,0,2
4,0,3,0,35.0,0,0,313,8.05,4,2


In [391]:
X = np.array(df.drop(['Survived'], 1).astype(float))
X = preprocessing.scale(X)
y = np.array(df['Survived'])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)

In [392]:
clf = KMeans(n_clusters=2)
clf.fit(X_train)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [393]:
correct = 0
for i in range(len(X_test)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = clf.predict(predict_me)
    if prediction[0] == y_test[i]:
        correct += 1

predict = correct/len(X_test)
print(max(1- predict, predict))

0.5586592178770949


In [394]:
df_test = pd.read_csv('data/test.csv')

PassengerId = df_test['PassengerId'].values


# Deal with empty

# All missing Embarked -> just make them embark from most common place
if len(df_test.Embarked[ df_test.Embarked.isnull() ]) > 0:
    df_test.Embarked[ df_test.Embarked.isnull() ] = df_test.Embarked.dropna().mode().values

# All the ages with no data -> make the median of all Ages
median_age = df_test['Age'].dropna().median()
if len(df_test.Age[ df_test.Age.isnull() ]) > 0:
    df_test.loc[ (df_test.Age.isnull()), 'Age'] = median_age


# All the missing Fares -> assume median of their respective class
if len(df_test.Fare[ df_test.Fare.isnull() ]) > 0:
    median_fare = np.zeros(3)
    for f in range(0,3):                                              # loop 0 to 2
        median_fare[f] = df_test[ df_test.Pclass == f+1 ]['Fare'].dropna().median()
    for f in range(0,3):                                              # loop 0 to 2
        df_test.loc[ (df_test.Fare.isnull()) & (df_test.Pclass == f+1 ), 'Fare'] = median_fare[f]

# Assume room number on floor does not matter, change cabin to just be the floor
df_test.Cabin[ df_test.Cabin.notnull()] = df_test.Cabin[ df_test.Cabin.notnull()].str[0]

        
# All the missing Cabins -> assume median of their respective class
if len(df_test.Cabin[ df_test.Cabin.isnull() ]) > 0:
    median_fare = ["","",""]
    for f in range(0,3):                                              # loop 0 to 2
        z = df_test[ df_test.Pclass == f+1 ]['Cabin'].dropna().value_counts().index.values[0]#.median().astype(str)
        median_fare[f] = z
    for f in range(0,3):                                              # loop 0 to 2
        df_test.loc[ (df_test.Cabin.isnull()) & (df_test.Pclass == f+1 ), 'Cabin'] = median_fare[f]





df_test.drop(['Name', 'PassengerId'], 1, inplace=True)
df_test.convert_objects(convert_numeric=True)
df_test.fillna(0,inplace=True)




df_test = handle_non_numerical_data(df_test)
print(df.head())
print(df_test.head())
X_valid = np.array(df_test.astype(float))
X_valid = preprocessing.scale(X_valid)

   Survived  Pclass  Sex   Age  SibSp  Parch  Ticket     Fare  Cabin  Embarked
0         0       3    0  22.0      1      0     615   7.2500      4         2
1         1       1    1  38.0      1      0     394  71.2833      0         1
2         1       3    1  26.0      0      0     541   7.9250      4         2
3         1       1    1  35.0      1      0     527  53.1000      0         2
4         0       3    0  35.0      0      0     313   8.0500      4         2
   Pclass  Sex   Age  SibSp  Parch  Ticket     Fare  Cabin  Embarked
0       3    0  34.5      0      0     225   7.8292      4         1
1       3    1  47.0      1      0     103   7.0000      4         2
2       2    0  62.0      0      0     216   9.6875      4         1
3       3    0  27.0      0      0     286   8.6625      4         2
4       3    1  22.0      1      1     117  12.2875      4         2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [408]:
clf = BaggingClassifier( neighbors.KNeighborsClassifier(n_jobs=-1))
clf.fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(train_accuracy, test_accuracy)

0.865168539326 0.776536312849


In [409]:
train_accuracy = 0.0
test_accuracy = 0.0
n = 40
svm_kernels = ["poly", "linear", "rbf"]
accuracies = {x : [0.0,0.0]  for x in svm_kernels}


for i in range(n):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)
        
    for kernel in svm_kernels:
        clf = BaggingClassifier(svm.SVC(kernel=kernel, C = 2))
        clf.fit(X_train, y_train)
        
        
        accuracies[kernel][0] += clf.score(X_train, y_train)
        accuracies[kernel][1] += clf.score(X_test, y_test)
for kernel in svm_kernels:
    accuracies[kernel][0] /= n 
    accuracies[kernel][1] /= n

    print(kernel, ": train accuracy", accuracies[kernel][0], ", test accuracy", accuracies[kernel][1])

poly : train accuracy 0.856987359551 , test accuracy 0.802653631285
linear : train accuracy 0.792134831461 , test accuracy 0.790083798883
sigmoid : train accuracy 0.616046348315 , test accuracy 0.616620111732
rbf : train accuracy 0.857514044944 , test accuracy 0.823324022346


In [495]:

svm_kernels = ["poly", "linear", "rbf"]
accuracies = {x : [0.0,0.0, [],[]]  for x in svm_kernels}


X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)
        
mat = []
for kernel in svm_kernels:
    clf = BaggingClassifier(svm.SVC(kernel=kernel, C = 2))
    clf.fit(X_train, y_train)
        
        
    accuracies[kernel][0] += clf.score(X_train, y_train)
    accuracies[kernel][1] += clf.score(X_test, y_test)
    
    accuracies[kernel][2] = clf.predict(X_train)
    accuracies[kernel][3] = clf.predict(X_test)

    print(kernel, ": train accuracy", accuracies[kernel][0], ", test accuracy", accuracies[kernel][1])
    mat.append(np.array(accuracies[kernel][3]))
# mat = [np.array(x[2]) for x in accuracies]

forest = RandomForestClassifier(n_estimators=200)
forest = forest.fit( X_train, y_train )


test_output = forest.predict( X_test).astype(int)
accuracy = forest.score(X_test,y_test)


weights = np.array([x[1] for x in accuracies.values()])
weights = np.append(weights,accuracy)
mat.append(test_output)
print(weights)
mat = np.matrix(mat)

mat = np.array(np.average(mat, axis = 0, weights = weights)) #basically mode
mat = np.round(mat[0])

correct = 0

for i in range(len(X_test)):
    prediction = mat[i]
    if prediction == y_test[i]:
        correct += 1
print(correct/len(X_test))

poly : train accuracy 0.858146067416 , test accuracy 0.798882681564
linear : train accuracy 0.790730337079 , test accuracy 0.787709497207
rbf : train accuracy 0.86095505618 , test accuracy 0.793296089385
[ 0.7877095   0.79329609  0.79888268  0.82122905]
0.8156424581005587


In [494]:
train_accuracy = 0.0
test_accuracy = 0.0

for i in range(n):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X,y,test_size=0.2)
        
    forest = RandomForestClassifier(n_estimators=200)
    forest = forest.fit( X_train, y_train )


    test_output = forest.predict( X_test).astype(int)
    correct = 0
    for i in range(len(X_test)):
        prediction = test_output[i]
        if prediction == y_test[i]:
            correct += 1

    test_predict = correct/len(X_test)
    test_accuracy += test_predict
    
    
    train_output = forest.predict( X_train).astype(int)
    correct = 0
    for i in range(len(X_train)):
        prediction = train_output[i]
        if prediction == y_train[i]:
            correct += 1

    train_predict = correct/len(X_train)
    train_accuracy += train_predict
test_accuracy /= n
train_accuracy /= n
print("train accuracy:", train_accuracy, ", test accuracy:", test_accuracy)

train accuracy: 0.9981741573033709 , test accuracy: 0.825139664804469
