In [1]:
import pandas as pd

train = pd.read_csv("train.csv", index_col=0)

y = train["Survived"] #Answer vector
print train.head()

             Survived  Pclass  \
PassengerId                     
1                   0       3   
2                   1       1   
3                   1       3   
4                   1       1   
5                   0       3   

                                                          Name     Sex   Age  \
PassengerId                                                                    
1                                      Braund, Mr. Owen Harris    male  22.0   
2            Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0   
3                                       Heikkinen, Miss. Laina  female  26.0   
4                 Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0   
5                                     Allen, Mr. William Henry    male  35.0   

             SibSp  Parch            Ticket     Fare Cabin Embarked  
PassengerId                                                          
1                1      0         A/5 21171   7.2500   NaN        S

In [2]:
def process_name(name):
    temp_name = name.split(',')[1].split()[0]
    if temp_name in ['Miss.', 'Rev.', 'Dona.', 'Mrs.', 'Col.', 'Ms.', 'Mr.', 'Dr.', 'Master.']:
        return temp_name
    return "NaN"
    #return name.split(',')[1].split()[0] # Just the prefix, Mr. Mrs. etc.

def process_cabin(cabin):
    if type(cabin) is float:
        return cabin
    return cabin[0] #We only want the letter, numbers don't matter
train = train.drop("Ticket", 1) # There's no method to the madness there, impossible to parse

train["Name"] = train["Name"].apply(process_name)
train["Cabin"] = train["Cabin"].apply(process_cabin)

print list(set(list(train["Name"]))) #Verify all the different names that pop up
print train.head()

['Miss.', 'Rev.', 'NaN', 'Mrs.', 'Col.', 'Ms.', 'Mr.', 'Dr.', 'Master.']
             Survived  Pclass   Name     Sex   Age  SibSp  Parch     Fare  \
PassengerId                                                                 
1                   0       3    Mr.    male  22.0      1      0   7.2500   
2                   1       1   Mrs.  female  38.0      1      0  71.2833   
3                   1       3  Miss.  female  26.0      0      0   7.9250   
4                   1       1   Mrs.  female  35.0      1      0  53.1000   
5                   0       3    Mr.    male  35.0      0      0   8.0500   

            Cabin Embarked  
PassengerId                 
1             NaN        S  
2               C        C  
3             NaN        S  
4               C        S  
5             NaN        S  


In [3]:
#Next step is converting everything that isn't NaN to an integer

from sklearn.preprocessing import LabelEncoder

sex_le = LabelEncoder()
train["Sex"] = sex_le.fit_transform(train["Sex"].values) #This works good for something binary, but not cabin/embarked

cols_to_transform = ["Name", "Cabin", "Embarked"]
train = pd.get_dummies(train, prefix=cols_to_transform)
print train.head()

             Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Name_Col.  \
PassengerId                                                                  
1                   0       3    1  22.0      1      0   7.2500        0.0   
2                   1       1    0  38.0      1      0  71.2833        0.0   
3                   1       3    0  26.0      0      0   7.9250        0.0   
4                   1       1    0  35.0      1      0  53.1000        0.0   
5                   0       3    1  35.0      0      0   8.0500        0.0   

             Name_Dr.  Name_Master.     ...      Cabin_B  Cabin_C  Cabin_D  \
PassengerId                             ...                                  
1                 0.0           0.0     ...          0.0      0.0      0.0   
2                 0.0           0.0     ...          0.0      1.0      0.0   
3                 0.0           0.0     ...          0.0      0.0      0.0   
4                 0.0           0.0     ...          0.0      1

In [4]:
#Next we remove everything that is NaN

from sklearn.preprocessing import Imputer
imr = Imputer(missing_values="NaN", strategy="mean", axis=0)

col_names = train.columns #Array of the column headers

imr.fit(train)
imputed_train = imr.transform(train.values)
print imputed_train[0]
train = pd.DataFrame(imputed_train)
train.columns = col_names
train = train.drop(["Name_NaN", "Cabin_T"], 1) # Not in the test dataset
print train.head()
print train.columns

[  0.     3.     1.    22.     1.     0.     7.25   0.     0.     0.     0.
   1.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     0.     1.  ]
   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Name_Col.  Name_Dr.  \
0       0.0     3.0  1.0  22.0    1.0    0.0   7.2500        0.0       0.0   
1       1.0     1.0  0.0  38.0    1.0    0.0  71.2833        0.0       0.0   
2       1.0     3.0  0.0  26.0    0.0    0.0   7.9250        0.0       0.0   
3       1.0     1.0  0.0  35.0    1.0    0.0  53.1000        0.0       0.0   
4       0.0     3.0  1.0  35.0    0.0    0.0   8.0500        0.0       0.0   

   Name_Master.     ...      Cabin_A  Cabin_B  Cabin_C  Cabin_D  Cabin_E  \
0           0.0     ...          0.0      0.0      0.0      0.0      0.0   
1           0.0     ...          0.0      0.0      1.0      0.0      0.0   
2           0.0     ...          0.0      0.0      0.0      0.0      0.0   
3           0.0     ...          0.0  

In [5]:
#Finally we want to standard some of the columns: age and fare

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler() #This uses the distrobution of the data to calculate the new values

to_transform = train[["Age", "Fare"]]

to_transform = stdsc.fit_transform(to_transform)

train[["Age", "Fare"]] = pd.DataFrame(to_transform)

from sklearn.cross_validation import train_test_split
#Now we split the data

train_train, train_test, y_train, y_test = train_test_split(train, y, test_size=0.4)


print train_train.head()

     Survived  Pclass  Sex       Age  SibSp  Parch      Fare  Name_Col.  \
728       0.0     2.0  1.0 -0.361618    1.0    0.0 -0.124920        0.0   
238       0.0     2.0  1.0 -0.823344    0.0    0.0 -0.437007        0.0   
871       1.0     1.0  0.0  1.331378    1.0    1.0  0.409741        0.0   
368       1.0     3.0  0.0  0.000000    0.0    0.0 -0.492378        0.0   
630       1.0     1.0  1.0  3.870872    0.0    0.0 -0.044381        0.0   

     Name_Dr.  Name_Master.     ...      Cabin_A  Cabin_B  Cabin_C  Cabin_D  \
728       0.0           0.0     ...          0.0      0.0      0.0      0.0   
238       0.0           0.0     ...          0.0      0.0      0.0      0.0   
871       0.0           0.0     ...          0.0      0.0      0.0      1.0   
368       0.0           0.0     ...          0.0      0.0      0.0      0.0   
630       0.0           0.0     ...          1.0      0.0      0.0      0.0   

     Cabin_E  Cabin_F  Cabin_G  Embarked_C  Embarked_Q  Embarked_S  
728  

In [6]:
#Alright, that didn't work let's go right to the PCA/LDA class

from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression

pca = PCA(n_components=8)
lda = LDA(n_components=8)

train_data_lda = lda.fit_transform(train_train, y_train)
train_data_pca = pca.fit_transform(train_train)

lr = LogisticRegression()

lr.fit(train_train, y_train)

print sum([lr.predict(train_test)[i]==y_test.values[i] for i in range(len(y_test.values))])/(len(y_test.values)*1.0)

print y_test.values



1.0
[0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 0
 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 0 0 0
 1 0 0 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 1 1
 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1
 0 1 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 1 1 0 1 0 0 1 0 0 1 1 0 0 0 1 0 1 0 1 1
 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0
 0 1 1 0 0 1 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0 1 0 1 0 0 1 0 0 1 0
 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 1 0 1 1
 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 1 1 0 1 1 1]




In [18]:
real_test = pd.read_csv("test.csv")

def prepare_data(train):
    def process_name(name):
        temp_name = name.split(',')[1].split()[0]
        if temp_name in ['Miss.', 'Rev.', 'Dona.', 'Mrs.', 'Col.', 'Ms.', 'Mr.', 'Dr.', 'Master.']:
            return temp_name
        return "NaN"
    #return name.split(',')[1].split()[0] # Just the prefix, Mr. Mrs. etc.

    def process_cabin(cabin):
        if type(cabin) is float:
            return cabin
        return cabin[0]
    train = train.drop("Ticket", 1)

    train["Name"] = train["Name"].apply(process_name)
    train["Cabin"] = train["Cabin"].apply(process_cabin)
    
    sex_le = LabelEncoder()
    train["Sex"] = sex_le.fit_transform(train["Sex"].values)
    
    
    cols_to_transform = ["Name", "Cabin", "Embarked"]
    train = pd.get_dummies(train, prefix=cols_to_transform)
    
    imr = Imputer(missing_values="NaN", strategy="mean", axis=0)
    col_names = train.columns #Array of the column headers

    imr.fit(train)
    imputed_train = imr.transform(train.values)
    train = pd.DataFrame(imputed_train)
    train.columns = col_names
    
    stdsc = StandardScaler() #This uses the distrobution of the data to calculate the new values
    to_transform = train[["Age", "Fare"]]
    to_transform = stdsc.fit_transform(to_transform)
    train[["Age", "Fare"]] = pd.DataFrame(to_transform)
    return train
    
real_test = prepare_data(real_test)
real_test = real_test.drop(["Name_Dona."], 1) #Not in the training data

print real_test.columns

print "PassengerId,Survived"
for index, i in enumerate(lr.predict(real_test)):
    print str(int(real_test.values[index][0])) + "," + str(i)



Index([u'PassengerId', u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare',
       u'Name_Col.', u'Name_Dr.', u'Name_Master.', u'Name_Miss.', u'Name_Mr.',
       u'Name_Mrs.', u'Name_Ms.', u'Name_Rev.', u'Cabin_A', u'Cabin_B',
       u'Cabin_C', u'Cabin_D', u'Cabin_E', u'Cabin_F', u'Cabin_G',
       u'Embarked_C', u'Embarked_Q', u'Embarked_S'],
      dtype='object')
PassengerId,Survived
892,1
893,1
894,1
895,1
896,1
897,1
898,1
899,1
900,1
901,1
902,1
903,1
904,1
905,1
906,1
907,1
908,1
909,1
910,1
911,1
912,1
913,1
914,1
915,1
916,1
917,1
918,1
919,1
920,1
921,1
922,1
923,1
924,1
925,1
926,1
927,1
928,1
929,1
930,1
931,1
932,1
933,1
934,1
935,1
936,1
937,1
938,1
939,1
940,1
941,1
942,1
943,1
944,1
945,1
946,1
947,1
948,1
949,1
950,1
951,1
952,1
953,1
954,1
955,1
956,1
957,1
958,1
959,1
960,1
961,1
962,1
963,1
964,1
965,1
966,1
967,1
968,1
969,1
970,1
971,1
972,1
973,1
974,1
975,1
976,1
977,1
978,1
979,1
980,1
981,1
982,1
983,1
984,1
985,1
986,1
987,1
988,1
989,1
990,1
991,1
992,1
9