In [1]:
import pandas as pd

# Load the training set; the first CSV column (PassengerId) becomes the index.
train = pd.read_csv("train.csv", index_col=0)

# Answer vector
y = train["Survived"]
# Don't want to train with this
train = train.drop(["Survived"], axis=1)
print(train.head())

             Pclass                                               Name  \
PassengerId                                                              
1                 3                            Braund, Mr. Owen Harris   
2                 1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
3                 3                             Heikkinen, Miss. Laina   
4                 1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
5                 3                           Allen, Mr. William Henry   

                Sex   Age  SibSp  Parch            Ticket     Fare Cabin  \
PassengerId                                                                
1              male  22.0      1      0         A/5 21171   7.2500   NaN   
2            female  38.0      1      0          PC 17599  71.2833   C85   
3            female  26.0      0      0  STON/O2. 3101282   7.9250   NaN   
4            female  35.0      1      0            113803  53.1000  C123   
5              male  35.0

In [2]:
def process_name(name):
    """Reduce a full passenger name to its honorific title.

    The title is taken to be the first whitespace-separated token of the text
    that follows the first comma, e.g. "Braund, Mr. Owen Harris" -> "Mr.".

    Parameters
    ----------
    name : str
        Full name as found in the "Name" column.

    Returns
    -------
    str
        The title (trailing period included) when it is one of the recognised
        titles; otherwise the literal string "NaN" (a string, not a float NaN).
        The fallback is also returned when the value has no comma, has nothing
        after the comma, or is not a string at all.
    """
    known_titles = ('Miss.', 'Rev.', 'Dona.', 'Mrs.', 'Col.', 'Ms.', 'Mr.', 'Dr.', 'Master.')
    try:
        # Segment between the first and second comma, split on whitespace.
        tokens = name.split(',')[1].split()
    except (AttributeError, IndexError):
        # AttributeError: not a string (e.g. a float NaN); IndexError: no comma.
        return "NaN"
    if tokens and tokens[0] in known_titles:
        return tokens[0]
    return "NaN"

def process_cabin(cabin):
    """Reduce a cabin code to its leading deck letter.

    Parameters
    ----------
    cabin : str or float
        Cabin code such as "C85", or a float (NaN) when the value is missing.

    Returns
    -------
    str or float
        The first character of the cabin code (the numbers don't matter).
        Float inputs, including float subclasses such as numpy.float64, are
        returned unchanged so missing values stay missing. Empty or None
        values are reported as float NaN.
    """
    if isinstance(cabin, float):
        return cabin
    if not cabin:
        # Nothing to take a first character from; treat it as missing.
        return float("nan")
    return cabin[0]
# Ticket values follow no consistent format, so the column is discarded rather than parsed
train = train.drop("Ticket", axis=1)

# Collapse the two free-text columns: Cabin -> deck letter, Name -> title
train["Cabin"] = train["Cabin"].apply(process_cabin)
train["Name"] = train["Name"].apply(process_name)

# Verify all the different titles that pop up
print(list(set(train["Name"])))
print(train.head())

['Miss.', 'Rev.', 'NaN', 'Mrs.', 'Col.', 'Ms.', 'Mr.', 'Dr.', 'Master.']
             Pclass   Name     Sex   Age  SibSp  Parch     Fare Cabin Embarked
PassengerId                                                                   
1                 3    Mr.    male  22.0      1      0   7.2500   NaN        S
2                 1   Mrs.  female  38.0      1      0  71.2833     C        C
3                 3  Miss.  female  26.0      0      0   7.9250   NaN        S
4                 1   Mrs.  female  35.0      1      0  53.1000     C        S
5                 3    Mr.    male  35.0      0      0   8.0500   NaN        S


In [3]:
#Next step is converting everything that isn't NaN to an integer

from sklearn.preprocessing import LabelEncoder

# Integer-encode Sex. This works well for a binary column, but not for
# Cabin/Embarked, which get one indicator column per value below.
sex_le = LabelEncoder()
train["Sex"] = sex_le.fit_transform(train["Sex"].values)

cols_to_transform = ["Name", "Cabin", "Embarked"]
# Name the columns to encode explicitly. With only `prefix=` given, get_dummies
# encodes whichever columns happen to be object-dtype and pairs the prefixes
# with them by position.
train = pd.get_dummies(train, columns=cols_to_transform, prefix=cols_to_transform)
print(train.head())

             Pclass  Sex   Age  SibSp  Parch     Fare  Name_Col.  Name_Dr.  \
PassengerId                                                                  
1                 3    1  22.0      1      0   7.2500        0.0       0.0   
2                 1    0  38.0      1      0  71.2833        0.0       0.0   
3                 3    0  26.0      0      0   7.9250        0.0       0.0   
4                 1    0  35.0      1      0  53.1000        0.0       0.0   
5                 3    1  35.0      0      0   8.0500        0.0       0.0   

             Name_Master.  Name_Miss.     ...      Cabin_B  Cabin_C  Cabin_D  \
PassengerId                               ...                                  
1                     0.0         0.0     ...          0.0      0.0      0.0   
2                     0.0         0.0     ...          0.0      1.0      0.0   
3                     0.0         1.0     ...          0.0      0.0      0.0   
4                     0.0         0.0     ...        

In [4]:
#Next we fill in (impute) everything that is NaN with the column mean

from sklearn.preprocessing import Imputer
# Replace every NaN with the mean of its column
imr = Imputer(missing_values="NaN", strategy="mean", axis=0)

# Array of the column headers; the imputer returns a bare array without them
col_names = train.columns

imputed_train = imr.fit(train).transform(train.values)
print(imputed_train[0])

# Rebuild the frame from the array. The PassengerId index is not carried over,
# so rows are labelled 0..n-1 from here on.
train = pd.DataFrame(imputed_train, columns=col_names)
train = train.drop(["Name_NaN", "Cabin_T"], axis=1) # Not in the test dataset
print(train.head())
print(train.columns)

[  3.     1.    22.     1.     0.     7.25   0.     0.     0.     0.     1.
   0.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.
   0.     0.     0.     1.  ]
   Pclass  Sex   Age  SibSp  Parch     Fare  Name_Col.  Name_Dr.  \
0     3.0  1.0  22.0    1.0    0.0   7.2500        0.0       0.0   
1     1.0  0.0  38.0    1.0    0.0  71.2833        0.0       0.0   
2     3.0  0.0  26.0    0.0    0.0   7.9250        0.0       0.0   
3     1.0  0.0  35.0    1.0    0.0  53.1000        0.0       0.0   
4     3.0  1.0  35.0    0.0    0.0   8.0500        0.0       0.0   

   Name_Master.  Name_Miss.     ...      Cabin_A  Cabin_B  Cabin_C  Cabin_D  \
0           0.0         0.0     ...          0.0      0.0      0.0      0.0   
1           0.0         0.0     ...          0.0      0.0      1.0      0.0   
2           0.0         1.0     ...          0.0      0.0      0.0      0.0   
3           0.0         0.0     ...          0.0      0.0      1.0      0.0   
4           0.

In [5]:
#Finally we want to standardize some of the columns: age and fare

from sklearn.preprocessing import StandardScaler

stdsc = StandardScaler() #This uses the distribution of the data to calculate the new values

# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split further down, so the held-out rows contribute to the scaling statistics.
to_transform = stdsc.fit_transform(train[["Age", "Fare"]])

# Write the scaled values back column by column from the array. This is purely
# positional; assigning a freshly built DataFrame instead would align on index
# labels and only be correct while `train` carries a default 0..n-1 index.
train["Age"] = to_transform[:, 0]
train["Fare"] = to_transform[:, 1]

from sklearn.cross_validation import train_test_split
#Now we split the data: 60% to fit on, 40% held out for validation.
#A fixed random_state keeps the split, and every score computed from it,
#identical from one run of the notebook to the next.
train_train, train_test, y_train, y_test = train_test_split(
    train, y, test_size=0.4, random_state=42)


print(train_train.head())

     Pclass  Sex       Age  SibSp  Parch      Fare  Name_Col.  Name_Dr.  \
813     3.0  0.0 -1.823750    4.0    2.0 -0.018709        0.0       0.0   
208     3.0  0.0 -1.054207    0.0    0.0 -0.492378        0.0       0.0   
444     3.0  1.0  0.000000    0.0    0.0 -0.485079        0.0       0.0   
259     2.0  0.0  1.562241    0.0    1.0 -0.124920        0.0       0.0   
597     3.0  1.0  1.485287    0.0    0.0 -0.648422        0.0       0.0   

     Name_Master.  Name_Miss.     ...      Cabin_A  Cabin_B  Cabin_C  Cabin_D  \
813           0.0         1.0     ...          0.0      0.0      0.0      0.0   
208           0.0         1.0     ...          0.0      0.0      0.0      0.0   
444           0.0         0.0     ...          0.0      0.0      0.0      0.0   
259           0.0         0.0     ...          0.0      0.0      0.0      0.0   
597           0.0         0.0     ...          0.0      0.0      0.0      0.0   

     Cabin_E  Cabin_F  Cabin_G  Embarked_C  Embarked_Q  Embark

In [6]:
#Alright, that didn't work let's go right to the PCA/LDA class

from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.linear_model import LogisticRegression

pca = PCA(n_components=10)
# NOTE(review): with a two-class target LDA can presumably yield at most one
# discriminant component, so n_components=20 is likely not what was intended - confirm.
lda = LDA(n_components=20)

# NOTE(review): both projections are computed but nothing in this cell uses
# them; the classifier below is trained on the unprojected features.
train_data_lda = lda.fit_transform(train_train, y_train)
train_data_pca = pca.fit_transform(train_train)

lr = LogisticRegression()

lr.fit(train_train, y_train)

# Hold-out accuracy: predict once for the whole validation set, then count the
# matches (previously predict() was re-run on the full set for every single row).
test_predictions = lr.predict(train_test)
print((test_predictions == y_test.values).sum() / (len(y_test.values) * 1.0))

print(y_test.values)



0.854341736695
[0 0 0 0 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 1 1 0 1 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 0 1 0 1 0 1 0 1 1 1 1 0 1
 0 1 1 0 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1
 0 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 1
 0 1 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 1
 1 1 0 1 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1
 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 1
 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 0 0 0 1 1 1
 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 0 1]




In [7]:
# Load the competition test set. No index_col is given here (unlike the
# train.csv load), so PassengerId stays an ordinary column of the frame.
real_test = pd.read_csv("test.csv")

def prepare_data(train):
    """Run a raw passenger frame through the notebook's feature pipeline.

    Steps, in order: drop "Ticket"; reduce "Name" to a title and "Cabin" to a
    deck letter using the notebook-level process_name / process_cabin helpers;
    label-encode "Sex"; one-hot encode Name/Cabin/Embarked; mean-impute the
    remaining NaNs; standardise "Age" and "Fare".

    Parameters
    ----------
    train : pandas.DataFrame
        Raw frame containing at least the Ticket, Name, Cabin, Sex, Embarked,
        Age and Fare columns. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, rebuilt from the imputer's output and therefore
        carrying a fresh 0..n-1 index.

    Notes
    -----
    Every transformer (label encoder, imputer, scaler) is fitted on the frame
    passed in, not on the data the model was trained on.
    NOTE(review): for a test set this means means/variances come from the test
    rows themselves - confirm this is intended.
    """
    train = train.drop("Ticket", axis=1)

    # Reuse the shared helpers instead of carrying a second copy of them here.
    train["Name"] = train["Name"].apply(process_name)
    train["Cabin"] = train["Cabin"].apply(process_cabin)

    sex_le = LabelEncoder()
    train["Sex"] = sex_le.fit_transform(train["Sex"].values)

    cols_to_transform = ["Name", "Cabin", "Embarked"]
    train = pd.get_dummies(train, prefix=cols_to_transform)

    imr = Imputer(missing_values="NaN", strategy="mean", axis=0)
    col_names = train.columns #Array of the column headers

    imputed_train = imr.fit(train).transform(train.values)
    train = pd.DataFrame(imputed_train, columns=col_names)

    stdsc = StandardScaler() #This uses the distribution of the data to calculate the new values
    scaled = stdsc.fit_transform(train[["Age", "Fare"]])
    # Positional, per-column assignment: does not depend on index alignment.
    train["Age"] = scaled[:, 0]
    train["Fare"] = scaled[:, 1]
    return train
    
real_test = prepare_data(real_test)

# Keep the real passenger ids for the submission before the column is dropped.
# They come back from prepare_data as floats, so cast them to int.
passenger_ids = real_test["PassengerId"].astype(int)
real_test = real_test.drop(["Name_Dona.", "PassengerId"], axis=1) #Not in the training data

print(real_test.columns)
print(train_train.columns)

# The classifier is fed plain value arrays, so features are matched by position:
# the test columns must equal the training columns, in the same order.
assert list(real_test.columns) == list(train_train.columns), \
    "test features do not line up with the training features"

print(train_train.values[0])
print(lr.predict([train_train.values[0]]))

print(real_test.values[0])

print("PassengerId,Survived")
for passenger_id, prediction in zip(passenger_ids, lr.predict(real_test)):
    print(str(passenger_id) + "," + str(prediction))



Index([u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare', u'Name_Col.',
       u'Name_Dr.', u'Name_Master.', u'Name_Miss.', u'Name_Mr.', u'Name_Mrs.',
       u'Name_Ms.', u'Name_Rev.', u'Cabin_A', u'Cabin_B', u'Cabin_C',
       u'Cabin_D', u'Cabin_E', u'Cabin_F', u'Cabin_G', u'Embarked_C',
       u'Embarked_Q', u'Embarked_S'],
      dtype='object')
Index([u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Fare', u'Name_Col.',
       u'Name_Dr.', u'Name_Master.', u'Name_Miss.', u'Name_Mr.', u'Name_Mrs.',
       u'Name_Ms.', u'Name_Rev.', u'Cabin_A', u'Cabin_B', u'Cabin_C',
       u'Cabin_D', u'Cabin_E', u'Cabin_F', u'Cabin_G', u'Embarked_C',
       u'Embarked_Q', u'Embarked_S'],
      dtype='object')
[ 3.          0.         -1.82375021  4.          2.         -0.01870931
  0.          0.          0.          1.          0.          0.          0.
  0.          0.          0.          0.          0.          0.          0.
  0.          0.          0.          1.        ]
[0]
[ 3.   