In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

%matplotlib inline

pd.set_option('precision', 3)

In [14]:
def transform_df(df):
    
    def fill_Age_Class(row, dictionary):
        return int(round(dictionary.loc[row['Class'],row['Sex']]))
    
    def get_title(row):
        step1 = row['Name'].split(',', 1)
        step2 = step1[1].split('.', 1)
    
        return step2[0].strip()
    
    def get_surname(row):
        step1 = row['Name'].split(',', 1)
    
        return step1[0].strip()
    
    def get_names(row):
        step1 = row['Name'].split(',', 1)
        step2 = step1[1].split('.', 1)
    
        return step2[1].strip()
    
    def get_deck(code):
        if isinstance(code, str):
            return code[0]
        
    def is_baby(age):
        if age < 1:
            return 1
        else:
            return 0
        
    def family_size(row):
        if row['SibSp'] == 0 and row['Parch'] == 0:
            return 0
        else:
            return row['SibSp'] + row['Parch'] + 1
        
    ticket_dict = df['Ticket'].value_counts()

    def get_same_ticket(ticket, dictionary):
        return dictionary[ticket]
    
    df['Title']       = df.apply(lambda x: get_title(x), axis=1)
    #df['Surname']     = df.apply(lambda x: get_surname(x), axis=1)
    #df['Names']       = df.apply(lambda x: get_names(x), axis=1)
    #df['Deck']        = df['Cabin'].apply(lambda x: get_deck(x))
    df['Group Size']  = df['Ticket'].apply(lambda x: get_same_ticket(x, ticket_dict))
    df['Family Size'] = df.apply(lambda x: family_size(x), axis=1)
    
    # filling missing Age_Class values
    index = df[df['Title'] == ' Master'].index
    age = df.loc[index,'Age'].mean()
    
    
    
    df['Age_Class']   = pd.cut(df['Age'], [0,18,30,50,65,150], include_lowest=True, right=False, labels=False) 

    df.rename(columns=colNames, inplace=True)
    df.drop('Name', axis=1, inplace=True)
    
    index = df[df['Age_Class'].isnull() == True].index
    dictionary = pd.crosstab(df['Class'], df['Sex'], df['Age_Class'], aggfunc='mean')
    df.loc[index,'Age_Class'] = df.loc[index].apply(lambda row: fill_Age_Class(row, dictionary), axis=1)
    
    # filling missing POL values
    df['POL'].fillna('C', inplace=True)
    
    df.set_index('id', inplace=True)
    
def ml_ready(df):
    from sklearn.cross_validation import train_test_split
    
    features = ['Age_Class', 'Class', 'Sex', 'Family Size', 'Group Size', 'SibSp', 'Parch']
    target   = 'Survived'
    
    X = df[features]
    y = df[target]
    
    for column in X.columns:
        X.loc[:,column], junk = X.loc[:,column].factorize()

    return train_test_split(X, y, test_size=0.3, random_state=379582)

def infer_ready(df):
    features = ['Age_Class', 'Class', 'Sex', 'Family Size', 'Group Size', 'SibSp', 'Parch']
    
    X = df[features]
    
    for column in X.columns:
        X.loc[:,column], junk = X.loc[:,column].factorize()
    
    return X

def train_and_evaluate(X_train, X_test, y_train, y_test, infer):

    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.model_selection import GridSearchCV

    gboost = GradientBoostingClassifier(n_estimators=500)

    gboost.fit(X_train, y_train)

    predictions = gboost.predict(X_test)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    
    submissions = gboost.predict(infer)
    
    submission = pd.DataFrame(index=infer.index)
    submission['Survived'] = submissions
    
    #submission.to_csv('submission-02.csv')

In [60]:
train_csv = pd.read_csv('../input/train.csv')
infer_csv = pd.read_csv('../input/test.csv')

In [61]:
train_csv.set_index('PassengerId', inplace=True)
infer_csv.set_index('PassengerId', inplace=True)
df = pd.concat([train_csv, infer_csv], axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [63]:
df.tail()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1305,,,S,8.05,"Spector, Mr. Woolf",0,3,male,0,,A.5. 3236
1306,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1,female,0,,PC 17758
1307,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,3,male,0,,SOTON/O.Q. 3101262
1308,,,S,8.05,"Ware, Mr. Frederick",0,3,male,0,,359309
1309,,,C,22.358,"Peter, Master. Michael J",1,3,male,1,,2668


In [16]:
#transform_df(train_csv)
#transform_df(infer_csv)

In [24]:
train_csv['Title'].value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Major             2
 Mlle              2
 Col               2
 Sir               1
 Ms                1
 Don               1
 Jonkheer          1
 Capt              1
 Lady              1
 the Countess      1
 Mme               1
Name: Title, dtype: int64

In [54]:
train_csv[train_csv['Title'] == " Major"]

Unnamed: 0_level_0,Survived,Class,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,POL,Title,Surname,Names,Deck,Group Size,Family Size,Age_Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
450,1,1,male,52.0,0,0,113786,30.5,C104,S,Major,Peuchen,Arthur Godfrey,C,1,0,3.0
537,0,1,male,45.0,0,0,113050,26.55,B38,S,Major,Butt,Archibald Willingham,B,1,0,2.0


In [None]:
title_vocab = {
    'Mlle' : 'Miss', # Mademoiselle
    'Mme'  : 'Miss', # Madame
}

In [52]:
train_csv[train_csv['Ticket'] == '250651']

Unnamed: 0_level_0,Survived,Class,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,POL,Title,Surname,Names,Deck,Group Size,Family Size,Age_Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
313,0,2,female,26.0,1,1,250651,26.0,,S,Mrs,Lahtinen,William (Anna Sylfven),,1,3,1.0


In [40]:
infer_csv['Title'].value_counts()

 Mr        240
 Miss       78
 Mrs        72
 Master     21
 Col         2
 Rev         2
 Dr          1
 Dona        1
 Ms          1
Name: Title, dtype: int64

In [53]:
infer_csv[infer_csv['Title'] == " Dona"]

Unnamed: 0_level_0,Class,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,POL,Title,Surname,Names,Deck,Group Size,Family Size,Age_Class
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1306,1,female,39.0,0,0,PC 17758,108.9,C105,C,Dona,Oliva y Ocana,Fermina,C,1,0,2.0


In [20]:
X_train, X_test, y_train, y_test = ml_ready(train_csv)
X = infer_ready(infer_csv)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item_labels[indexer[info_axis]]] = value


In [21]:
train_and_evaluate(X_train, X_test, y_train, y_test, X)

             precision    recall  f1-score   support

          0       0.84      0.85      0.84       162
          1       0.77      0.75      0.76       106

avg / total       0.81      0.81      0.81       268

[[138  24]
 [ 27  79]]
