In [598]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math

%matplotlib inline

pd.set_option('precision', 3)

In [599]:
def transform_df(df):
    
    def get_title(row):
        step1 = row['Name'].split(',', 1)
        step2 = step1[1].split('.', 1)
    
        return step2[0].strip()

    def family_size(row):
        if row['SibSp'] == 0 and row['Parch'] == 0:
            return 0
        else:
            return row['SibSp'] + row['Parch'] + 1
        
    ticket_dict = df['Ticket'].value_counts()

    def get_same_ticket(ticket, dictionary):
        return dictionary[ticket]
    
    df['Title']       = df.apply(lambda x: get_title(x), axis=1)
    df['Group Size']  = df['Ticket'].apply(lambda x: get_same_ticket(x, ticket_dict))
    df['Family Size'] = df.apply(lambda x: family_size(x), axis=1)
    
    # calculating the fare per person, otherwise in the dataset I have the total fare per ticket
    df.loc[:,'Fare']  = df.loc[:,'Fare'] / df.loc[:,'Group Size']
    
    #df['Age'].fillna(df.groupby(['Pclass', 'Sex'])['Age'].transform("median"), inplace=True)

In [600]:
def ml_ready(df):
    from sklearn.cross_validation import train_test_split
    
    features = ['Age', 'Pclass', 'Sex', 'Family Size', 'Group Size', 'Title', 'Ticket']
    target   = 'Survived'
    
    X = df[features]
    y = df[target]
    
    for column in X.columns:
        if column != 'Age':
            X.loc[:,column], junk = X.loc[:,column].factorize()

    return train_test_split(X, y, test_size=0.3, random_state=379582)

def infer_ready(df):
    features = ['Age','Pclass', 'Sex', 'Family Size', 'Group Size', 'Title', 'Ticket']
    
    X = df[features]
    
    for column in X.columns:
        if column != 'Age':
            X.loc[:,column], junk = X.loc[:,column].factorize()
    
    return X

def train_and_evaluate(X_train, X_test, y_train, y_test, infer):

    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.metrics import confusion_matrix, classification_report
    from sklearn.model_selection import GridSearchCV

    gboost = GradientBoostingClassifier(n_estimators=40, learning_rate=0.09)

    gboost.fit(X_train, y_train)

    predictions = gboost.predict(X_test)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    
    submissions = gboost.predict(infer)
    
    submission = pd.DataFrame(index=infer.index)
    submission['Survived'] = submissions
    submission['Survived'] = submission['Survived'].astype('int32')
    
    submission.to_csv('submission-03.csv')

In [601]:
train_csv = pd.read_csv('../input/train.csv')
infer_csv = pd.read_csv('../input/test.csv')

In [602]:
train_csv.set_index('PassengerId', inplace=True)
infer_csv.set_index('PassengerId', inplace=True)
df = pd.concat([train_csv, infer_csv], axis=0, sort=False)

In [603]:
transform_df(df)

Building a model to predict a passenger's age using the other columns

In [607]:
train = df[df['Age'].isnull() == False]
infer = df[df['Age'].isnull() == True]

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

features = ['Title', 'Group Size', 'Pclass']
target = 'Age'

X = train[features]
y = train[target]

ukn = infer[features]

for column in X.columns:
    X[column], junk = X[column].factorize()
    
for column in ukn.columns:
    ukn[column], junk = ukn[column].factorize()

params = {
    'n_estimators'      : [40, 60, 65],
    'learning_rate'     : [0.09, 0.1, 0.011],
    'max_depth'         : [5],
    'subsample'         : [1.0],
    'min_samples_split' : [2],
    'min_samples_leaf'  : [10],
}

model = GradientBoostingRegressor(random_state=369582)

grid = GridSearchCV(
    model, 
    params,
    scoring='neg_mean_squared_error',
    iid=False,
    cv=5
)
grid.fit(X, y)

grid.grid_scores_, grid.best_params_, grid.best_score_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


([mean: -118.42889, std: 11.85648, params: {'learning_rate': 0.09, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 40, 'subsample': 1.0},
  mean: -119.15500, std: 12.49263, params: {'learning_rate': 0.09, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 60, 'subsample': 1.0},
  mean: -119.31371, std: 12.60655, params: {'learning_rate': 0.09, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 65, 'subsample': 1.0},
  mean: -118.45483, std: 12.02994, params: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 40, 'subsample': 1.0},
  mean: -119.20567, std: 12.54854, params: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 60, 'subsample': 1.0},
  mean: -119.31248, std: 12.68117, params: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2, 'n_estimators': 65, 's

In [578]:
model = GradientBoostingRegressor(
    random_state=369582,
    max_depth=5,
    learning_rate=0.0992,
    min_samples_leaf=10,
    n_estimators=65
)

model.fit(X,y)

predictions = model.predict(ukn)

In [579]:
df.loc[ukn.index,'Age'] = predictions

Actual Prediction of 'Survived'

In [580]:
index1 = df[df['Survived'].isnull() == False].index
index2 = df[df['Survived'].isnull() == True].index

train_csv = df.loc[index1,:]
infer_csv = df.loc[index2,:]

In [581]:
X_train, X_test, y_train, y_test = ml_ready(train_csv)
X = infer_ready(infer_csv)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [582]:
train_and_evaluate(X_train, X_test, y_train, y_test, X)

             precision    recall  f1-score   support

        0.0       0.82      0.90      0.86       162
        1.0       0.81      0.70      0.75       106

avg / total       0.82      0.82      0.81       268

[[145  17]
 [ 32  74]]


Currently no influence on the model, just for benchmark info

In [569]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'n_estimators'      : [35,40,45],
    'learning_rate'     : [0.09],
    'max_depth'         : [3],
    'subsample'         : [1.0],
    'min_samples_split' : [2],
    'min_samples_leaf'  : [1],
    'max_features'      : [None]
}

model = GradientBoostingClassifier(random_state=369582)

grid = GridSearchCV(
    model, 
    params,
    scoring='accuracy',
    iid=False,
    cv=5
)
grid.fit(X_train, y_train)

grid.grid_scores_, grid.best_params_, grid.best_score_



([mean: 0.81228, std: 0.02448, params: {'learning_rate': 0.09, 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 35, 'subsample': 1.0},
  mean: 0.80588, std: 0.02575, params: {'learning_rate': 0.09, 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40, 'subsample': 1.0},
  mean: 0.80585, std: 0.02382, params: {'learning_rate': 0.09, 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 45, 'subsample': 1.0}],
 {'learning_rate': 0.09,
  'max_depth': 3,
  'max_features': None,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 35,
  'subsample': 1.0},
 0.8122781362007168)