In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [2]:
train_data = pd.read_csv('../input/titanic/train.csv')
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
train_data.drop(['PassengerId','Name', 'Ticket', 'Embarked'], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,3,male,22.0,1,0,7.25,
1,1,1,female,38.0,1,0,71.2833,C85
2,1,3,female,26.0,0,0,7.925,
3,1,1,female,35.0,1,0,53.1,C123
4,0,3,male,35.0,0,0,8.05,


In [4]:
gender = {'male':0, 'female':1}
train_data['Sex'] = train_data['Sex'].map(gender)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin
0,0,3,0,22.0,1,0,7.25,
1,1,1,1,38.0,1,0,71.2833,C85
2,1,3,1,26.0,0,0,7.925,
3,1,1,1,35.0,1,0,53.1,C123
4,0,3,0,35.0,0,0,8.05,


In [5]:
train_data['Cabin_ind'] = np.where(train_data['Cabin'].isnull(), 0, 1)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Cabin_ind
0,0,3,0,22.0,1,0,7.25,,0
1,1,1,1,38.0,1,0,71.2833,C85,1
2,1,3,1,26.0,0,0,7.925,,0
3,1,1,1,35.0,1,0,53.1,C123,1
4,0,3,0,35.0,0,0,8.05,,0


In [6]:
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
train_data.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Cabin_ind
0,0,3,0,22.0,1,0,7.25,,0
1,1,1,1,38.0,1,0,71.2833,C85,1
2,1,3,1,26.0,0,0,7.925,,0
3,1,1,1,35.0,1,0,53.1,C123,1
4,0,3,0,35.0,0,0,8.05,,0
5,0,3,0,29.699118,0,0,8.4583,,0
6,0,1,0,54.0,0,0,51.8625,E46,1
7,0,3,0,2.0,3,1,21.075,,0
8,1,3,1,27.0,0,2,11.1333,,0
9,1,2,1,14.0,1,0,30.0708,,0


In [7]:
train_data['Family_cnt'] = train_data['SibSp']+train_data['Parch']
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Cabin_ind,Family_cnt
0,0,3,0,22.0,1,0,7.25,,0,1
1,1,1,1,38.0,1,0,71.2833,C85,1,1
2,1,3,1,26.0,0,0,7.925,,0,0
3,1,1,1,35.0,1,0,53.1,C123,1,1
4,0,3,0,35.0,0,0,8.05,,0,0


In [8]:
train_data.drop(['Cabin','Parch','SibSp'], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Cabin_ind,Family_cnt
0,0,3,0,22.0,7.25,0,1
1,1,1,1,38.0,71.2833,1,1
2,1,3,1,26.0,7.925,0,0
3,1,1,1,35.0,53.1,1,1
4,0,3,0,35.0,8.05,0,0


In [9]:
X = train_data.drop(['Survived'], axis=1)
y = train_data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5)

In [10]:
for item in [y_train, y_val, y_test]:
    print(round(np.size(item)/np.size(y)*100))

60
20
20


In [11]:
model = GradientBoostingClassifier()
scores = cross_val_score(model, X_train, y_train, cv=5)
scores

array([0.8411215 , 0.72897196, 0.79439252, 0.85046729, 0.76415094])

In [12]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.09)


In [13]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))
        
def print_best_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    
    df = pd.DataFrame(np.array(means,stds))#, stds, columns=['Means', 'Stds'])#, params)
    return df
#     for mean, std, params in zip(means, stds, results.cv_results_['params']):
#         print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [14]:
GB = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    'learning_rate': [0.001, 0.01, 0.1, 1, 10]
}

cv = GridSearchCV(GB, parameters, cv=5)
cv.fit(X_train, y_train)

print_results(cv)

BEST PARAMS: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 250}

0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 5}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 50}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 250}
0.742 (+/-0.124) for {'learning_rate': 0.001, 'max_depth': 1, 'n_estimators': 500}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 5}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 50}
0.732 (+/-0.127) for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 250}
0.786 (+/-0.034) for {'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 500}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 5}
0.629 (+/-0.008) for {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 50}
0.751 (+/-0.058) for {'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 250}
0.775 (+/-0.064) for {

In [15]:
means = cv.cv_results_['mean_test_score']
stds = cv.cv_results_['std_test_score']
params = cv.cv_results_['params']

max_depth = []
n_estimators = []
learning_rate = []
for item in params:
    max_depth.append(item['max_depth'])
    n_estimators.append(item['n_estimators'])
    learning_rate.append(item['learning_rate'])

data = {"Means": means,
       "Standard deviations": stds,
       "Max depth": max_depth,
       "Number of estimators": n_estimators,
       "Learning rate": learning_rate}
df = pd.DataFrame(data)

In [16]:
df.sort_values(by=['Means'], ascending=False, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head(10)

Unnamed: 0,Means,Standard deviations,Max depth,Number of estimators,Learning rate
0,0.808923,0.03314,3,250,0.1
1,0.807018,0.046896,3,500,0.1
2,0.805202,0.021226,3,250,0.01
3,0.805149,0.03677,1,50,0.1
4,0.805149,0.03677,1,500,0.01
5,0.797725,0.040489,1,500,0.1
6,0.795838,0.019531,3,5,1.0
7,0.795821,0.034555,3,500,0.01
8,0.795803,0.02923,1,250,1.0
9,0.793969,0.043223,1,250,0.1


In [17]:
gb1 = GradientBoostingClassifier(max_depth=3, n_estimators=250, learning_rate=0.1)
gb1.fit(X_train, y_train)

gb2 = GradientBoostingClassifier(max_depth=1, n_estimators=250, learning_rate=1)
gb2.fit(X_train, y_train)

gb3 = GradientBoostingClassifier(max_depth=3, n_estimators=500, learning_rate=0.1)
gb3.fit(X_train, y_train)

gb7 = GradientBoostingClassifier(max_depth=1, n_estimators=500, learning_rate=0.1)
gb7.fit(X_train, y_train)

GradientBoostingClassifier(max_depth=1, n_estimators=500)

In [18]:
for mdl in [gb1, gb2, gb3, gb7]:
    y_pred = mdl.predict(X_val)
    accuracy = round(accuracy_score(y_val, y_pred), 3)
    precision = round(precision_score(y_val, y_pred), 3)
    recall = round(recall_score(y_val, y_pred), 3)
    print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(mdl.max_depth,
                                                                         mdl.n_estimators,
                                                                         accuracy,
                                                                         precision,
                                                                         recall))

MAX DEPTH: 3 / # OF EST: 250 -- A: 0.826 / P: 0.843 / R: 0.747
MAX DEPTH: 1 / # OF EST: 250 -- A: 0.82 / P: 0.831 / R: 0.747
MAX DEPTH: 3 / # OF EST: 500 -- A: 0.803 / P: 0.824 / R: 0.709
MAX DEPTH: 1 / # OF EST: 500 -- A: 0.809 / P: 0.808 / R: 0.747


In [19]:
y_pred = gb1.predict(X_test)
accuracy = round(accuracy_score(y_test, y_pred), 3)
precision = round(precision_score(y_test, y_pred), 3)
recall = round(recall_score(y_test, y_pred), 3)
print('MAX DEPTH: {} / # OF EST: {} -- A: {} / P: {} / R: {}'.format(gb1.max_depth,
                                                                     gb1.n_estimators,
                                                                     accuracy,
                                                                     precision,
                                                                     recall))

MAX DEPTH: 3 / # OF EST: 250 -- A: 0.827 / P: 0.793 / R: 0.708


In [20]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

gender_num = {"male":0,"female":1}
test_data['Sex'] = test_data['Sex'].map(gender_num)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)
test_data['Family_cnt'] = test_data['SibSp'] + test_data['Parch']
test_data['Cabin_ind'] = np.where(test_data['Cabin'].isnull(),0,1)
test_data.drop(['SibSp','Parch','Cabin','Name','Ticket','Embarked'], axis=1, inplace=True)
features = ["Pclass", "Sex", "Age", "Fare",'Family_cnt','Cabin_ind']
X_test = pd.get_dummies(test_data[features])

# gb1.fit(X, y)

predictions = gb1.predict(X_test)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('GradientBoostingClassifier2.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [21]:
my_data = pd.read_csv("/kaggle/working/GradientBoostingClassifier2.csv")
my_data.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
