In [80]:
import pandas as pd
import numpy as np
from time import time
import xgboost

from sklearn.preprocessing import RobustScaler, Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics



In [2]:
df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
ids = test['PassengerId']


In [3]:
print(len(df), len(test))

891 418


In [4]:
t = np.count_nonzero(df.isnull().values)
print(t)

866


In [5]:
df = df.drop(['Ticket','Cabin', 'PassengerId', 'Name'], axis=1)
# Remove NaN values
print(df.shape)
# df = df.dropna()
print(df.shape)

test = test.drop(['Ticket','Cabin', 'PassengerId', 'Name'], axis=1)
print(test.shape)
# test = test.dropna()
print(test.shape)

(891, 8)
(891, 8)
(418, 7)
(418, 7)


In [6]:
df['sex_f'] = np.where(df['Sex'] == 'female', 1, 0)
test['sex_f'] = np.where(test['Sex'] == 'female', 1, 0)

C = Cherbourg, Q = Queenstown, S = Southampton

In [7]:
embarked_dum = pd.get_dummies(df['Embarked'], prefix='Embarked')
embarked_dum2 = pd.get_dummies(test['Embarked'], prefix='Embarked')

In [8]:
train = pd.concat([df, embarked_dum], axis=1)
test = pd.concat([test, embarked_dum2], axis=1)

In [9]:
train = train.drop(['Embarked','Sex'], axis=1)
test = test.drop(['Embarked','Sex'], axis=1)

In [10]:
print(list(train))
print(list(test))
cols = test[[c for c in test if 'Survived' not in c]]
cols

['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'sex_f', 'Embarked_C', 'Embarked_Q', 'Embarked_S']
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'sex_f', 'Embarked_C', 'Embarked_Q', 'Embarked_S']


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,sex_f,Embarked_C,Embarked_Q,Embarked_S
0,3,34.5,0,0,7.8292,0,0,1,0
1,3,47.0,1,0,7.0000,1,0,0,1
2,2,62.0,0,0,9.6875,0,0,1,0
3,3,27.0,0,0,8.6625,0,0,0,1
4,3,22.0,1,1,12.2875,1,0,0,1
5,3,14.0,0,0,9.2250,0,0,0,1
6,3,30.0,0,0,7.6292,1,0,1,0
7,2,26.0,1,1,29.0000,0,0,0,1
8,3,18.0,0,0,7.2292,1,1,0,0
9,3,21.0,2,0,24.1500,0,0,0,1


In [11]:
X = train[[c for c in train if 'Survived' not in c]]
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=36)

In [81]:
preprocessing = Pipeline([
    ('impute', Imputer(missing_values='NaN', strategy='mean', axis=0)),
    ('scale', StandardScaler())
])

X_train_sc = preprocessing.fit_transform(X_train)
X_test_sc = preprocessing.transform(X_test)
test = preprocessing.transform(test)

In [13]:
print(X_train_sc.shape)
print(X_test_sc.shape)

(712, 9)
(179, 9)


In [14]:
# lr = LogisticRegression()
# lr.fit(X_train_sc, y_train)

In [15]:
# preds = lr.predict(X_test_sc)
# preds

In [16]:
# probs = lr.predict_proba(X_test_sc)[:,1]

In [17]:
# metrics.accuracy_score(y_test, preds)

In [18]:
# metrics.roc_auc_score(y_test, probs)

In [19]:
# train_preds = lr.predict(X_train_sc)
# metrics.accuracy_score(y_train, train_preds)

In [20]:
# probs_train = lr.predict_proba(X_train_sc)[:,1]
# metrics.roc_auc_score(y_train, probs_train)

In [21]:
# probs.shape

http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score

In [22]:
tuned_parameters = {"max_depth": [3, None],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"],
                "n_estimators": [10, 50]}
scores = ['roc_auc', 'accuracy']

In [23]:
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(random_state=1), tuned_parameters, cv=5,
                       scoring=score)
    clf.fit(X_train_sc, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test_sc)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for roc_auc

Best parameters set found on development set:

{'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 50}

Grid scores on development set:

0.841 (+/-0.079) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 10}
0.848 (+/-0.077) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 50}
0.832 (+/-0.070) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'n_estimators': 10}
0.844 (+/-0.075) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}
0.841 (+/-0.081) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 10}
0.848 (+/-0.078) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 50}
0.837 (+/-0.070) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 10}
0.844 (+/-0.074) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'n_estimators':

In [24]:
# specify parameters and distributions to sample from
param_dist = {"max_depth": [3, None],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# run randomized search
n_iter_search = 8
random_search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(X_train_sc, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
            % (mean, std * 2, params))
print()

y_true, y_pred = y_test, random_search.predict(X_test_sc)
print(classification_report(y_true, y_pred))

RandomizedSearchCV took 0.38 seconds for 8 candidates parameter settings.

0.806 (+/-0.092) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 10}
0.819 (+/-0.085) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 50}
0.791 (+/-0.057) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'n_estimators': 10}
0.801 (+/-0.047) for {'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'n_estimators': 50}
0.802 (+/-0.097) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 10}
0.819 (+/-0.093) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 50}
0.798 (+/-0.036) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 10}
0.798 (+/-0.048) for {'bootstrap': True, 'criterion': 'entropy', 'max_depth': None, 'n_estimators': 50}
0.815 (+/-0.082) for {'bootstrap': False, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 10}
0.810 (+/-0.082) f

In [25]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

highest = dict()
a = []
max_estimators = 50
init = 1

for i in range(1, max_estimators):
    score = cross_val_score(GradientBoostingClassifier(n_estimators=i, learning_rate=10.0/float(i)), 
                        X_train_sc, y_train, cv=10, scoring='accuracy').mean()
    print('learning rate:', 10.0/float(i), 'estimators:', i, 'score:', score)
    
    if init == 1:
        highest['score'] = score
        highest['learning_rate'] = 10.0/float(i)
        highest['estimators'] = i
        init += 1
    
    if score > highest['score']:
        highest['score'] = score
        
    a.append(score)
highest

learning rate: 10.0 estimators: 1 score: 0.784781466577
learning rate: 5.0 estimators: 2 score: 0.323586519115
learning rate: 3.3333333333333335 estimators: 3 score: 0.719826179298
learning rate: 2.5 estimators: 4 score: 0.411967359714
learning rate: 2.0 estimators: 5 score: 0.801624189582
learning rate: 1.6666666666666667 estimators: 6 score: 0.770753409345
learning rate: 1.4285714285714286 estimators: 7 score: 0.782040576794
learning rate: 1.25 estimators: 8 score: 0.783350659513
learning rate: 1.1111111111111112 estimators: 9 score: 0.797476525822
learning rate: 1.0 estimators: 10 score: 0.798845293986
learning rate: 0.9090909090909091 estimators: 11 score: 0.788947574335
learning rate: 0.8333333333333334 estimators: 12 score: 0.808765369998
learning rate: 0.7692307692307693 estimators: 13 score: 0.797498323273
learning rate: 0.7142857142857143 estimators: 14 score: 0.808647999106
learning rate: 0.6666666666666666 estimators: 15 score: 0.79463894478
learning rate: 0.625 estimators: 

{'estimators': 1, 'learning_rate': 10.0, 'score': 0.82691873463000221}

In [26]:
tuned_parameters = [{'C': [0.1, 0.3, 0.5, 0.7, 1, 10, 100, 1000]}]
from scipy.stats import randint as sp_randint

scores = ['roc_auc', 'accuracy']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(LogisticRegression(random_state=1), tuned_parameters, cv=5,
                       scoring=score)
    clf.fit(X_train_sc, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test_sc)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for roc_auc

Best parameters set found on development set:

{'C': 1}

Grid scores on development set:

0.843 (+/-0.077) for {'C': 0.1}
0.843 (+/-0.081) for {'C': 0.3}
0.844 (+/-0.081) for {'C': 0.5}
0.844 (+/-0.081) for {'C': 0.7}
0.844 (+/-0.082) for {'C': 1}
0.844 (+/-0.082) for {'C': 10}
0.844 (+/-0.082) for {'C': 100}
0.844 (+/-0.082) for {'C': 1000}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.

             precision    recall  f1-score   support

          0       0.84      0.82      0.83       105
          1       0.75      0.78      0.77        74

avg / total       0.81      0.80      0.80       179


# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'C': 0.1}

Grid scores on development set:

0.802 (+/-0.094) for {'C': 0.1}
0.794 (+/-0.095) for {'C': 0.3}
0.795 (+/-0.090) for {'C': 0.5}
0.795 (+/-0.090) for {'C': 0.7}
0.79

In [79]:
gr = GradientBoostingClassifier(n_estimators=1, learning_rate=10, random_state=36, loss='exponential')
gr.fit(X_train_sc, y_train)

predictions = gr.predict(test)

feat_imp = gr.train_score_[0]
print(feat_imp)

prob = gr.score(X_test_sc, y_test)
print(prob)


100.739528083
0.849162011173


In [69]:
output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
output.to_csv('titanic-predictions.csv', index = False)
print(output.head())
len(output)

   PassengerId  Survived
0          892         0
1          893         0
2          894         1
3          895         0
4          896         0


418