In [None]:
import pandas as pd
import numpy as np
from time import time

from sklearn.preprocessing import RobustScaler, Imputer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
# Read the labelled training data and the unlabelled data to predict on.
df, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [None]:
# Row counts of the two frames (training first, then test).
print(df.shape[0], test.shape[0])

In [None]:
# Total number of missing cells across every column of the training frame.
t = int(df.isnull().sum().sum())
print(t)

In [None]:
# Discard the columns that are not used as model features.
df = df.drop(labels=['Ticket', 'Cabin', 'PassengerId', 'Name'], axis='columns')
# Show the shape before and after removing every row that has a missing value.
print(df.shape)
df = df.dropna(axis=0, how='any')
print(df.shape)

# The test frame keeps PassengerId (it is written to the output file later)
# and keeps its rows with missing values, since the dropna below is disabled.
test = test.drop(labels=['Ticket', 'Cabin', 'Name'], axis='columns')
print(test.shape)
# test = test.dropna()
print(test.shape)

In [None]:
# Binary indicator column: 1 where Sex is 'female', 0 for anything else.
df['sex_f'] = (df['Sex'] == 'female').astype(int)
test['sex_f'] = (test['Sex'] == 'female').astype(int)

Port-of-embarkation codes in the `Embarked` column: C = Cherbourg, Q = Queenstown, S = Southampton.

In [None]:
# One-hot encode the Embarked column of each frame (columns are named Embarked_<code>).
# NOTE(review): the two frames are encoded independently, so their dummy
# columns only line up if both contain the same set of Embarked values.
embarked_dum, embarked_dum2 = (
    pd.get_dummies(frame['Embarked'], prefix='Embarked') for frame in (df, test)
)

In [None]:
# Append the one-hot Embarked columns to the right of each frame.
train = pd.concat((df, embarked_dum), axis='columns')
test = pd.concat((test, embarked_dum2), axis='columns')

In [None]:
# The raw categorical columns are redundant once sex_f and the Embarked_* dummies exist.
train, test = (
    frame.drop(labels=['Embarked', 'Sex'], axis='columns') for frame in (train, test)
)

In [None]:
# Column names of both frames, printed for a side-by-side check.
print(train.columns.tolist())
print(test.columns.tolist())

In [None]:
# Features are every column whose name does not contain 'Survived';
# the target is the Survived column itself.
X = train.loc[:, [col for col in train.columns if 'Survived' not in col]]
y = train.loc[:, 'Survived']

# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=36,
)

In [None]:
# Two-step preprocessing: fill missing values, then standardise each feature.
# NOTE(review): `Imputer` is no longer available in recent scikit-learn
# releases (replaced by `sklearn.impute.SimpleImputer`) - confirm the
# installed version before running.
preprocessing = Pipeline(steps=[
    ('impute', Imputer()),
    ('scale', StandardScaler()),
])

# Fit on the training split only; the held-out split reuses the fitted steps.
X_train_sc = preprocessing.fit_transform(X_train)
X_test_sc = preprocessing.transform(X_test)

In [None]:
# Shapes of the preprocessed training and held-out matrices, one per line.
print(X_train_sc.shape, X_test_sc.shape, sep='\n')

In [None]:
# lr = LogisticRegression()
# lr.fit(X_train_sc, y_train)

In [None]:
# preds = lr.predict(X_test_sc)
# preds

In [None]:
# probs = lr.predict_proba(X_test_sc)[:,1]

In [None]:
# metrics.accuracy_score(y_test, preds)

In [None]:
# metrics.roc_auc_score(y_test, probs)

In [None]:
# train_preds = lr.predict(X_train_sc)
# metrics.accuracy_score(y_train, train_preds)

In [None]:
# probs_train = lr.predict_proba(X_train_sc)[:,1]
# metrics.roc_auc_score(y_train, probs_train)

In [None]:
# probs.shape

http://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score

In [None]:
# Random-forest hyper-parameter grid that the grid search tries exhaustively.
tuned_parameters = dict(
    max_depth=[3, None],
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
    n_estimators=[10, 50],
)
# Scoring metrics; a separate search is run for each one.
scores = ['roc_auc', 'accuracy']

In [None]:
# Run one random-forest grid search per scoring metric, then print the
# cross-validation results and a classification report on the held-out split.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validated search; random_state=1 fixes the forest's seed.
    clf = GridSearchCV(
        estimator=RandomForestClassifier(random_state=1),
        param_grid=tuned_parameters,
        scoring=score,
        cv=5,
    )
    clf.fit(X_train_sc, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for idx, params in enumerate(cv_results['params']):
        # The spread shown is twice the standard deviation of the fold scores.
        print("%0.3f (+/-%0.03f) for %r"
              % (means[idx], stds[idx] * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test_sc)
    print(classification_report(y_true, y_pred))
    print()

In [None]:
# Randomized hyper-parameter search for the random forest.
# Specify parameters and distributions to sample from.
param_dist = {"max_depth": [3, None],
              "bootstrap": [True, False],
              "criterion": ["gini", "entropy"]}

# Run randomized search. The space above has 2 * 2 * 2 = 8 combinations,
# which matches the number of candidates requested here.
n_iter_search = 8
random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=1),  # seeded so reruns give the same forest
    param_distributions=param_dist,
    n_iter=n_iter_search,
    random_state=1,  # seeded so the candidate sampling is repeatable
)

start = time()
random_search.fit(X_train_sc, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
print()

# Report the results of *this* search. Reading them from `clf` would print the
# stale results of whichever grid search happened to run last in the kernel.
search_results = random_search.cv_results_
means = search_results['mean_test_score']
stds = search_results['std_test_score']
for mean, std, params in zip(means, stds, search_results['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

# Evaluate the refitted best estimator on the held-out split.
y_true, y_pred = y_test, random_search.predict(X_test_sc)
print(classification_report(y_true, y_pred))

In [None]:
# Predict on the test frame and write the submission file.
ids = test['PassengerId']

# The model was fitted on imputed + standardised features, so the test frame
# must go through the same fitted pipeline. Selecting X.columns drops
# PassengerId and guarantees the training column order (a missing column
# raises a KeyError instead of silently misaligning features).
test_sc = preprocessing.transform(test[X.columns])
predictions = random_search.predict(test_sc)

output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
output.to_csv('titanic-predictions.csv', index=False)
print(output.head())
len(output)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

# Sweep the number of boosting stages, tying the learning rate to it
# (learning_rate = 10 / n_estimators), and remember the best configuration
# by mean 10-fold cross-validated accuracy.
highest = dict()
a = []  # mean CV accuracy for each candidate, in sweep order
max_estimators = 50

# NOTE: range() excludes its upper bound, so 1 .. max_estimators - 1 are tried.
for i in range(1, max_estimators):
    learning_rate = 10.0 / float(i)
    score = cross_val_score(
        GradientBoostingClassifier(n_estimators=i,
                                   learning_rate=learning_rate,
                                   random_state=1),  # seeded for repeatable scores
        X_train_sc, y_train, cv=10, scoring='accuracy').mean()
#     print('learning rate:', learning_rate, 'estimators:', i, 'score:', score)

    # Record the score together with the settings that produced it, so the
    # reported learning rate and estimator count always match the best score.
    if not highest or score > highest['score']:
        highest = {'score': score,
                   'learning_rate': learning_rate,
                   'estimators': i}

    a.append(score)

highest

In [None]:
from scipy.stats import randint as sp_randint

# Candidate values of C for logistic regression, and the metrics to tune for.
tuned_parameters = [{'C': [0.1, 0.3, 0.5, 0.7, 1, 10, 100, 1000]}]
scores = ['roc_auc', 'accuracy']

# Run one logistic-regression grid search per scoring metric, then print the
# cross-validation results and a classification report on the held-out split.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # 5-fold cross-validated search over the C values above.
    clf = GridSearchCV(
        estimator=LogisticRegression(random_state=1),
        param_grid=tuned_parameters,
        scoring=score,
        cv=5,
    )
    clf.fit(X_train_sc, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for idx, params in enumerate(cv_results['params']):
        # The spread shown is twice the standard deviation of the fold scores.
        print("%0.3f (+/-%0.03f) for %r"
              % (means[idx], stds[idx] * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true = y_test
    y_pred = clf.predict(X_test_sc)
    print(classification_report(y_true, y_pred))
    print()