In [1]:
from logging import getLogger, Formatter, StreamHandler, INFO
import numpy as np
import pandas as pd
from plotly import express as px
from plotly import graph_objects as go
from tqdm import tqdm

log = getLogger(__name__)
log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
log.addHandler(handler)


In [2]:
train = pd.read_csv('titanic/train.csv', delimiter=',')
test = pd.read_csv('titanic/test.csv', delimiter=',')
gender_submission = pd.read_csv('titanic/train.csv', delimiter=',')

In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
gender_submission.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_predict, cross_val_score, cross_validate, ParameterGrid
from sklearn.linear_model import LinearRegression, Ridge, SGDClassifier, SGDRegressor, LogisticRegression
from sklearn.metrics import accuracy_score, average_precision_score, roc_auc_score, roc_curve
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.svm import SVC, LinearSVC
import sklearn

In [7]:
y = train['Survived'].values
X = train.drop(['Survived'], axis=1)

X['Sex'].replace(['male','female'], [0, 1], inplace=True)
X['Embarked'].fillna(('S'), inplace=True)
X['Embarked'] = X['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
X['Fare'].fillna(np.mean(X['Fare']), inplace=True)
X['Age'].fillna(X['Age'].median(), inplace=True)
X['FamilySize'] = X['Parch'] + X['SibSp'] + 1
X['IsAlone'] = 0
X.loc[X['FamilySize'] == 1, 'IsAlone'] = 1

X.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
X.fillna(0, inplace=True)
fold = KFold(n_splits=5, shuffle=True, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=0, shuffle=True)

## LinearRegression

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)
((pred > 0.5) == y_test).mean()

0.7910447761194029

In [10]:
params_list = {'penalty': ['l1', 'l2'], 'C': [10e-4, 10e-2, 1, 10e+2, 10e+4], 'random_state': [0], 'max_iter': [1000]}


X_test = test.drop(['Survived'], axis=1)

X['Sex'].replace(['male','female'], [0, 1], inplace=True)
X['Embarked'].fillna(('S'), inplace=True)
X['Embarked'] = X['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
X['Fare'].fillna(np.mean(X['Fare']), inplace=True)
X['Age'].fillna(X['Age'].median(), inplace=True)
X['FamilySize'] = X['Parch'] + X['SibSp'] + 1
X['IsAlone'] = 0
X.loc[X_test['FamilySize'] == 1, 'IsAlone'] = 1

X.drop(['PassengerId', 'Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
X.fillna(0, inplace=True)



for params in tqdm(list(ParameterGrid(params_list))):
    for train_idx, val_index in fold.split(X_train, y_train):
        xx = X_train.iloc[train_idx, :]
        yy = y_train[train_idx]
        x_val = X_train.iloc[train_idx, :]
        y_val = y_train[train_idx]

        model = LogisticRegression(**params)
        model.fit(xx, yy)
        pred = model.predict(x_val)
        acc = (pred == y_val).mean()
        print(acc)



KeyError: "['Survived'] not found in axis"

In [0]:
import lightgbm as lgb

categories = ['Pclass', 'Age', 'Fare']
params = {
    'objective': 'binary'
}

# for train_idx, val_index in fold.split(X_train, y_train):
#     xx = X_train.iloc[train_idx, :]
#     yy = y_train[train_idx]
#     x_val = X_train.iloc[train_idx, :]
#     y_val = y_train[train_idx]

#     lgb_train = lgb.Dataset(xx, yy, categorical_feature=categories)
#     lgb_eval = lgb.Dataset(x_val, y_val, categorical_feature=categories)
#     model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval])
#     pred = model.predict(X_test)
#     log.info(((pred > 0.5) == y_test ).mean())
#     # acc = ((pred > 0.5) == y_val).mean()
#     print(acc)

xx, x_val, yy, y_val = train_test_split(X_train, y_train, test_size=0.25)

lgb_train = lgb.Dataset(xx, yy, categorical_feature=categories)
lgb_eval = lgb.Dataset(x_val, y_val, categorical_feature=categories)
model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval])
pred = model.predict(X_test)
log.info(((pred > 0.5) == y_test ).mean())
# acc = ((pred > 0.5) == y_val).mean()
print(acc)



In [0]:
test

In [0]:
import optuna
def objective(trial):
    x = trial.suggest_uniform('x', -2, 2)
    return 3*x**4 - 2*x**3 - 4*x**2 + 2

study = optuna.create_study()
study.optimize(objective, n_trials=100)


In [0]:
study.best_params

In [0]:
study.best_value

In [0]:
study.best_trial

In [0]:
print(len(study.trials))
plot_x = [t.params['x'] for t in study.trials]
plot_y = [t.value for t in study.trials]
best_x = [study.best_params['x']]
best_y = [study.best_value]

fig = go.Figure()
fig.add_trace(go.Scatter(x=plot_x, y=plot_y, mode='markers'))
fig.add_trace(go.Scatter(x=best_x, y=best_y, mode='markers'))
fig.show()

In [0]:

import lightgbm as lgb
from optuna.integration import LightGBMPruningCallback

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'auc',
        # 'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0)
    }

    xx, x_val, yy, y_val = train_test_split(X_train, y_train, test_size=0.25)

    lgb_train = lgb.Dataset(xx, yy)
    lgb_eval = lgb.Dataset(x_val, y_val)

    pruning_callback = LightGBMPruningCallback(trial, 'auc')
    model = lgb.train(params, lgb_train, valid_sets=[lgb_eval], callbacks=[pruning_callback])
    pred = model.predict(X_test)
    return accuracy_score(y_test, np.rint(pred))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
# print(objective(None))



In [0]:
xx, x_val, yy, y_val = train_test_split(X_train, y_train, test_size=0.25)

lgb_train = lgb.Dataset(xx, yy)
lgb_eval = lgb.Dataset(x_val, y_val)

p = {'boosting': 'gbdt', 'num_leaves': 292, 'learning_rate': 0.0626668243036112}

model = lgb.train(study.best_params, lgb_train, valid_sets=[lgb_eval])
pred = model.predict(X_test)
accuracy_score(y_test, np.rint(pred))

In [11]:
import optuna.integration.lightgbm as lgb
xx, x_val, yy, y_val = train_test_split(X_train, y_train, test_size=0.25)

lgb_train = lgb.Dataset(xx, yy)
lgb_eval = lgb.Dataset(x_val, y_val)

params = {
    'objective': 'binary',
    'verbosity': -1
}

help(lgb.train)

# model = lgb.train(params, lgb_train, valid_sets=[lgb_eval])
# pred = model.predict(X_test)
# accuracy_score(y_test, np.rint(pred))


Help on function train in module optuna.integration.lightgbm_tuner:

train(*args, **kwargs)
    Wrapper of LightGBM Training API to tune hyperparameters.
    
    
        This feature is experimental. The interface may be changed in the future.
    
    It tunes important hyperparameters (e.g., `min_child_samples` and `feature_fraction`) in a
    stepwise manner. Arguments and keyword arguments for `lightgbm.train()
    <https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.train.html>`_ can be passed.



In [0]:
study.best_params, study.best_value, study.best_trial