In [1]:
import pandas as pd
import numpy as np
import pickle
import logging
import json

from model_metrics import format_results
import data_clean_for_model
import PipelineHelper
from sklearn.preprocessing import StandardScaler



In [2]:
# Module-level logger for this notebook; the root logger is configured to emit INFO and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
### 1. Load Data
# Read the processed table from its parquet file into `df`.
df = pd.read_parquet(path="data/all_processed_df.parquet.gzip")

In [4]:
# Experiment constants.
k = 5        # number of cross-validation groups assigned below
rseed = 229  # seed for the random fold assignment

# Binary target: 1 where state == "successful", 0 otherwise.
df["outcome"] = np.where(df["state"] == "successful", 1, 0)
# Running row number used as an identifier.
df["un_id"] = np.arange(0, df.shape[0], 1)
df["name_len"] = df["name"].str.len()
# Draw fold labels from a generator seeded with `rseed` so that the assignment is
# reproducible across kernel restarts (previously the unseeded global RNG was used).
cv_rng = np.random.RandomState(rseed)
df["cv_group"] = cv_rng.choice(np.arange(0, k), size=df.shape[0])
# 20 quantile bins of log(usd_goal + 1).
df["binned_usd_goal"] = pd.qcut(np.log(df["usd_goal"] + 1), 20)

with open("model_config.json", 'r') as j:
    model_params = json.load(j)
# JSON has no tuple type, so the n-gram range arrives as a list; convert it to a tuple.
model_params['naive_bayes']['ngram_range'] = tuple(model_params['naive_bayes']['ngram_range'])

In [5]:
## load project metadata
logger.info("Loading features")
try:
    # SECURITY: pickle.load can execute arbitrary code; only load caches you created yourself.
    with open("data/features.pkl", "rb") as f:
        ft_dict = pickle.load(f)
    # NOTE(review): the cached dict is unpacked as (X_train, y_train, X_test, y_test) while
    # the fallback below returns (X_train, X_test, y_train, y_test) -- confirm that the
    # pickle was written in this key order.
    X_train, y_train, X_test, y_test = ft_dict.values()
except Exception as err:
    # Cache missing or unreadable: rebuild the features from `df`. The reason is logged
    # instead of being silently swallowed, and KeyboardInterrupt is no longer caught.
    logger.warning("Could not load cached features (%r); rebuilding from raw data", err)
    X_train, X_test, y_train, y_test = data_clean_for_model.data_clean_for_model(df, "outcome", model_params, cv=model_params["cv"])

INFO:__main__:Loading features


In [6]:
# load text
logger.info("Processing text data")
# Only the first two of the four returned values are kept; the last two are discarded.
(blurb_train, blurb_test, _, _) = data_clean_for_model.process_blurb(
    df, model_params
)

INFO:__main__:Processing text data


In [7]:
## 2. Run text models

# Cached outputs of the text models, looked up by key below.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
try:
    with open("data/res/text_models.pkl", "rb") as f:
        text_models = pickle.load(f)
except (OSError, EOFError, pickle.UnpicklingError) as err:
    # The original `raise Warning(...)` aborted the cell, so the fallbacks below could
    # never run. Log the warning and continue with an empty cache instead.
    logger.warning("Text models do not exist. Will load from scratch (%r)", err)
    text_models = {}

# get naive bayes predictions
logger.info("Loading Naive Bayes predictions")
try:
    nb_proba_train, nb_proba_test = text_models['nb_train'], text_models['nb_test']
except (KeyError, TypeError):
    logger.info("Running Naive Bayes model")
    nb_params = model_params['naive_bayes']
    nb_train_pred, nb_proba_train, nb_test_pred, nb_proba_test = PipelineHelper.naive_bayes_predictions(
        blurb_train, y_train, blurb_test,
        tfidf=nb_params['tf-idf'], ngram_range=nb_params['ngram_range']
    )
    # NOTE: these .npy files are written here but are not read back by this cell.
    np.save("data/res/multi_nb_preds_train.npy", nb_proba_train)
    np.save("data/res/multi_nb_preds_test.npy", nb_proba_test)

# get LDA topic model
logger.info("Loading LDA topic predictions")
try:
    lda_train, lda_test = text_models['lda_train'], text_models['lda_test']
except (KeyError, TypeError):
    logger.info("Running LDA topic model")
    lda_params = model_params['lda']
    tokenized_train = blurb_train.apply(data_clean_for_model.tokenize_text)
    tokenized_test = blurb_test.apply(data_clean_for_model.tokenize_text)
    # Fixed: the original passed the undefined name `params['lda']` here.
    lda_train, lda_test = PipelineHelper.train_lda_model(tokenized_train, tokenized_test, lda_params)
    # NOTE: these CSV files are written here but are not read back by this cell.
    lda_train.to_csv("data/res/lda_train.csv")
    lda_test.to_csv("data/res/lda_test.csv")

# get Word2Vec model predictions
logger.info("Loading Word2Vec dimension predictions")
try:
    w2v_train, w2v_test = text_models['w2v_train'], text_models['w2v_test']
except (KeyError, TypeError) as err:
    # There is no code path that can compute these, so fail with an accurate error type
    # and message rather than raising a Warning that claims execution will continue.
    raise NotImplementedError(
        "Word2Vec predictions are not cached in text_models and no function to compute them is implemented."
    ) from err

INFO:__main__:Loading Naive Bayes predictions
INFO:__main__:Loading LDA topic predictions
INFO:__main__:Loading Word2Vec dimension predictions


In [8]:
## HACK: change train size (only downside here is that we are not also updating our NB/LDA/W2V performance)
# Moves the first third of the test rows (and the matching rows of every text-model output
# and of the labels) into the training set.
# NOTE: this cell is not idempotent -- re-running it moves a further third of what remains.
new_train_ind = int(X_test.shape[0]/3)
X_train = pd.concat((X_train, X_test.iloc[:new_train_ind])).reset_index(drop=True)
X_test = X_test.iloc[new_train_ind:].reset_index(drop=True)

nb_proba_train = np.vstack((nb_proba_train, nb_proba_test[0:new_train_ind, :]))
nb_proba_test = nb_proba_test[new_train_ind:, :]

lda_train = pd.concat((lda_train, lda_test.iloc[:new_train_ind])).reset_index(drop=True)
lda_test = lda_test.iloc[new_train_ind:].reset_index(drop=True)

# The Word2Vec outputs are wrapped in DataFrames so they can be concatenated like the others.
w2v_test_df = pd.DataFrame(w2v_test)
w2v_train = pd.concat((pd.DataFrame(w2v_train), w2v_test_df.iloc[:new_train_ind])).reset_index(drop=True)
w2v_test = w2v_test_df.iloc[new_train_ind:].reset_index(drop=True)

y_train = pd.concat((y_train, y_test.iloc[:new_train_ind])).reset_index(drop=True)
y_test = y_test.iloc[new_train_ind:].reset_index(drop=True)

# Every feature block AND the labels must have the same number of rows
# (the labels were previously left out of this check).
assert X_train.shape[0] == nb_proba_train.shape[0] == lda_train.shape[0] == w2v_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == nb_proba_test.shape[0] == lda_test.shape[0] == w2v_test.shape[0] == y_test.shape[0]

In [8]:
import tensorflow as tf
from tensorflow import keras

In [10]:
from tensorflow.keras.models import Sequential 
from keras.layers import Dense

In [11]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [12]:
from sklearn.model_selection import StratifiedKFold

In [17]:
# Keep the identifier columns aside, then remove them from the model inputs.
# NOTE: re-running this cell fails because the columns are already dropped.
id_vars = ["un_id", "cv_group"]
id_train, id_test = X_train[id_vars], X_test[id_vars]
X_train, X_test = X_train.drop(columns=id_vars), X_test.drop(columns=id_vars)

In [18]:
# NB
# Copies of the feature frames with column 1 of the Naive Bayes probability arrays added
# (presumably the positive-class probability -- confirm against the NB helper).
X_train_nb = X_train.copy()
X_train_nb['nb_proba'] = nb_proba_train[:, 1]

X_test_nb = X_test.copy()
X_test_nb['nb_proba'] = nb_proba_test[:, 1]

In [19]:
# Pass 1: standardise the length and time columns; the scaler is fitted on the training
# frame and then applied to the test frame.
scaler = StandardScaler()
time_vars = ["deadline", "launched_at", "time_diff"]
scale_vars = ["blurb_len", "name_len"] + time_vars

X_train_nb_scale = X_train_nb.copy()
X_train_nb_scale[scale_vars] = scaler.fit_transform(X_train_nb[scale_vars])
X_test_nb_scale = X_test_nb.copy()
X_test_nb_scale[scale_vars] = scaler.transform(X_test_nb[scale_vars])

# Pass 2: the same scaler object is re-fitted on every column whose name contains
# "usd_goal"; after this cell `scaler` holds the statistics of those columns only.
X_train_nb_scale2 = X_train_nb_scale.copy()
X_test_nb_scale2 = X_test_nb_scale.copy()
usd_goal_cols = X_train_nb_scale2.columns[
    X_train_nb_scale2.columns.str.contains("usd_goal")
]
X_train_nb_scale2[usd_goal_cols] = scaler.fit_transform(X_train_nb_scale2[usd_goal_cols])
X_test_nb_scale2[usd_goal_cols] = scaler.transform(X_test_nb_scale2[usd_goal_cols])

In [31]:
def create_baseline():
    """Build and compile a network with one 3-unit ReLU hidden layer and a sigmoid output.

    The input width is read from the global ``X_train_nb_scale2`` when the function is called.
    """
    net = Sequential([
        Dense(3, input_dim=X_train_nb_scale2.shape[1], activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 5-fold stratified cross-validation on the first 10,000 training rows only.
est = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=10, verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=229)
results = cross_val_score(
    est,
    X_train_nb_scale2.iloc[0:10000],
    y_train[0:10000],
    cv=kfold,
    verbose=5,
    n_jobs=4,
)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed:  2.1min remaining:  3.2min


Baseline: 83.02% (1.09%)


[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:  3.2min finished


In [33]:
def create_baseline():
    """Build and compile a network with one 5-unit ReLU hidden layer and a sigmoid output.

    The input width is read from the global ``X_train_nb_scale2`` when the function is called.
    """
    net = Sequential([
        Dense(5, input_dim=X_train_nb_scale2.shape[1], activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 5-fold stratified cross-validation on the full training frame.
est = KerasClassifier(build_fn=create_baseline, epochs=30, batch_size=5, verbose=1)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=229)
results = cross_val_score(
    est, X_train_nb_scale2, y_train, cv=kfold, verbose=5, n_jobs=4
)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   2 out of   5 | elapsed: 24.7min remaining: 37.0min
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed: 36.3min finished


Baseline: 84.82% (0.55%)


In [34]:
# Display the array of per-fold scores held in `results`.
results

array([0.85101694, 0.84870058, 0.85384178, 0.8495155 , 0.83782029])

In [34]:
def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier.

    Parameters
    ----------
    input_layer : int
        Number of units in the SELU hidden layer.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, adam optimizer, accuracy
    metric) whose input width is read from the global ``X_train_nb_scale2``.
    """
    net = Sequential()
    n_features = X_train_nb_scale2.shape[1]
    net.add(Dense(input_layer, input_dim=n_features, activation='selu'))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

In [36]:
# Train the default (25-unit) network with fixed per-class weights; the test frame is
# passed as validation data, so it is evaluated after every epoch.
mod = create_baseline()
mod.fit(
    X_train_nb_scale2,
    y_train,
    validation_data=(X_test_nb_scale2, y_test),
    batch_size=5,
    epochs=20,
    class_weight={0: 1.24764767, 1: 0.8343821},
)
y_pred = mod.predict(X_test_nb_scale2)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


0.815683615819209

In [37]:
# `calculate_performance` is otherwise imported only in a later cell, so a top-to-bottom
# run would raise NameError here; import it where it is first used.
from model_metrics import calculate_performance

# Metrics for the rounded test predictions, rounded to two decimals for display.
np.round(calculate_performance(y_test, np.round(y_pred.ravel())),2)

array([0.82, 0.84, 0.88, 0.74, 0.8 , 0.84, 0.82, 0.18])

In [38]:
# Persist the current `mod` to disk.
# NOTE(review): the file name says "epochs30", but the fit cell directly above trains
# for 20 epochs -- confirm which model this file is meant to hold.
mod.save("data/nns/metadata_nb_nn_batch5_epochs30_dim_25.tf")

INFO:tensorflow:Assets written to: data/nns/metadata_nb_nn_batch5_epochs30_dim_25.tf/assets


In [28]:
# 50 hidden units, batches of 3, ten epochs, no class weighting.
mod = create_baseline(50)
mod.fit(
    X_train_nb_scale2,
    y_train,
    validation_data=(X_test_nb_scale2, y_test),
    batch_size=3,
    epochs=10,
)
y_pred = mod.predict(X_test_nb_scale2)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.8114500941619586

In [None]:
# Persist the current `mod` to disk.
# NOTE(review): the file name says "batch5_epochs7", but the fit cell directly above uses
# batch_size=3 and epochs=10 -- confirm which model this file is meant to hold.
mod.save("data/nns/nn_batch5_epochs7_dim50.tf")

In [15]:
def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier.

    ``input_layer`` is the number of SELU hidden units; the input width is read from the
    global ``X_train_nb_scale2`` when the function is called.
    """
    net = Sequential([
        Dense(input_layer, input_dim=X_train_nb_scale2.shape[1], activation='selu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 25 hidden units, batches of 5, fifteen epochs, no class weighting.
mod = create_baseline(25)
mod.fit(
    X_train_nb_scale2,
    y_train,
    validation_data=(X_test_nb_scale2, y_test),
    batch_size=5,
    epochs=15,
)
y_pred = mod.predict(X_test_nb_scale2)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/15

KeyboardInterrupt: 

In [29]:
from sklearn.utils import class_weight

# "balanced" weights for the labels in y_train.
# The arguments are passed by keyword: scikit-learn deprecated positional use of
# `classes` and `y`, and it becomes an error from version 1.0.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train.reset_index(drop=True),
)
class_weights

1         0
2         1
3         0
4         1
         ..
154868    1
154869    1
154870    1
154871    1
154872    1
Name: outcome, Length: 154873, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


array([1.24764767, 0.8343821 ])

In [30]:
def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier that excludes ``nb_proba``.

    ``input_layer`` is the number of SELU hidden units; the input width is the column
    count of the global ``X_train_nb_scale2`` after dropping ``nb_proba``.
    """
    n_features = X_train_nb_scale2.drop(columns="nb_proba").shape[1]
    net = Sequential()
    net.add(Dense(input_layer, input_dim=n_features, activation='selu'))
    net.add(Dense(1, activation='sigmoid'))
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# Same training setup with the Naive Bayes probability column removed from both frames.
mod = create_baseline(25)
mod.fit(
    X_train_nb_scale2.drop(columns="nb_proba"),
    y_train,
    validation_data=(X_test_nb_scale2.drop(columns="nb_proba"), y_test),
    batch_size=5,
    epochs=15,
    class_weight={0: 1.24764767, 1: 0.8343821},
)
y_pred = mod.predict(X_test_nb_scale2.drop(columns="nb_proba"))
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


0.799984934086629

In [33]:
# Persist the current `mod` to disk.
mod.save("data/nns/metadata_nn_batch5_epoch15_dim25.tf")

INFO:tensorflow:Assets written to: data/nns/metadata_nn_batch5_epoch15_dim25.tf/assets


In [31]:
from model_metrics import calculate_performance

In [32]:
# Metrics for the rounded test predictions, rounded to two decimals for display.
np.round(calculate_performance(y_test, np.round(y_pred.ravel())),2)

array([0.8 , 0.82, 0.9 , 0.7 , 0.75, 0.88, 0.81, 0.2 ])

INFO:tensorflow:Assets written to: data/nns/nn_batch5_epochs7_dim50.tf/assets


In [30]:
# Put the LDA columns next to the scaled metadata + NB features, then standardise only
# the LDA columns (the scaler is re-fitted on the training frame).
lda_cols = lda_train.columns
X_train_nb_lda_scale = pd.concat([X_train_nb_scale2, lda_train], axis=1).copy()
X_test_nb_lda_scale = pd.concat([X_test_nb_scale2, lda_test], axis=1).copy()
X_train_nb_lda_scale[lda_cols] = scaler.fit_transform(X_train_nb_lda_scale[lda_cols])
X_test_nb_lda_scale[lda_cols] = scaler.transform(X_test_nb_lda_scale[lda_cols])

In [37]:
def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier for the metadata + NB + LDA frame.

    ``input_layer`` is the number of ReLU hidden units; the input width is read from the
    global ``X_train_nb_lda_scale`` when the function is called.
    """
    net = Sequential([
        Dense(input_layer, input_dim=X_train_nb_lda_scale.shape[1], activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 50 hidden units, batches of 5, seven epochs.
mod = create_baseline(50)
mod.fit(
    X_train_nb_lda_scale,
    y_train,
    validation_data=(X_test_nb_lda_scale, y_test),
    batch_size=5,
    epochs=7,
)
y_pred = mod.predict(X_test_nb_lda_scale)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


0.8121431261770244

In [33]:
# Same frame as the previous model, with 25 hidden units.
mod2 = create_baseline(25)
mod2.fit(
    X_train_nb_lda_scale,
    y_train,
    validation_data=(X_test_nb_lda_scale, y_test),
    batch_size=5,
    epochs=7,
)
y_pred = mod2.predict(X_test_nb_lda_scale)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


0.8138154425612053

In [34]:
# Put the Word2Vec columns next to the scaled metadata + NB features, then standardise
# only the Word2Vec columns (the scaler is re-fitted on the training frame).
w2v_cols = w2v_train.columns
X_train_nb_w2v_scale = pd.concat([X_train_nb_scale2, w2v_train], axis=1).copy()
X_test_nb_w2v_scale = pd.concat([X_test_nb_scale2, w2v_test], axis=1).copy()
X_train_nb_w2v_scale[w2v_cols] = scaler.fit_transform(X_train_nb_w2v_scale[w2v_cols])
X_test_nb_w2v_scale[w2v_cols] = scaler.transform(X_test_nb_w2v_scale[w2v_cols])

In [38]:
def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier for the metadata + NB + Word2Vec frame.

    ``input_layer`` is the number of ReLU hidden units; the input width is read from the
    global ``X_train_nb_w2v_scale`` when the function is called.
    """
    net = Sequential([
        Dense(input_layer, input_dim=X_train_nb_w2v_scale.shape[1], activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 25 hidden units, batches of 5, ten epochs.
mod3 = create_baseline(25)
mod3.fit(
    X_train_nb_w2v_scale,
    y_train,
    validation_data=(X_test_nb_w2v_scale, y_test),
    batch_size=5,
    epochs=10,
)
y_pred = mod3.predict(X_test_nb_w2v_scale)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.8066290018832392

In [39]:
# Standardise EVERY column of the metadata + NB frames (the results are plain arrays).
tmp_train = scaler.fit_transform(X_train_nb)
tmp_test = scaler.transform(X_test_nb)


def create_baseline(input_layer=25):
    """Build and compile a one-hidden-layer binary classifier for the fully scaled arrays.

    ``input_layer`` is the number of ReLU hidden units; the input width is read from the
    global ``tmp_train`` when the function is called.
    """
    net = Sequential([
        Dense(input_layer, input_dim=tmp_train.shape[1], activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    net.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net


# 25 hidden units, batches of 5, ten epochs.
tmp_mod = create_baseline(25)
tmp_mod.fit(
    tmp_train,
    y_train,
    validation_data=(tmp_test, y_test),
    batch_size=5,
    epochs=10,
)
y_pred = tmp_mod.predict(tmp_test)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.8178229755178907

In [41]:
# Same fully scaled arrays, with 35 hidden units and 14 epochs.
tmp_mod2 = create_baseline(35)
tmp_mod2.fit(
    tmp_train,
    y_train,
    validation_data=(tmp_test, y_test),
    batch_size=5,
    epochs=14,
)
y_pred = tmp_mod2.predict(tmp_test)
# Fraction of test rows whose rounded prediction equals the label.
(np.round(y_pred).flatten() == y_test).sum() / len(y_test)

Epoch 1/14
Epoch 2/14
Epoch 3/14
Epoch 4/14
Epoch 5/14
Epoch 6/14
Epoch 7/14
Epoch 8/14
Epoch 9/14
Epoch 10/14
Epoch 11/14
Epoch 12/14
Epoch 13/14
Epoch 14/14


0.8167532956685499

In [15]:
from sklearn.model_selection import GridSearchCV

In [130]:
def create_baseline(layer=1, output_dim=3, init_activation='sigmoid', final_activation="sigmoid", optimizer='adam'):
    """Build and compile a binary classifier with 1, 2 or 3 hidden layers.

    Parameters
    ----------
    layer : int
        Depth selector. 1 -> first layer only; 2 -> first layer + Dense(3, relu);
        3 -> first layer + Dense(5, relu) + Dense(2, relu). Any other value adds no
        hidden layer at all.
    output_dim : int
        Units of the first hidden layer.
    init_activation : str
        Activation of the first hidden layer.
    final_activation : str
        Activation of the single output unit.
    optimizer : str
        Optimizer passed to ``compile``.

    Returns
    -------
    A compiled Sequential model. The numpy and TensorFlow seeds are reset to 229 on
    every call; the input width is read from the global ``X_train_nb_scale2``.
    """
    np.random.seed(229)
    tf.random.set_seed(229)
    net = Sequential()
    # (depth, units of the extra ReLU layers stacked after the first hidden layer)
    for depth, extra_units in ((1, ()), (2, (3,)), (3, (5, 2))):
        if layer == depth:
            net.add(Dense(output_dim, input_dim=X_train_nb_scale2.shape[1], activation=init_activation))
            for units in extra_units:
                net.add(Dense(units, activation='relu'))
    net.add(Dense(1, activation=final_activation))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

In [131]:
# Wrap `create_baseline` in KerasClassifier so it can be passed to GridSearchCV as an estimator.
model_grid = KerasClassifier(build_fn=create_baseline, verbose=1)

In [135]:
# Candidate values; only `epochs` and the output activation vary (3 x 2 = 6 candidates).
epochs = [75, 100, 150]
batches = [3]
layers = [1]
output_dims = [30]
activations = ['selu']
fin_activations = ['sigmoid', 'tanh']

param_grid = {
    "epochs": epochs,
    "batch_size": batches,
    "layer": layers,
    "output_dim": output_dims,
    "init_activation": activations,
    "final_activation": fin_activations,
}
# 2-fold grid search over the wrapped Keras model.
ggrid = GridSearchCV(estimator=model_grid, param_grid=param_grid, cv=2, verbose=3, n_jobs=6)

In [136]:
# Run the grid search on the training frame; `validation_data` is passed as an extra fit
# argument, so the test frame is used as validation data during each fit.
ggrid.fit(X_train_nb_scale2, y_train, validation_data=(X_test_nb_scale2,y_test))

Fitting 2 folds for each of 6 candidates, totalling 12 fits
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150


GridSearchCV(cv=2,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fd664fdb220>,
             n_jobs=6,
             param_grid={'batch_size': [3], 'epochs': [75, 100, 150],
                         'final_activation': ['sigmoid', 'tanh'],
                         'init_activation': ['selu'], 'layer': [1],
                         'output_dim': [30]},
             verbose=3)

In [137]:
# Predict on the test frame with the fitted grid search and flatten to one dimension.
nn_pred = ggrid.predict(X_test_nb_scale2).flatten()
# Fraction of test rows whose rounded prediction equals the label.
(np.round(nn_pred) == y_test).sum() / len(y_test)





0.8195932203389831

In [142]:
# params --> acc 0.820
# The dict literal below is a bare expression kept as a record of the parameters; it is
# not assigned to anything and has no effect when the cell runs.
{'batch_size': 3,
 'epochs': 150,
 'final_activation': 'sigmoid',
 'init_activation': 'selu',
 'layer': 1,
 'output_dim': 30}
# Save the Keras model held by the grid search's best estimator.
ggrid.best_estimator_.model.save("nn.tf")

INFO:tensorflow:Assets written to: nn.tf/assets


In [149]:
# Metadata + NB features with the LDA columns appended; only the LDA columns are
# standardised here (the scaler is re-fitted on the training frame).
lda_cols = lda_train.columns
X_train_nb_lda = pd.concat([X_train_nb_scale2, lda_train], axis=1)
X_test_nb_lda = pd.concat([X_test_nb_scale2, lda_test], axis=1)
X_train_nb_lda[lda_cols] = scaler.fit_transform(X_train_nb_lda[lda_cols])
X_test_nb_lda[lda_cols] = scaler.transform(X_test_nb_lda[lda_cols])

In [155]:
def create_baseline(layer=1, output_dim=3, init_activation='sigmoid', final_activation="sigmoid", optimizer='adam'):
    """Build and compile a binary classifier with 1, 2 or 3 hidden layers for the LDA frame.

    Parameters
    ----------
    layer : int
        Depth selector. 1 -> first layer only; 2 -> first layer + a ReLU layer with
        half as many units; 3 -> first layer + Dense(5, relu) + Dense(2, relu).
        Any other value adds no hidden layer at all.
    output_dim : int
        Units of the first hidden layer.
    init_activation : str
        Activation of the first hidden layer.
    final_activation : str
        Activation of the single output unit.
    optimizer : str
        Optimizer passed to ``compile``.

    Returns
    -------
    A compiled Sequential model. The numpy and TensorFlow seeds are reset to 229 on
    every call; the input width is read from the global ``X_train_nb_lda``.
    """
    np.random.seed(229)
    tf.random.set_seed(229)
    model = Sequential()
    if layer==1:
        model.add(Dense(output_dim, input_dim=X_train_nb_lda.shape[1], activation=init_activation))
    if layer==2:
        model.add(Dense(output_dim, input_dim=X_train_nb_lda.shape[1], activation=init_activation))
        # Unit counts must be integers: `output_dim / 2` produced a float (e.g. 12.5).
        # Floor division gives the truncated value, and max() keeps at least one unit.
        half_units = max(1, output_dim // 2)
        model.add(Dense(half_units, activation='relu'))
    if layer==3:
        model.add(Dense(output_dim, input_dim=X_train_nb_lda.shape[1], activation=init_activation))
        model.add(Dense(5, activation='relu'))
        model.add(Dense(2, activation='relu'))
    model.add(Dense(1, activation=final_activation))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [160]:
# Single-candidate grid: every list holds exactly one value.
epochs = [30]
batches = [5]
layers = [1]
output_dims = [25]
activations = ['selu']
fin_activations = ['sigmoid']

model_grid = KerasClassifier(build_fn=create_baseline, verbose=1)
param_grid = {
    "epochs": epochs,
    "batch_size": batches,
    "layer": layers,
    "output_dim": output_dims,
    "init_activation": activations,
    "final_activation": fin_activations,
}
# 3-fold grid search over the wrapped Keras model.
ggrid = GridSearchCV(estimator=model_grid, param_grid=param_grid, cv=3, verbose=3, n_jobs=6)

In [161]:
# Run the grid search on the metadata + NB + LDA training frame; the test frame is passed
# through as validation data for each fit.
ggrid.fit(X_train_nb_lda, y_train, validation_data=(X_test_nb_lda,y_test))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


GridSearchCV(cv=3,
             estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7fd5d277d430>,
             n_jobs=6,
             param_grid={'batch_size': [5], 'epochs': [30],
                         'final_activation': ['sigmoid'],
                         'init_activation': ['selu'], 'layer': [1, 2],
                         'output_dim': [20, 30, 50, 100]},
             verbose=3)

In [164]:
# Display the parameter combination selected by the grid search.
ggrid.best_params_

{'batch_size': 5,
 'epochs': 30,
 'final_activation': 'sigmoid',
 'init_activation': 'selu',
 'layer': 1,
 'output_dim': 20}

In [163]:
# Predict on the LDA test frame with the fitted grid search and flatten to one dimension.
nn_pred = ggrid.predict(X_test_nb_lda).flatten()
# Fraction of test rows whose rounded prediction equals the label.
(np.round(nn_pred) == y_test).sum() / len(y_test)





0.8143050847457627

In [165]:
from sklearn.svm import SVC

In [166]:
# RBF-kernel support vector classifier with a fixed random_state.
svm = SVC(random_state=229, kernel='rbf')

In [179]:
# Fit the SVM on the first 10,000 training rows only.
# NOTE(review): X_train_nb_w2v is assigned in a cell further down this notebook, so on a
# fresh top-to-bottom run this line raises NameError -- that cell must be run first.
svm.fit(X_train_nb_w2v[0:10000], y_train[0:10000])

SVC(random_state=229)

In [180]:
# Score the SVM on at most the first 10,000 test rows. The accuracy is the mean over the
# rows actually scored; the previous hard-coded `/ 10000` was wrong for a shorter test set.
n_eval = min(10000, X_test_nb_w2v.shape[0])
ypred = svm.predict(X_test_nb_w2v.iloc[:n_eval])
(ypred == y_test.iloc[:n_eval].to_numpy()).mean()

0.7595

In [178]:
# Metadata + NB features with the Word2Vec columns appended; only the Word2Vec columns
# are standardised here (the scaler is re-fitted on the training frame).
w2v_cols = w2v_train.columns
X_train_nb_w2v = pd.concat([X_train_nb_scale2, pd.DataFrame(w2v_train)], axis=1)
X_test_nb_w2v = pd.concat([X_test_nb_scale2, pd.DataFrame(w2v_test)], axis=1)
X_train_nb_w2v[w2v_cols] = scaler.fit_transform(X_train_nb_w2v[w2v_cols])
X_test_nb_w2v[w2v_cols] = scaler.transform(X_test_nb_w2v[w2v_cols])

In [171]:
def create_baseline(layer=1, output_dim=3, init_activation='sigmoid', final_activation="sigmoid", optimizer='adam'):
    """Build and compile a binary classifier for the metadata + NB + Word2Vec frame.

    Parameters
    ----------
    layer : int
        When equal to 1, one hidden layer of ``output_dim`` units is added; for any
        other value the model consists of the output layer only.
    output_dim : int
        Units of the hidden layer.
    init_activation : str
        Activation of the hidden layer.
    final_activation : str
        Activation of the single output unit.
    optimizer : str
        Optimizer passed to ``compile``.

    Returns
    -------
    A compiled Sequential model. The numpy and TensorFlow seeds are reset to 229 on
    every call; the input width is read from the global ``X_train_nb_w2v``.
    """
    np.random.seed(229)
    tf.random.set_seed(229)
    net = Sequential()
    if layer == 1:
        n_features = X_train_nb_w2v.shape[1]
        net.add(Dense(output_dim, input_dim=n_features, activation=init_activation))
    net.add(Dense(1, activation=final_activation))
    net.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return net

In [181]:
# Only the hidden-layer width varies in this grid (4 candidates).
epochs = [100]
batches = [3]
layers = [1]
output_dims = [20, 25, 30, 35]
activations = ['selu']
fin_activations = ['sigmoid']

model_grid = KerasClassifier(build_fn=create_baseline, verbose=1)
param_grid = {
    "epochs": epochs,
    "batch_size": batches,
    "layer": layers,
    "output_dim": output_dims,
    "init_activation": activations,
    "final_activation": fin_activations,
}
# 3-fold grid search over the wrapped Keras model.
ggrid = GridSearchCV(estimator=model_grid, param_grid=param_grid, cv=3, verbose=3, n_jobs=6)

In [182]:
# Run the grid search on the metadata + NB + Word2Vec training frame; the test frame is
# passed through as validation data for each fit.
ggrid.fit(X_train_nb_w2v, y_train, validation_data=(X_test_nb_w2v,y_test))

Fitting 3 folds for each of 4 candidates, totalling 12 fits


KeyboardInterrupt: 

In [186]:
# Number of distinct values in the `loc_id` column of `df`.
df.loc_id.nunique()

16742

In [124]:
# model w/ params acc -> 0.821
# The dict literal below is a bare expression kept as a record of the parameters; it is
# not assigned to anything and has no effect when the cell runs.
{'batch_size': 5,
 'epochs': 50,
 'init_activation': 'selu',
 'layer': 1,
 'output_dim': 30}
# Fraction of test rows whose rounded `nn_pred` value equals the label.
# NOTE(review): `nn_pred` is whatever an earlier cell last assigned -- confirm it comes
# from the model described above.
(np.round(nn_pred) == y_test).sum() / y_test.shape[0]

0.8207909604519774