In [1]:
import pandas as pd
import numpy as np
import pickle
import logging
import json

from model_metrics import format_results
import data_clean_for_model
import PipelineHelper



In [2]:
# Root logging configuration: timestamped, level-padded messages at INFO and above.
_log_settings = dict(
    level=logging.INFO,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)
logging.basicConfig(**_log_settings)

# Module-level logger used by every cell below.
logger = logging.getLogger(__name__)

In [3]:
### 1. Load Data
# Read the processed dataset from parquet; every later cell derives from `df`.
df = pd.read_parquet("data/all_processed_df.parquet.gzip")

In [4]:
# Derived columns on the raw table plus the model configuration.
k = 5        # number of cross-validation groups
rseed = 229  # seed for the fold assignment below

# Binary target: 1 where state == "successful", 0 otherwise.
df["outcome"] = np.where(df["state"] == "successful", 1, 0)
# Sequential row identifier (0 .. n-1).
df["un_id"] = np.arange(0, df.shape[0], 1)
df["name_len"] = df["name"].str.len()
# Draw the fold labels from a dedicated generator seeded with `rseed`, so the
# assignment is identical on every run and the global NumPy RNG is untouched.
df["cv_group"] = np.random.RandomState(rseed).choice(np.arange(0, k), size=df.shape[0])
# 20 quantile bins of log(usd_goal + 1).
df["binned_usd_goal"] = pd.qcut(np.log(df["usd_goal"] + 1), 20)

with open("model_config.json", 'r') as j:
    model_params = json.load(j)
# JSON has no tuple type, so ngram_range is read back as a list; convert it.
model_params['naive_bayes']['ngram_range'] = tuple(model_params['naive_bayes']['ngram_range'])

In [5]:
## load project metadata
# Load the train/test feature split from the pickle cache when possible;
# otherwise rebuild it with data_clean_for_model and refresh the cache.
logger.info("Loading features")
_refresh = False  # set to True to ignore the cache and rebuild the features
_feature_keys = ('X_train', 'y_train', 'X_test', 'y_test')
_cached_features = None
if not _refresh:
    try:
        with open("data/features.pkl", "rb") as f:
            # NOTE: pickle can execute arbitrary code; only load trusted files.
            ft_dict = pickle.load(f)
        # Look the pieces up by key rather than relying on dict value order.
        _cached_features = [ft_dict[_key] for _key in _feature_keys]
    except Exception as err:
        # Best effort: any cache problem falls back to a rebuild, but the
        # reason is logged instead of being silently swallowed.
        logger.warning("Could not use cached features (%r); rebuilding", err)

if _cached_features is not None:
    X_train, y_train, X_test, y_test = _cached_features
else:
    X_train, X_test, y_train, y_test = data_clean_for_model.data_clean_for_model(
        df, "outcome", model_params, cv=model_params["cv"]
    )
    with open("data/features.pkl", "wb") as f:
        pickle.dump({
            'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test
        }, f)

2021-06-02 23:05:21 INFO     Loading features


In [6]:
# load text
logger.info("Processing text data")
# process_blurb returns four values; only the first two (train/test blurbs)
# are kept, the remaining two are discarded.
blurb_train, blurb_test, _, _    = data_clean_for_model.process_blurb(df, model_params)

2021-06-02 23:05:21 INFO     Processing text data


In [7]:
## 2. Run text models

# Cached text-model outputs are read from one pickle keyed by
# 'nb_*', 'lda_*' and 'w2v_*'. Any entry that is missing is recomputed below.
try:
    with open("data/res/text_models.pkl", "rb") as f:
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        text_models = pickle.load(f)
except FileNotFoundError:
    # Warn and continue with an empty cache so the fallbacks below actually run
    # (raising here would abort the cell before anything is rebuilt).
    logger.warning("Text models do not exist. Will load from scratch")
    text_models = {}

# get naive bayes predictions
logger.info("Loading Naive Bayes predictions")
try:
    nb_proba_train, nb_proba_test = text_models['nb_train'], text_models['nb_test']
except KeyError:
    logger.info("Running Naive Bayes model")
    nb_params = model_params['naive_bayes']
    nb_train_pred, nb_proba_train, nb_test_pred, nb_proba_test = PipelineHelper.naive_bayes_predictions(
        blurb_train, y_train, blurb_test,
        tfidf=nb_params['tf-idf'], ngram_range=nb_params['ngram_range']
    )
    np.save("data/res/multi_nb_preds_train.npy", nb_proba_train)
    np.save("data/res/multi_nb_preds_test.npy", nb_proba_test)

# get LDA topic model
logger.info("Loading LDA topic predictions")
try:
    lda_train, lda_test = text_models['lda_train'], text_models['lda_test']
except KeyError:
    logger.info("Running LDA topic model")
    lda_params = model_params['lda']
    tokenized_train = blurb_train.apply(data_clean_for_model.tokenize_text)
    tokenized_test = blurb_test.apply(data_clean_for_model.tokenize_text)
    # Pass the LDA settings read above (the name `params` was never defined).
    lda_train, lda_test = PipelineHelper.train_lda_model(tokenized_train, tokenized_test, lda_params)
    lda_train.to_csv("data/res/lda_train.csv")
    lda_test.to_csv("data/res/lda_test.csv")

# get Word2Vec model predictions
logger.info("Loading Word2Vec dimension predictions")
try:
    w2v_train, w2v_test = text_models['w2v_train'], text_models['w2v_test']
except KeyError as err:
    # There is no code path that recomputes Word2Vec features, so fail fast
    # with a real error type instead of raising a Warning instance.
    raise NotImplementedError(
        "Word2Vec function not implemented; cached 'w2v_train'/'w2v_test' "
        "entries are required in data/res/text_models.pkl."
    ) from err

2021-06-02 23:05:24 INFO     Loading Naive Bayes predictions
2021-06-02 23:05:24 INFO     Loading LDA topic predictions
2021-06-02 23:05:24 INFO     Loading Word2Vec dimension predictions


In [8]:
# Optional maintenance switches; both are off by default.
_ignore=True  # set to False to overwrite the cached text-model outputs
if not _ignore:
    text_models = {
        'nb_train':nb_proba_train,
        'nb_test':nb_proba_test,
        'lda_train':lda_train,
        'lda_test':lda_test,
        'w2v_train':w2v_train,
        'w2v_test':w2v_test
    }
    # Context manager closes the file even if pickling fails part-way.
    with open("data/res/text_models.pkl", "wb") as f:
        pickle.dump(text_models, f)

_increment_rseed = False  # set to True to bump the seed stored in model_params
if _increment_rseed:
    model_params['rseed'] += 1

In [9]:
# Peel the identifier columns off the feature matrices so they are not used
# as predictors; keep them separately for later joins.
id_vars = ["un_id", "cv_group"]
id_train, id_test = X_train[id_vars], X_test[id_vars]
X_train, X_test = X_train.drop(columns=id_vars), X_test.drop(columns=id_vars)

In [10]:
# Display the model configuration currently in effect.
model_params

{'cv': True,
 'test_frac': 0.3,
 'lnom_usdgoal': True,
 'dummies': True,
 'rseed': 229,
 'naive_bayes': {'tf-idf': True, 'ngram_range': (1, 1)},
 'lda': {'corpus': {'no_below': 10, 'no_above': 0.35},
  'n_topics': 20,
  'chunksize': 100,
  'passes': 50,
  'rseed': 229},
 'linear_models': {'lasso_alpha': 0.75,
  'ridge_alpha': 0.75,
  'logreg_C': 1000,
  'logreg_penalty': 'none'},
 'lightgbm': {'bagging_fraction': 0.75,
  'feature_fraction': 0.2,
  'max_depth': 55,
  'max_bin': 500,
  'num_leaves': 400,
  'lambda_l1': 0,
  'lambda_l2': 0},
 'random_forest': {'bootstrap': False,
  'max_depth': 55,
  'max_features': 'auto',
  'min_samples_leaf': 10,
  'min_samples_split': 2,
  'n_estimators': 200},
 'svm': {'C': 0.1, 'dual': False, 'class_weight': 'balanced'}}

In [11]:
### a. Just on metadata
# Baseline: fit the model suite on the metadata features only.
logger.info("Getting metadata results")
_metadata_run = PipelineHelper.run_analyses(
    X_train, y_train,
    X_test, y_test, model_params
)
stat_df, pred_df, models = _metadata_run

2021-06-02 23:05:24 INFO     Getting metadata results
2021-06-02 23:05:24 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
2021-06-02 23:05:34 INFO     Fitting lightgbm




2021-06-02 23:05:38 INFO     Fitting random forest


In [12]:
### b. Just on metadata, - binned_usd_goal_outcome_mean
# Same as (a) but with the binned-goal outcome-mean feature removed.
logger.info("Getting metadata - binned_usd_goal_outcome_mean results")
_binned_col = ['binned_usd_goal_outcome_mean']
_train_nobinusd = X_train.drop(columns=_binned_col)
_test_nobinusd = X_test.drop(columns=_binned_col)
stat_df_nobinusd, pred_df_nobinusd, models_nobinusd = PipelineHelper.run_analyses(
    _train_nobinusd, y_train, _test_nobinusd, y_test, model_params)

2021-06-02 23:07:45 INFO     Getting metadata - binned_usd_goal_outcome_mean results
2021-06-02 23:07:45 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-06-02 23:07:54 INFO     Fitting lightgbm




2021-06-02 23:07:59 INFO     Fitting random forest


In [13]:
### c. Just on metadata + nb 
logger.info("Getting metadata + naive bayes results")
# Work on copies so the plain metadata matrices stay untouched.
X_train_nb, X_test_nb = X_train.copy(), X_test.copy()
# NB: add column 1 of the naive bayes probability arrays as a feature.
_nb_col1_train = nb_proba_train[:, 1]
_nb_col1_test = nb_proba_test[:, 1]
X_train_nb['nb_proba'] = _nb_col1_train
X_test_nb['nb_proba'] = _nb_col1_test
_nb_run = PipelineHelper.run_analyses(X_train_nb, y_train, X_test_nb, y_test, model_params)
stat_df_nb, pred_df_nb, models_nb = _nb_run

2021-06-02 23:09:58 INFO     Getting metadata + naive bayes results
2021-06-02 23:09:58 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
2021-06-02 23:10:07 INFO     Fitting lightgbm




2021-06-02 23:10:11 INFO     Fitting random forest


In [17]:
### d. Just on metadata + nb + lda
logger.info("Getting metadata + naive bayes + LDA results")
# Append the LDA columns to the metadata + nb matrices (column-wise concat).
X_train_nb_lda = pd.concat([X_train_nb, lda_train], axis="columns")
X_test_nb_lda = pd.concat([X_test_nb, lda_test], axis="columns")
_nb_lda_run = PipelineHelper.run_analyses(
    X_train_nb_lda, y_train,
    X_test_nb_lda, y_test, model_params
)
stat_df_nb_lda, pred_df_nb_lda, models_nb_lda = _nb_lda_run

2021-06-02 23:13:42 INFO     Getting metadata + naive bayes + LDA results
2021-06-02 23:13:42 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
2021-06-02 23:13:52 INFO     Fitting lightgbm
2021-06-02 23:13:59 INFO     Fitting random forest


In [18]:
### e. Just on metadata + nb + w2v
logger.info("Getting metadata + naive bayes + w2v results")
# Wrap the Word2Vec outputs in DataFrames, then append them column-wise.
_w2v_train_frame = pd.DataFrame(w2v_train)
_w2v_test_frame = pd.DataFrame(w2v_test)
X_train_nb_w2v = pd.concat([X_train_nb, _w2v_train_frame], axis="columns")
X_test_nb_w2v = pd.concat([X_test_nb, _w2v_test_frame], axis="columns")
_nb_w2v_run = PipelineHelper.run_analyses(
    X_train_nb_w2v, y_train,
    X_test_nb_w2v, y_test, model_params
)
stat_df_nb_w2v, pred_df_nb_w2v, models_nb_w2v = _nb_w2v_run

2021-06-02 23:15:23 INFO     Getting metadata + naive bayes + w2v results
2021-06-02 23:15:26 INFO     Fitting linear models
2021-06-02 23:15:57 INFO     Fitting lightgbm




2021-06-02 23:16:26 INFO     Fitting random forest


In [19]:
### f. Just on metadata + nb + lda - cols to drop 
# The log message previously repeated the one from (d); it now names this run
# so the two are distinguishable in the log output.
logger.info("Getting metadata + naive bayes + LDA - dropped columns results")
# Columns excluded from this run.
cols_to_drop = [
    'dummy_cat_id_290', 'dummy_cat_id_300', 'dummy_cat_id_317','dummy_cat_id_386', 'dummy_cat_id_352', #'dummy_cat_id_1',
    'dummy_cat_id_355', 'dummy_cat_id_354', 'dummy_cat_id_321', 'dummy_cat_id_12', 'dummy_cat_id_340', 'dummy_cat_id_268', 'binned_usd_goal_outcome_mean'
]
stat_df_nb_lda_drop, pred_df_nb_lda_drop, models_nb_lda_drop = PipelineHelper.run_analyses(
    X_train_nb_lda.drop(columns=cols_to_drop), y_train, 
    X_test_nb_lda.drop(columns=cols_to_drop), y_test, model_params
)

2021-06-02 23:22:16 INFO     Getting metadata + naive bayes + LDA results
2021-06-02 23:22:17 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2021-06-02 23:22:27 INFO     Fitting lightgbm




2021-06-02 23:22:33 INFO     Fitting random forest


In [20]:
### g. Just on metadata + nb, scaled vars 
# Change the linear-model settings for the scaled runs. model_params is
# mutated in place, so every later cell also sees these values.
model_params['linear_models'].update({
    'logreg_penalty': 'l2',
    'logreg_C': 0.75,
    'ridge_alpha': 0.5,
    'lasso_alpha': 1,
})

_scaled_nb = PipelineHelper.scale_data(X_train_nb, X_test_nb)
X_train_nb_scale, X_test_nb_scale = _scaled_nb
logger.info("Getting metadata + naive bayes w/ scaled vars")
_nb_scale_run = PipelineHelper.run_analyses(
    X_train_nb_scale, y_train, X_test_nb_scale, y_test, model_params
)
stat_df_nb_scale, pred_df_nb_scale, models_nb_scale = _nb_scale_run

2021-06-02 23:23:47 INFO     Getting metadata + naive bayes w/ scaled vars
2021-06-02 23:23:47 INFO     Fitting linear models
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
2021-06-02 23:23:55 INFO     Fitting lightgbm




2021-06-02 23:24:00 INFO     Fitting random forest


In [21]:
### g. Just on metadata, scaled vars 
# Drops nb_proba from the already-scaled metadata + nb matrices, so this run
# uses the scaled metadata features only. The log message now says so
# (it previously claimed naive bayes was included).
logger.info("Getting metadata w/ scaled vars")
stat_df_scale, pred_df_scale, models_scale = PipelineHelper.run_analyses(
    X_train_nb_scale.drop(columns=['nb_proba']), y_train, 
    X_test_nb_scale.drop(columns=['nb_proba']), y_test, model_params
)

2021-06-02 23:24:53 INFO     Getting metadata + naive bayes w/ scaled vars
2021-06-02 23:24:53 INFO     Fitting linear models
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
2021-06-02 23:25:01 INFO     Fitting lightgbm




2021-06-02 23:25:05 INFO     Fitting random forest


In [22]:
### h. Metadata + nb + lda, scaled vars 
# Scale the metadata + nb + lda matrices, passing the LDA column names as
# additional columns to scale_data.
_scaled_nb_lda = PipelineHelper.scale_data(
    X_train_nb_lda, X_test_nb_lda, addtl_cols=lda_train.columns
)
X_train_nb_lda_scale, X_test_nb_lda_scale = _scaled_nb_lda
logger.info("Getting metadata + naive bayes + lda w/ scaled vars")
_nb_lda_scale_run = PipelineHelper.run_analyses(
    X_train_nb_lda_scale, y_train, X_test_nb_lda_scale, y_test, model_params
)
stat_df_nb_lda_scale, pred_df_nb_lda_scale, models_nb_lda_scale = _nb_lda_scale_run

2021-06-02 23:25:56 INFO     Getting metadata + naive bayes + lda w/ scaled vars
2021-06-02 23:25:56 INFO     Fitting linear models
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
2021-06-02 23:26:05 INFO     Fitting lightgbm




2021-06-02 23:26:13 INFO     Fitting random forest


In [23]:
### i. Metadata + nb + w2v, scaled vars 
# Scale the metadata + nb + w2v matrices, passing the Word2Vec column names
# as additional columns to scale_data.
_scaled_nb_w2v = PipelineHelper.scale_data(
    X_train_nb_w2v, X_test_nb_w2v, addtl_cols=w2v_train.columns
)
X_train_nb_w2v_scale, X_test_nb_w2v_scale = _scaled_nb_w2v
logger.info("Getting metadata + naive bayes + w2v w/ scaled vars")
_nb_w2v_scale_run = PipelineHelper.run_analyses(
    X_train_nb_w2v_scale, y_train, X_test_nb_w2v_scale, y_test, model_params
)
stat_df_nb_w2v_scale, pred_df_nb_w2v_scale, models_nb_w2v_scale = _nb_w2v_scale_run

2021-06-02 23:27:46 INFO     Getting metadata + naive bayes + w2v w/ scaled vars
2021-06-02 23:27:46 INFO     Fitting linear models
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
2021-06-02 23:28:03 INFO     Fitting lightgbm




2021-06-02 23:28:32 INFO     Fitting random forest


In [24]:
# Sentiment scores, keyed by row position so they can be joined onto the
# train/test identifier frames via un_id.
core_nlp_sentiment = pd.read_csv("data/res/sentiment_col.csv")
core_nlp_sentiment['un_id'] = np.arange(core_nlp_sentiment.shape[0])
core_nlp_train, core_nlp_test = (
    _ids.merge(core_nlp_sentiment, on="un_id", how="left")
    for _ids in (id_train, id_test)
)

In [25]:
### Metadata + nb + sentiment (unscaled)
# Header and log message were copied from the w2v cell; they now describe
# this run (metadata + naive bayes + sentiment column).
X_train_nb_nlp = X_train_nb.copy()
X_test_nb_nlp = X_test_nb.copy()
# core_nlp_train/test come from a left merge on id_train/id_test, so they keep
# the row order of the feature matrices but carry a fresh 0..n-1 index.
# Assign by position so the values cannot be misaligned (or turned into NaN)
# if the feature matrices use a different index.
X_train_nb_nlp['sentiment'] = core_nlp_train['sentiment'].to_numpy()
X_test_nb_nlp['sentiment'] = core_nlp_test['sentiment'].to_numpy()
logger.info("Getting metadata + naive bayes + sentiment results")
stat_df_nb_nlp, pred_df_nb_nlp, models_nb_nlp = PipelineHelper.run_analyses(
    X_train_nb_nlp, y_train, 
    X_test_nb_nlp, y_test, model_params
)

2021-06-02 23:35:00 INFO     Getting metadata + naive bayes + w2v w/ scaled vars
2021-06-02 23:35:00 INFO     Fitting linear models
  return linalg.solve(A, Xy, sym_pos=True,
2021-06-02 23:35:11 INFO     Fitting lightgbm




2021-06-02 23:35:17 INFO     Fitting random forest


In [26]:
# j. neural net (takes FOREVER so am just loading in one model on X_train_nb) 
# Loads a saved Keras model, rounds its test-set predictions to labels and
# builds a result row prefixed with the data and model names.
load_nn=True
if load_nn:
    import tensorflow as tf 
    from model_metrics import calculate_performance
    nn = tf.keras.models.load_model("data/nns/nn_batch5_epochs30_dim_25.tf")
    ypred = np.round(nn.predict(X_test_nb_scale).ravel())
    res = ["metadata_nb_scale", "KerasClassifier"] + calculate_performance(y_test, ypred)

In [27]:
# Show the numeric entries of the neural-net result row, rounded to 2 decimals.
# isinstance also filters out str subclasses (e.g. numpy string scalars),
# which an exact type comparison would pass on to np.round.
[np.round(i, 2) for i in res if not isinstance(i, str)]

[0.81, 0.85, 0.83, 0.78, 0.86, 0.74, 0.8, 0.19]

In [28]:
# Tag every result table with the name of the feature set it was fitted on.
_labelled_stats = [
    ("metadata", stat_df),
    ("metadata_nobin", stat_df_nobinusd),
    ("metadata_nb", stat_df_nb),
    ("metadata_nb_lda", stat_df_nb_lda),
    ("metadata_nb_w2v", stat_df_nb_w2v),
    ("metadata_nb_lda_drop", stat_df_nb_lda_drop),
    ("metadata_nb_scale", stat_df_nb_scale),
    ("metadata_nb_lda_scale", stat_df_nb_lda_scale),
    ("metadata_nb_w2v_scale", stat_df_nb_w2v_scale),
    ("metadata_nb_nlp", stat_df_nb_nlp),
    ("metadata_scale", stat_df_scale),
]
for _label, _stats in _labelled_stats:
    # DataFrame.insert raises ValueError when the column already exists;
    # skipping tagged frames makes this cell safe to re-run.
    if "data" not in _stats.columns:
        _stats.insert(0, "data", _label)

In [29]:
# Stack every result table, order by accuracy (best first), then add a
# 1-based accuracy rank and the random seed used for the runs.
_stat_frames = (
    stat_df, stat_df_scale, stat_df_nobinusd, stat_df_nb, stat_df_nb_lda,
    stat_df_nb_w2v, stat_df_nb_lda_drop, stat_df_nb_scale,
    stat_df_nb_lda_scale, stat_df_nb_w2v_scale, stat_df_nb_nlp,
)
_ranked = pd.concat(_stat_frames).sort_values('accuracy', ascending=False)
fin = _ranked.assign(
    accuracy_rank=np.arange(1, _ranked.shape[0] + 1, 1),
    random_state=model_params['rseed'],
)

In [30]:
# LaTeX table (with index) of the metadata_nb and metadata_nb_scale rows,
# rounded to 2 decimals.
_nb_rows = (fin.data == "metadata_nb") | (fin.data == "metadata_nb_scale")
_report_cols = ['data', 'model', 'accuracy', 'f1_score', 'precision_1', 'precision_0', 'recall_1', 'recall_0']
_report = np.round(fin[_nb_rows][_report_cols], 2)
print(_report.to_latex())

\begin{tabular}{lllrrrrrr}
\toprule
{} &               data &                   model &  accuracy &  f1\_score &  precision\_1 &  precision\_0 &  recall\_1 &  recall\_0 \\
\midrule
0 &        metadata\_nb &          LGBMClassifier &      0.83 &      0.85 &         0.89 &         0.75 &      0.81 &      0.84 \\
0 &  metadata\_nb\_scale &          LGBMClassifier &      0.83 &      0.85 &         0.89 &         0.75 &      0.81 &      0.84 \\
1 &        metadata\_nb &  RandomForestClassifier &      0.82 &      0.84 &         0.88 &         0.74 &      0.80 &      0.84 \\
1 &  metadata\_nb\_scale &  RandomForestClassifier &      0.81 &      0.84 &         0.88 &         0.73 &      0.80 &      0.84 \\
2 &        metadata\_nb &                   Ridge &      0.80 &      0.83 &         0.83 &         0.75 &      0.84 &      0.74 \\
2 &  metadata\_nb\_scale &                   Ridge &      0.80 &      0.83 &         0.83 &         0.75 &      0.84 &      0.74 \\
0 &        metadata\_nb &     

In [31]:
# LaTeX table (index column omitted) of the metadata_nb and metadata_nb_scale
# rows, rounded to 2 decimals.
_nb_rows = (fin.data == "metadata_nb") | (fin.data == "metadata_nb_scale")
_report_cols = ['data', 'model', 'accuracy', 'f1_score', 'precision_1', 'precision_0', 'recall_1', 'recall_0']
_report = np.round(fin[_nb_rows][_report_cols], 2)
print(_report.to_latex(index=False))

\begin{tabular}{llrrrrrr}
\toprule
             data &                  model &  accuracy &  f1\_score &  precision\_1 &  precision\_0 &  recall\_1 &  recall\_0 \\
\midrule
      metadata\_nb &         LGBMClassifier &      0.83 &      0.85 &         0.89 &         0.75 &      0.81 &      0.84 \\
metadata\_nb\_scale &         LGBMClassifier &      0.83 &      0.85 &         0.89 &         0.75 &      0.81 &      0.84 \\
      metadata\_nb & RandomForestClassifier &      0.82 &      0.84 &         0.88 &         0.74 &      0.80 &      0.84 \\
metadata\_nb\_scale & RandomForestClassifier &      0.81 &      0.84 &         0.88 &         0.73 &      0.80 &      0.84 \\
      metadata\_nb &                  Ridge &      0.80 &      0.83 &         0.83 &         0.75 &      0.84 &      0.74 \\
metadata\_nb\_scale &                  Ridge &      0.80 &      0.83 &         0.83 &         0.75 &      0.84 &      0.74 \\
      metadata\_nb &       LinearRegression &      0.80 &      0.83 &     