In [1]:
import pandas as pd
import numpy as np
import pickle
import logging
import json
import sklearn
from tqdm import tqdm
from pycorenlp import StanfordCoreNLP

from model_metrics import format_results
import data_clean_for_model
import PipelineHelper



In [2]:
nlp = StanfordCoreNLP('http://localhost:9000')

In [2]:
logging.basicConfig(level = logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
### 1. Load Data
df = pd.read_parquet("data/all_processed_df.parquet.gzip")

In [4]:
k = 5
rseed = 229
df["outcome"] = np.where( df["state"]=="successful", 1, 0, )
df["un_id"] = np.arange(0, df.shape[0], 1 )
df["name_len"] = df["name"].str.len()
df["cv_group"] = np.random.choice( np.arange(0, k), size=df.shape[0] )
df["binned_usd_goal"] = pd.qcut( np.log(df["usd_goal"]+1), 20 )

with open("model_config.json", 'r') as j:
     model_params = json.loads(j.read())
model_params['naive_bayes']['ngram_range'] = tuple(model_params['naive_bayes']['ngram_range'])

In [5]:
## load project metadata
logger.info("Loading features")
try:
    f = open("data/features.pkl", "rb")
    ft_dict = pickle.load(f)
    f.close()
    X_train, y_train, X_test, y_test = ft_dict.values()
except:
    X_train, X_test, y_train, y_test = data_clean_for_model.data_clean_for_model(df, "outcome", model_params, cv=model_params["cv"])

INFO:__main__:Loading features


In [6]:
# load text
logger.info("Processing text data")
blurb_train, blurb_test, _, _    = data_clean_for_model.process_blurb(df, model_params)

INFO:__main__:Processing text data


In [76]:
df['sentiment'] = np.empty(df.shape[0])
for i in tqdm(range(len(df.blurb.index))):
    if not isinstance(df.blurb.iloc[i], str):
        df.sentiment.iloc[i] = 2
        continue
    result = nlp.annotate(df.blurb.iloc[i],
                       properties={
                           'annotators': 'sentiment,',
                           'outputFormat': 'json',
                       })
    total = 0
    numSentences = 0
    for s in result['sentences']:
        total += int(s['sentimentValue'])
        numSentences += 1
    df.sentiment.iloc[i] = total/numSentences
print(df.sentiment)

100%|██████████| 221248/221248 [1:19:40<00:00, 46.28it/s]

index
0       1.0
1       2.0
2       1.5
3       1.5
4       2.0
       ... 
1628    3.0
1633    2.5
1640    3.0
1662    1.0
1736    2.0
Name: sentiment, Length: 221248, dtype: float64





In [77]:
cols = ["sentiment"]
df.to_csv('sentiment_col.csv', columns = cols)

In [7]:
## 2. Run text models

try: 
    f = open("data/res/text_models.pkl", "rb")
    text_models = pickle.load(f)
    f.close()
except:
    raise Warning("Text models do not exist. Will load from scratch")

In [8]:
# get naive bayes predictions
logger.info("Loading Naive Bayes predictions")
try:
    #nb_proba_train = np.load("data/res/multi_nb_preds_train.npy")
    #nb_proba_test = np.load("data/res/multi_nb_preds_test.npy")
    nb_proba_train, nb_proba_test = text_models['nb_train'], text_models['nb_test']
except:
    logger.info("Running Naive Bayes model")
    nb_params = model_params['naive_bayes']
    nb_train_pred, nb_proba_train, nb_test_pred, nb_proba_test = PipelineHelper.naive_bayes_predictions(
        blurb_train, y_train, blurb_test,
        tfidf=nb_params['tf-idf'], ngram_range=nb_params['ngram_range']
    )
    np.save("data/res/multi_nb_preds_train.npy", nb_proba_train)
    np.save("data/res/multi_nb_preds_test.npy", nb_proba_test)

# get LDA topic model
logger.info("Loading LDA topic predictions")
try:
    lda_train, lda_test = text_models['lda_train'], text_models['lda_test']
    #lda_train = pd.read_csv("data/res/lda_train.csv").drop(columns=['Unnamed: 0'])
    #lda_test = pd.read_csv("data/res/lda_test.csv").drop(columns=['Unnamed: 0'])
except:
    logger.info("Running LDA topic model")
    lda_params = model_params['lda']
    tokenized_train = blurb_train.apply(data_clean_for_model.tokenize_text)
    tokenized_test = blurb_test.apply(data_clean_for_model.tokenize_text)
    lda_train, lda_test = PipelineHelper.train_lda_model(tokenized_train, tokenized_test, params['lda'])
    lda_train.to_csv("data/res/lda_train.csv")
    lda_test.to_csv("data/res/lda_test.csv")

# get Word2Vec model predictions
logger.info("Loading Word2Vec dimension predictions")
try:
    #f = open("data/res/w2v_dict.pkl", "rb")
    #w2v_dict = pickle.load(f)
    #f.close()
    #w2v_train, w2v_test = w2v_dict.values()
    w2v_train, w2v_test = text_models['w2v_train'], text_models['w2v_test']
except:
    raise Warning("Word2Vec function not implemented. Running without it -- likely will crash.")

INFO:__main__:Loading Naive Bayes predictions
INFO:__main__:Loading LDA topic predictions
INFO:__main__:Loading Word2Vec dimension predictions


In [9]:
### a. Just on metadata
logger.info("Getting metadata results")
stat_df, pred_df, models = PipelineHelper.run_analyses(X_train, y_train, X_test, y_test, model_params)

INFO:__main__:Getting metadata results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [10]:
### b. Just on metadata, - binned_usd_goal_outcome_mean
logger.info("Getting metadata - binned_usd_goal_outcome_mean results")
stat_df_nobinusd, pred_df_nobinusd, models_nobinusd = PipelineHelper.run_analyses(
    X_train.drop(columns=['binned_usd_goal_outcome_mean']), y_train, 
    X_test.drop(columns=['binned_usd_goal_outcome_mean']), y_test, model_params)

INFO:__main__:Getting metadata - binned_usd_goal_outcome_mean results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [11]:
### c. Just on metadata + nb 
logger.info("Getting metadata + naive bayes results")
X_train_nb = X_train.copy()
X_test_nb = X_test.copy()
# NB 
X_train_nb['nb_proba'] = nb_proba_train[:, 1]
X_test_nb['nb_proba'] = nb_proba_test[:, 1]
stat_df_nb, pred_df_nb, models_nb = PipelineHelper.run_analyses(X_train_nb, y_train, X_test_nb, y_test, model_params)

INFO:__main__:Getting metadata + naive bayes results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [12]:
### d. Just on metadata + nb + lda
logger.info("Getting metadata + naive bayes + LDA results")
X_train_nb_lda = pd.concat((X_train_nb, lda_train), axis=1)
X_test_nb_lda = pd.concat((X_test_nb, lda_test), axis=1)
stat_df_nb_lda, pred_df_nb_lda, models_nb_lda = PipelineHelper.run_analyses(X_train_nb_lda, y_train, X_test_nb_lda, y_test, model_params)

INFO:__main__:Getting metadata + naive bayes + LDA results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [13]:
### e. Just on metadata + nb + w2v
logger.info("Getting metadata + naive bayes + w2v results")
X_train_nb_w2v = pd.concat((X_train_nb, pd.DataFrame(w2v_train)), axis=1)
X_test_nb_w2v = pd.concat((X_test_nb, pd.DataFrame(w2v_test)), axis=1)
stat_df_nb_w2v, pred_df_nb_w2v, models_nb_w2v = PipelineHelper.run_analyses(X_train_nb_w2v, y_train, X_test_nb_w2v, y_test, model_params)

INFO:__main__:Getting metadata + naive bayes + w2v results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [14]:
### f. Just on metadata + nb + lda - cols to drop 
logger.info("Getting metadata + naive bayes + LDA results")
cols_to_drop = [
    'dummy_cat_id_290', 'dummy_cat_id_300', 'dummy_cat_id_317','dummy_cat_id_386', 'dummy_cat_id_352', #'dummy_cat_id_1',
    'dummy_cat_id_355', 'dummy_cat_id_354', 'dummy_cat_id_321', 'dummy_cat_id_12', 'dummy_cat_id_340', 'dummy_cat_id_268', 'binned_usd_goal_outcome_mean'
]
stat_df_nb_lda_drop, pred_df_nb_lda_drop, models_nb_lda_drop = PipelineHelper.run_analyses(
    X_train_nb_lda.drop(columns=cols_to_drop), y_train, 
    X_test_nb_lda.drop(columns=cols_to_drop), y_test, model_params
)

INFO:__main__:Getting metadata + naive bayes + LDA results
INFO:PipelineHelper:Fitting linear models
  overwrite_a=True).T
  "Setting penalty='none' will ignore the C and l1_ratio "
INFO:PipelineHelper:Fitting lightgbm




INFO:PipelineHelper:Fitting random forest
INFO:PipelineHelper:Fitting SVM


In [15]:
stat_df.insert(0, "data", "metadata"), 
stat_df_nobinusd.insert(0, "data", "metadata_nobin"), 
stat_df_nb.insert(0, "data", "metadata_nb"),
stat_df_nb_lda.insert(0, "data", "metadata_nb_lda"),
stat_df_nb_w2v.insert(0, "data", "metadata_nb_w2v"),
stat_df_nb_lda_drop.insert(0, "data", "metadata_nb_lda_drop")

In [16]:
fin = (pd.concat((stat_df, stat_df_nobinusd, stat_df_nb, stat_df_nb_lda, stat_df_nb_w2v, stat_df_nb_lda_drop))
       .sort_values('accuracy', ascending=False)
       .assign(
           accuracy_rank = lambda x:np.arange(1, x.shape[0]+1, 1), 
           random_state = model_params['rseed']
       )
      )
fin.head()

Unnamed: 0,data,model,accuracy,f1_score,precision_1,precision_0,recall_1,recall_0,roc_auc,brier,accuracy_rank,random_state
0,metadata_nb,LGBMClassifier,0.825431,0.848953,0.885703,0.750599,0.81513,0.841002,0.828066,0.174569,1,229
0,metadata_nb_lda_drop,LGBMClassifier,0.823443,0.847502,0.882493,0.749517,0.81518,0.835932,0.825556,0.176557,2,229
0,metadata_nb_w2v,LGBMClassifier,0.823292,0.849325,0.872308,0.758067,0.827521,0.816899,0.82221,0.176708,3,229
0,metadata_nb_lda,LGBMClassifier,0.822885,0.846587,0.884252,0.74707,0.812001,0.839337,0.825669,0.177115,4,229
1,metadata_nb,RandomForestClassifier,0.814945,0.838926,0.880942,0.73524,0.800736,0.836423,0.81858,0.185055,5,229


In [17]:
fin.loc[fin['model'] == "SVMClassifier"]

Unnamed: 0,data,model,accuracy,f1_score,precision_1,precision_0,recall_1,recall_0,roc_auc,brier,accuracy_rank,random_state
0,metadata_nb_lda,SVMClassifier,0.798825,0.827674,0.854209,0.726721,0.802739,0.792909,0.797824,0.201175,14,229
0,metadata_nb,SVMClassifier,0.798644,0.827746,0.85309,0.727317,0.803865,0.790752,0.797309,0.201356,16,229
0,metadata_nb_lda_drop,SVMClassifier,0.798478,0.827917,0.851626,0.728246,0.805492,0.787876,0.796684,0.201522,17,229
0,metadata_nb_w2v,SVMClassifier,0.79664,0.829674,0.836493,0.73879,0.822965,0.756849,0.789907,0.20336,22,229
0,metadata_nobin,SVMClassifier,0.783473,0.808,0.866331,0.69156,0.757028,0.823445,0.790236,0.216527,28,229
0,metadata,SVMClassifier,0.783292,0.807014,0.869546,0.689443,0.752873,0.829272,0.791072,0.216708,30,229


In [39]:
pred_df.columns = "metadata_" + pred_df.columns
pred_df_nobinusd.columns = "metadata_nobin_" + pred_df_nobinusd.columns
pred_df_nb.columns = "metadata_nb_" + pred_df_nb.columns
pred_df_nb_lda.columns = "metadata_nb_lda_" + pred_df_nb_lda.columns
pred_df_nb_w2v.columns = "metadata_nb_w2v_" + pred_df_nb_w2v.columns
pred_df_nb_lda_drop.columns = "metadata_nb_lda_drop_" + pred_df_nb_lda_drop.columns
pred_fin = pd.concat((pred_df, pred_df_nobinusd, pred_df_nb, pred_df_nb_lda, pred_df_nb_w2v, pred_df_nb_lda_drop), axis=1)

Unnamed: 0,metadata_metadata_LinearRegression_pred,metadata_metadata_Lasso_pred,metadata_metadata_Ridge_pred,metadata_metadata_LogisticRegression_pred,metadata_metadata_LGBMClassifier_pred,metadata_metadata_RandomForestClassifier_pred,metadata_nobin_LinearRegression_pred,metadata_nobin_Lasso_pred,metadata_nobin_Ridge_pred,metadata_nobin_LogisticRegression_pred,...,metadata_nb_w2v_Ridge_pred,metadata_nb_w2v_LogisticRegression_pred,metadata_nb_w2v_LGBMClassifier_pred,metadata_nb_w2v_RandomForestClassifier_pred,metadata_nb_lda_drop_LinearRegression_pred,metadata_nb_lda_drop_Lasso_pred,metadata_nb_lda_drop_Ridge_pred,metadata_nb_lda_drop_LogisticRegression_pred,metadata_nb_lda_drop_LGBMClassifier_pred,metadata_nb_lda_drop_RandomForestClassifier_pred
0,0.6554,0.832675,0.651824,1,1,1,0.655646,0.832675,0.652055,1,...,0.761006,1,1,1,0.800469,0.832675,0.803503,1,1,1
1,0.144957,0.473552,0.144069,1,0,0,0.138541,0.473552,0.137552,1,...,0.136281,1,0,0,0.07207,0.473552,0.079891,1,0,0
2,0.488107,0.425906,0.488423,1,0,0,0.479006,0.425906,0.479338,1,...,0.577682,1,1,1,0.581687,0.425906,0.579625,1,1,0
3,1.036096,0.832221,1.037228,1,1,1,1.027969,0.832221,1.028984,1,...,1.201803,1,1,1,1.107628,0.832221,1.102822,1,1,1
4,0.15473,0.290907,0.154283,0,0,0,0.192137,0.290907,0.191644,0,...,0.140261,0,0,0,0.27818,0.290907,0.279277,0,0,0
