In [1]:
import logging
import os
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
import gensim
from gensim.models import Word2Vec

# Pretrained GoogleNews word2vec vectors, stored in the binary word2vec format.
# The location can be overridden with the WORD2VEC_PATH environment variable so the
# notebook is not tied to a single machine; the fallback is the original local path.
W2V_PATH = os.environ.get(
    "WORD2VEC_PATH",
    "/Users/benji/Downloads/GoogleNews-vectors-negative300.bin",
)
wv = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=True)



In [4]:
# Load the preprocessed dataset; later cells read its 'blurb' and 'state' columns.
df = pd.read_parquet("data/all_processed_df.parquet.gzip")

In [5]:
# Raw strings keep the backslash escapes intact for the regex engine; the non-raw
# form relied on invalid string escapes (\[ \] \|), which emit warnings on modern Python.
# Separator characters: every match is replaced by a single space.
RE_replace_space = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase ASCII letters, space, '#', '+' and '_' is removed.
RE_symbols_to_drop = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, held as a set for the membership test in clean_text.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
STOPWORDS = set(stopwords.words('english'))

def clean_text(txt):
    """Normalize a raw blurb into a lowercase, stop-word-free string.

    Steps: lowercase, replace separator characters (RE_replace_space) with
    spaces, delete any remaining disallowed symbols (RE_symbols_to_drop), then
    drop tokens found in STOPWORDS and re-join the rest with single spaces.

    Parameters
    ----------
    txt : str or None
        Raw text. Any non-string value (None, float NaN from pandas, ...) is
        treated as missing.

    Returns
    -------
    str
        The cleaned text, or '' when the input is missing or not a string.
    """
    # Missing values may arrive as None or as float NaN; neither has .lower().
    if not isinstance(txt, str):
        return ''
    txt = txt.lower()
    txt = RE_replace_space.sub(' ', txt)
    txt = RE_symbols_to_drop.sub('', txt)
    return ' '.join(word for word in txt.split() if word not in STOPWORDS)

# Store the cleaned text in a new column; the raw 'blurb' column is left as-is.
df['blurb_cln'] = df['blurb'].apply(clean_text)

In [6]:
# Seed passed as random_state to both train_test_split calls below.
rseed = 229

In [7]:
# Binary target: 1 where state == 'successful', 0 for every other state.
y = np.where(df['state']=='successful', 1, 0)
# use same dfs as we use in the other model
# First split: 30% of all rows are set aside as X_lnom / y_lnom.
X_train, X_lnom, y_train, y_lnom = train_test_split(df, y, test_size=0.3, random_state=rseed)
# Second split rebinds X_train / y_train: 25% of the remaining 70% becomes the test set
# (17.5% of all rows), leaving 52.5% of all rows for training.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=rseed)

In [8]:
## try out naive bayes
# Baseline text model: token counts -> TF-IDF weighting -> multinomial naive Bayes.
nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train['blurb_cln'], y_train)

# Evaluate on the test split.
y_pred = nb.predict(X_test['blurb_cln'])
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.6957049510576203
              precision    recall  f1-score   support

           0       0.72      0.40      0.51     15514
           1       0.69      0.90      0.78     23205

    accuracy                           0.70     38719
   macro avg       0.70      0.65      0.65     38719
weighted avg       0.70      0.70      0.67     38719



In [10]:
# Persist the naive Bayes predicted class probabilities for the test and train splits.
# np.save does not create missing directories, so make sure the output folder exists.
os.makedirs("data/res", exist_ok=True)
np.save("data/res/multi_nb_preds_test.npy", nb.predict_proba(X_test['blurb_cln']))
np.save("data/res/multi_nb_preds_train.npy", nb.predict_proba(X_train['blurb_cln']))

In [10]:
def word_averaging(wv, words):
    """Average the vectors of ``words`` into a single unit-length vector.

    Parameters
    ----------
    wv : gensim KeyedVectors-like
        Must expose ``key_to_index``, ``get_vector(word, norm=True)`` and
        ``vector_size``.
    words : iterable
        Tokens to look up. Items that are already ``np.ndarray`` are used as
        vectors directly; tokens missing from ``wv.key_to_index`` are skipped.

    Returns
    -------
    np.ndarray
        float32 vector: the unit-normalized mean of the collected vectors, or
        an all-zero vector of length ``wv.vector_size`` (after logging a
        warning) when no vector could be collected.
    """
    vectors = []
    for wrd in words:
        if isinstance(wrd, np.ndarray):
            vectors.append(wrd)
        elif wrd in wv.key_to_index:
            vectors.append(wv.get_vector(wrd, norm=True))

    if not vectors:
        logging.warning("cannot compute similarity with no input %s", words)
        # Same dtype as the normal path, so stacking results never silently
        # upcasts the whole feature matrix to float64.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Build a matrix with one averaged word vector per tokenized text.

    Each element of ``text_list`` is passed to ``word_averaging`` together with
    ``wv``; the resulting vectors are stacked row-wise in input order.
    """
    doc_vectors = []
    for tokens in text_list:
        doc_vectors.append(word_averaging(wv, tokens))
    return np.vstack(doc_vectors)

def w2v_tokenize_text(text):
    """Split ``text`` into word tokens, discarding tokens shorter than 2 characters.

    The text is first split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize`` (both with
    ``language='english'``). Tokens are returned in order of appearance.
    """
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if len(word) >= 2
    ]

In [11]:
# Tokenize the cleaned blurbs; each result is an object array holding one token list per row.
# Applying the tokenizer to the column directly avoids constructing a Series for every
# row, which a row-wise DataFrame.apply(axis=1) would do.
test_tokenized = X_test['blurb_cln'].apply(w2v_tokenize_text).values
train_tokenized = X_train['blurb_cln'].apply(w2v_tokenize_text).values

In [None]:
# One averaged word2vec vector per blurb, stacked row-wise into a feature matrix
# (word_averaging_list uses np.vstack), for the train and test splits.
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)

In [14]:
# Logistic regression on the averaged word2vec features; fit() returns the
# estimator itself, so construction and fitting are chained.
logreg = LogisticRegression(n_jobs=1, C=1e5).fit(X_train_word_average, y_train)

# Evaluate on the test split.
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

accuracy 0.6854774141894161
              precision    recall  f1-score   support

           0       0.64      0.50      0.56     15514
           1       0.71      0.81      0.76     23205

    accuracy                           0.69     38719
   macro avg       0.67      0.65      0.66     38719
weighted avg       0.68      0.69      0.68     38719



In [15]:
# Persist the averaged word2vec feature matrices for the train and test splits.
# np.save does not create missing directories, so make sure the output folder exists.
os.makedirs("data/res", exist_ok=True)
np.save("data/res/w2v_Xtrain_avg.npy", X_train_word_average)
np.save("data/res/w2v_Xtest_avg.npy", X_test_word_average)