In [1]:
import pandas as pd 
import numpy as np 

import pickle
import logging
import nltk 
from nltk.corpus import stopwords

import re
import gensim 



In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [4]:
from sklearn.linear_model import SGDClassifier, LogisticRegression 

In [5]:
df = pd.read_parquet("data/all_processed_df.parquet.gzip")

In [6]:
RE_replace_space = re.compile('[/(){}\[\]\|@,;]')
RE_symbols_to_drop = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(txt):
    if txt is None: return ''
    txt = txt.lower()
    txt = RE_replace_space.sub(' ', txt)
    txt = RE_symbols_to_drop.sub('', txt)
    txt = ' '.join(word for word in txt.split() if word not in STOPWORDS)
    return txt 

df['blurb_cln'] = df['blurb'].apply(clean_text)

In [7]:
rseed = 229
X = df['blurb_cln']
y = np.where(df['state']=='successful', 1, 0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=rseed)

In [8]:
nb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))), 
    ('tfidf', TfidfTransformer()), 
    ('clf', MultinomialNB())
])
nb.fit(X_train, y_train)


y_pred = nb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
# ngrams: --> acc:
# (1,1)   --> 0.7
# (1,2)   --> 0.68
# (2,2)   --> 0.68 
# (1,3)   --> 0.67

# UNIGRAMS BEST PREDICTOR 

accuracy 0.6990960451977402
              precision    recall  f1-score   support

           0       0.71      0.41      0.52     17594
           1       0.69      0.89      0.78     26656

    accuracy                           0.70     44250
   macro avg       0.70      0.65      0.65     44250
weighted avg       0.70      0.70      0.68     44250



In [9]:
# W/O TF-IDF transformation  
nb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))), 
    #('tfidf', TfidfTransformer()), 
    ('clf', MultinomialNB())
])
nb.fit(X_train, y_train)


y_pred = nb.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))
# ngrams: --> acc:
# (1,1)   --> 0.71
# (1,2)   --> 0.72
# (2,2)   --> 0.69
# (1,3)   --> 0.72

# UNIGRAMS BEST PREDICTOR 

accuracy 0.7194124293785311
              precision    recall  f1-score   support

           0       0.69      0.53      0.60     17594
           1       0.73      0.84      0.78     26656

    accuracy                           0.72     44250
   macro avg       0.71      0.69      0.69     44250
weighted avg       0.72      0.72      0.71     44250



In [17]:
np.save("data/res/multi_nb_preds_train_opt.npy", nb.predict_proba(X_train))
np.save("data/res/multi_nb_preds_test_opt.npy", nb.predict_proba(X_test))

In [21]:
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()), 
    ('clf', SGDClassifier(penalty='l2', alpha=1e-3, random_state=rseed, class_weight='balanced'))
])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6498681732580037
              precision    recall  f1-score   support

           0       0.54      0.77      0.64     26428
           1       0.79      0.57      0.66     39947

    accuracy                           0.65     66375
   macro avg       0.67      0.67      0.65     66375
weighted avg       0.69      0.65      0.65     66375



In [25]:
logreg = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()), 
    ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=2000))
    
])

logreg.fit(X_train, y_train)
y_pred = logreg.fit(X_test)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


ValueError: This LogisticRegression estimator requires y to be passed, but the target y is None.

In [10]:
from gensim.models import Word2Vec

wv = gensim.models.KeyedVectors.load_word2vec_format("/Users/benji/Downloads/GoogleNews-vectors-negative300.bin", binary=True)

In [8]:
wv.get_normed_vectors()

array([[ 0.06731994, -0.05344657,  0.01899059, ..., -0.09324721,
        -0.00733469, -0.00514565],
       [ 0.05295623,  0.06545979,  0.06619529, ..., -0.03585578,
         0.01089464, -0.04707221],
       [-0.00851202, -0.03422449,  0.03228392, ...,  0.05151315,
        -0.02522733,  0.01746507],
       ...,
       [-0.0200327 , -0.09257349, -0.01978385, ..., -0.01667318,
        -0.01368694,  0.04753101],
       [ 0.02795097, -0.02753379,  0.03087122, ..., -0.00756136,
         0.02304912,  0.01626997],
       [ 0.03380508, -0.03380508, -0.00294652, ...,  0.05957004,
         0.05408813,  0.00973038]], dtype=float32)

In [11]:
def word_averaging(wv, words):
    all_words, mean = set(), []
    
    for wrd in words: 
        if isinstance(wrd, np.ndarray):
            mean.append(wrd)
        elif wrd in wv.key_to_index:
            mean.append(wv.get_vector(wrd, norm=True))
            all_words.add(wv.key_to_index[wrd])
    
    if not mean: 
        logging.warning("cannot compute similarity with no input %s", words)
        return np.zeros(wv.vector_size, )
    mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)
    return mean

def word_averaging_list(wv, text_list):
    return np.vstack( [word_averaging(wv, post) for post in text_list ])

In [16]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.3, random_state=rseed)
#X_train, X_lnom, y_train, y_lnom = train_test_split(df, y, test_size=0.3, random_state=rseed)
#X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.25, random_state=rseed)

In [17]:
train_tokenized = X_train.apply( lambda r: w2v_tokenize_text(r['blurb_cln']), axis=1).values
test_tokenized = X_test.apply( lambda r: w2v_tokenize_text(r['blurb_cln']), axis=1).values

In [18]:
X_train_word_average = word_averaging_list(wv, train_tokenized)
X_test_word_average = word_averaging_list(wv, test_tokenized)



In [34]:
X_train_word_average[0:5, :]

array([[-0.04454321, -0.05306488, -0.04265701, ...,  0.00134691,
         0.03013412, -0.01988591],
       [ 0.05001489, -0.00681169,  0.00947347, ..., -0.01200528,
        -0.01470918, -0.01058321],
       [ 0.05706745, -0.01788912, -0.03405837, ..., -0.02279961,
         0.03940886,  0.00274279],
       [ 0.0198833 ,  0.03409994,  0.00031001, ...,  0.03845909,
         0.02085519,  0.04676888],
       [ 0.03569921,  0.03855181,  0.0583232 , ..., -0.03728773,
         0.03405387,  0.02881503]])

In [19]:
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg = logreg.fit(X_train_word_average, y_train)
y_pred = logreg.predict(X_test_word_average)
print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy 0.6898531073446328
              precision    recall  f1-score   support

           0       0.64      0.50      0.56     26428
           1       0.71      0.81      0.76     39947

    accuracy                           0.69     66375
   macro avg       0.68      0.66      0.66     66375
weighted avg       0.68      0.69      0.68     66375



In [38]:
np.save("data/res/w2v_Xtrain_avg.npy", X_train_word_average)
np.save("data/res/w2v_Xtest_avg.npy", X_test_word_average)

In [20]:
np.save("data/res/w2v_Xtrain_avg_big.npy", X_train_word_average)
np.save("data/res/w2v_Xtest_avg_big.npy", X_test_word_average)