In [1]:
import numpy as np 
import pandas as pd
import re
from nltk.corpus import stopwords

In [2]:
import nltk

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
df = pd.read_parquet("data/all_processed_df.parquet.gzip")

In [5]:
df['outcome'] = (df['state']=='successful').astype(int)
df.outcome.value_counts()

1    132754
0     88494
Name: outcome, dtype: int64

In [6]:
RE_replace_space = re.compile('[/(){}\[\]\|@,;]')
RE_symbols_to_drop = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(txt):
    if txt is None: return ''
    txt = str(txt)
    txt = txt.lower()
    txt = RE_replace_space.sub(' ', txt)
    txt = RE_symbols_to_drop.sub('', txt)
    txt = ' '.join(word for word in txt.split() if word not in STOPWORDS)
    return txt 

df['blurb_cln'] = df['blurb'].apply(clean_text)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df, df['outcome'], random_state=229, test_size=0.3)

In [8]:
df['blurb_cln']

0       humble little astro traveling around brewing f...
1       spooky ghost miniatures tabletop gaming rpg dn...
2       part dreamcatcher comics starting gaming depar...
3       write dd gaming modules make game mastering ex...
4       wander creates high quality handcrafted functi...
                              ...                        
1628    david griswold eliza reisfeld proud present fi...
1633    jeremy clark one toptier traditional comic boo...
1640    hugs bugs cleverly written illustrated moral t...
1662    biker club formed ashes apocalypse ldmc fights...
1736                                 luxury playing cards
Name: blurb_cln, Length: 221248, dtype: object

In [9]:
def tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [10]:
train_tokenized = X_train.apply( lambda r: tokenize_text(r['blurb_cln']), axis=1).values
test_tokenized = X_test.apply( lambda r: tokenize_text(r['blurb_cln']), axis=1).values

In [11]:
import gensim



In [12]:
def bigrams(words, bi_min=15, tri_min=10):
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod
def get_corpus(words):
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]
    id2word = gensim.corpora.Dictionary(bigram)
    id2word.filter_extremes(no_below=10, no_above=0.35)
    id2word.compactify()
    corpus = [id2word.doc2bow(text) for text in bigram]
    
    return corpus, id2word, bigram

In [13]:
train_corpus, train_id2word, bigram_train = get_corpus(train_tokenized)

In [52]:
n_topics = 10
lda_train = gensim.models.ldamulticore.LdaMulticore(
                       corpus=train_corpus,
                       num_topics=n_topics,
                       id2word=train_id2word,
                       chunksize=100,
                       workers=7, # Num. Processing Cores - 1
                       passes=50,
                       eval_every = 1,
                       per_word_topics=True, 
    random_state=229
)
#lda_train.save("data/lda/lda_train20.model")

In [40]:
n_topics=20
lda_train = gensim.models.LdaModel.load("data/lda/lda_train20.model")

In [53]:
lda_train.print_topics(num_topics=n_topics)

[(0,
  '0.017*"series" + 0.013*"set" + 0.012*"inspired" + 0.011*"two" + 0.009*"short_film" + 0.009*"enamel_pins" + 0.009*"collection" + 0.008*"story" + 0.008*"film" + 0.008*"world"'),
 (1,
  '0.014*"made" + 0.010*"unique" + 0.010*"using" + 0.009*"design" + 0.009*"en" + 0.009*"designed" + 0.007*"real" + 0.007*"hand" + 0.006*"handmade" + 0.006*"100"'),
 (2,
  '0.009*"food" + 0.009*"home" + 0.006*"natural" + 0.006*"local" + 0.005*"place" + 0.005*"water" + 0.005*"light" + 0.005*"community" + 0.005*"using" + 0.005*"products"'),
 (3,
  '0.027*"game" + 0.011*"new" + 0.010*"play" + 0.009*"games" + 0.007*"characters" + 0.006*"world" + 0.006*"worlds" + 0.006*"action" + 0.006*"meets" + 0.005*"based"'),
 (4,
  '0.021*"art" + 0.013*"project" + 0.012*"artists" + 0.012*"new" + 0.011*"dance" + 0.009*"work" + 0.009*"music" + 0.008*"release" + 0.008*"show" + 0.007*"community"'),
 (5,
  '0.036*"de" + 0.017*"la" + 0.009*"un" + 0.008*"jazz" + 0.007*"que" + 0.007*"et" + 0.007*"tool" + 0.006*"comes" + 0.006*

In [54]:
train_vecs = []
for i in range(len(train_tokenized)):
    top_topics = (
        lda_train.get_document_topics(train_corpus[i],
                                      minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(n_topics)]
    train_vecs.append(topic_vec)

In [55]:
lda_df = pd.DataFrame(train_vecs)
lda_df.columns = ["lda_df_topic" + str(i) for i in lda_df.columns]

In [25]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [26]:
from sklearn.linear_model import LogisticRegression, SGDClassifier, LinearRegression

In [56]:
scaler = StandardScaler()
lda_df_scale = scaler.fit_transform(lda_df)

In [57]:
clf = LogisticRegression(class_weight='balanced').fit(lda_df, y_train)
clf_scale = LogisticRegression(class_weight='balanced').fit(lda_df_scale, y_train)

In [58]:
print(classification_report(y_train, clf.predict(lda_df)))
print(classification_report(y_train, clf_scale.predict(lda_df_scale)))

              precision    recall  f1-score   support

           0       0.47      0.62      0.53     62066
           1       0.68      0.53      0.60     92807

    accuracy                           0.57    154873
   macro avg       0.57      0.57      0.56    154873
weighted avg       0.59      0.57      0.57    154873

              precision    recall  f1-score   support

           0       0.47      0.62      0.53     62066
           1       0.68      0.53      0.60     92807

    accuracy                           0.57    154873
   macro avg       0.57      0.57      0.56    154873
weighted avg       0.59      0.57      0.57    154873



In [59]:
sgd = SGDClassifier(
    max_iter=1000,
    tol=1e-3,
    loss='log',
    class_weight='balanced'
).fit(lda_df, y_train)
print(classification_report(y_train, sgd.predict(lda_df)))

              precision    recall  f1-score   support

           0       0.48      0.50      0.49     62066
           1       0.66      0.64      0.65     92807

    accuracy                           0.59    154873
   macro avg       0.57      0.57      0.57    154873
weighted avg       0.59      0.59      0.59    154873



In [60]:
def get_bigram(words):
    bigram = bigrams(words)
    bigram = [bigram[review] for review in words]
    return bigram
  
bigram_test = get_bigram(test_tokenized)

test_corpus = [train_id2word.doc2bow(text) for text in bigram_test]

In [61]:
test_vecs = []
for i in range(len(test_tokenized)):
    top_topics = (
            lda_train.get_document_topics(test_corpus[i],
                                          minimum_probability=0.0)
    )
    topic_vec = [top_topics[i][1] for i in range(n_topics)]
    test_vecs.append(topic_vec)

In [62]:
lda_df_test = pd.DataFrame(test_vecs)
lda_df_test.columns = ["lda_df_topic" + str(i) for i in lda_df_test.columns]

In [63]:
print(classification_report(y_test, clf.predict(lda_df_test)))
print(classification_report(y_test, sgd.predict(lda_df_test)))

              precision    recall  f1-score   support

           0       0.47      0.62      0.53     26428
           1       0.68      0.54      0.60     39947

    accuracy                           0.57     66375
   macro avg       0.57      0.58      0.57     66375
weighted avg       0.60      0.57      0.57     66375

              precision    recall  f1-score   support

           0       0.48      0.50      0.49     26428
           1       0.66      0.65      0.65     39947

    accuracy                           0.59     66375
   macro avg       0.57      0.57      0.57     66375
weighted avg       0.59      0.59      0.59     66375



In [77]:
lda_df.to_csv("data/lda_df.csv")
lda_df_test.to_csv("data/lda_df_test.csv")