In [116]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# processing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# modeling
from sklearn.linear_model import LogisticRegressionCV

# others
import warnings
warnings.filterwarnings('ignore')


# nlp
import string
from gensim import models
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\costa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\costa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\costa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Homework 4

In [99]:
df = pd.read_csv('data/reddit_200k_train.csv', encoding = 'latin-1', index_col='Unnamed: 0')
test = pd.read_csv('data/reddit_200k_test.csv', encoding = 'latin-1', index_col='Unnamed: 0')

# subset the columns
df['removed'] = df.REMOVED
df = df[['body', 'removed']]

test['removed'] = test.REMOVED
test = test[['body', 'removed']]

In [6]:
df.head(2)

Unnamed: 0,body,removed
1,I've always been taught it emerged from the ea...,False
2,"As an ECE, my first feeling as ""HEY THAT'S NOT...",True


# Task 1: Bag of Words and Simple Features

### 1.1 Baseline Model

In [23]:
cv = CountVectorizer()
X_train_base = cv.fit_transform(df.body)
y_train = np.where(df.removed, 1, 0)

X_test_base = cv.transform(test.body)
y_test = np.where(test.removed, 1, 0)

In [6]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_base, y_train)
baseline_train_score = lr.score(X_train_base, y_train)
baseline_test_score = lr.score(X_test_base, y_test)

In [7]:
print('Baseline model achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(baseline_train_score), 2)))

Baseline model achieves a mean of 0.75 ROC-AUC on our training data.


In [8]:
print('Baseline model achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(baseline_test_score), 2)))

Baseline model achieves a mean of 0.75 ROC-AUC on our test data.


In [9]:
bot10 = np.array(cv.get_feature_names())[np.argsort(lr.coef_[0])[:10]]
top10 = np.array(cv.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:10]]

In [10]:
bot10

array(['iâ', 'itâ', 'donâ', 'http', 'edit', 'does', 'www', 'link', 'org',
       'com'], dtype='<U252')

In [11]:
top10

array(['fuck', 'comments', 'removed', 'shit', 'women', 'oh', 'weed', 'my',
       'comment', 'let'], dtype='<U252')

### 1.2 Processing

#### 1.2.1 Using lemmatization

We want to try using lemmatization with the count vectorizer, which will help reduce the number of features. 

In [57]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

lem = CountVectorizer(tokenizer = LemmaTokenizer())
X_train_lem = lem.fit_transform(df.body)
X_test_lem = lem.transform(test.body)

In [13]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_lem, y_train)

In [14]:
lem_train_score = lr.score(X_train_lem, y_train)
lem_test_score = lr.score(X_test_lem, y_test)

In [15]:
print('Model with lemmatization achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(lem_train_score), 2)))

Model with lemmatization achieves a mean of 0.72 ROC-AUC on our training data.


In [16]:
print('Model with lemmatization achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(lem_test_score), 2)))

Model with lemmatization achieves a mean of 0.72 ROC-AUC on our test data.


In [17]:
bot10 = np.array(lem.get_feature_names())[np.argsort(lr.coef_[0])[:10]]
top10 = np.array(lem.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:10]]

In [18]:
bot10

array(['?', ':', 'http', 'how', 'would', 'what', 'itâ\x80\x99s', 'there',
       'in', 'doe'], dtype='<U830')

In [19]:
top10

array(['my', '!', 'comment', 'me', 'it\x92s', 'woman', '...', 'removed',
       'fuck', '<'], dtype='<U830')

Lemmatization makes our train and test scores worse, and doesn't seem to really be working given the names of the features here.

#### 1.2.2 Using tf-idf scaling

In [20]:
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(df.body)
X_test_tfidf = tfidf.transform(test.body)

In [21]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_tfidf, y_train)

In [22]:
tfidf_train_score = lr.score(X_train_tfidf, y_train)
tfidf_test_score = lr.score(X_test_tfidf, y_test)

In [23]:
print('Model with tf-idf scaling achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(tfidf_train_score), 2)))

Model with tf-idf scaling achieves a mean of 0.81 ROC-AUC on our training data.


In [24]:
print('Model with tf-idf scaling achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(tfidf_test_score), 2)))

Model with tf-idf scaling achieves a mean of 0.78 ROC-AUC on our test data.


In [25]:
bot30 = np.array(tfidf.get_feature_names())[np.argsort(lr.coef_[0])[:30]]
top30 = np.array(tfidf.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:30]]

In [26]:
bot30

array(['iâ', 'itâ', 'donâ', 'edit', '½ï', 'thatâ', 'http', 'doesnâ',
       'www', 'isnâ', 'youâ', 'didnâ', 'org', 'theyâ', 'canâ', 'https',
       'abstract', 'weâ', 'doi', 'hi', 'thereâ', 'arenâ', 'eli5', 'link',
       'question', 'curious', 'com', 'or', 'does', 'similar'],
      dtype='<U252')

In [27]:
top30

array(['fuck', 'mods', 'my', 'comments', '0001f914', '0001f602', 'flair',
       'censorship', 'removed', 'ass', 'wet', 'women', 'pharma',
       'thumbnail', 'upvoted', 'vote', 'upvote', 'genders', 'liberals',
       'turtle', 'weed', 'feminists', 'vegans', 'lsd', 'hillary',
       'saffron', 'racist', 'fe0f', 'shit', 'trump'], dtype='<U252')

Tf-idf scaling gives us results that are slightly better than our baseline model. The features are also now much more interesting, especially the 30 features with the highest positive coefficients. Indeed, we find many words related to very sensitive subjects ('feminists', 'liberals', 'hillary', 'trump'), as well as curse words. Moreover, '0001f602' and '0001f914' actually correspond to emojis (laughing crying emoji and thinking emoji respectively).

#### 1.2.3 Using both lemmatization and tf-idf scaling

In [28]:
tlem = TfidfVectorizer(tokenizer = LemmaTokenizer())

X_train_tlem = tlem.fit_transform(df.body)
X_test_tlem = tlem.transform(test.body)

In [29]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_tlem, y_train)

tlem_train_score = lr.score(X_train_tlem, y_train)
tlem_test_score = lr.score(X_test_tlem, y_test)

In [31]:
print('Model with tf-idf scaling and lemmatization achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(tlem_train_score), 2)))

Model with tf-idf scaling and lemmatization achieves a mean of 0.87 ROC-AUC on our training data.


In [32]:
print('Model with tf-idf scaling and lemmatization achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(tlem_test_score), 2)))

Model with tf-idf scaling and lemmatization achieves a mean of 0.79 ROC-AUC on our test data.


In [33]:
bot30 = np.array(tlem.get_feature_names())[np.argsort(lr.coef_[0])[:30]]
top30 = np.array(tlem.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:30]]

In [34]:
top30

array(['it\x92s', 'don\x92t', 'i\x92m', 'can\x92t', 'doesn\x92t',
       'that\x92s', 'i\x92ve', '\x94', 'didn\x92t', 'isn\x92t', '<', '>',
       'they\x92re', 'you\x92re', 'there\x92s', 'i\x92ll', 'i\x92d',
       'we\x92re', '\x96', 'won\x92t', 'what\x92s', 'wouldn\x92t',
       'let\x92s', 'aren\x92t', '\x93the', '\x97', 'couldn\x92t',
       'we\x92ve', 'wasn\x92t', 'upvote'], dtype='<U830')

In [35]:
bot30

array(['itâ\x80\x99s', 'donâ\x80\x99t', 'iâ\x80\x99m', 'edit',
       'thatâ\x80\x99s', 'doesnâ\x80\x99t', 'ï¿½ï¿½', 'iâ\x80\x99ve',
       'didnâ\x80\x99t', 'canâ\x80\x99t', 'â\x80\x9d', 'http',
       'isnâ\x80\x99t', 'arenâ\x80\x99t', 'thereâ\x80\x99s',
       'youâ\x80\x99re', 'theyâ\x80\x99re', 'iâ\x80\x99ll', 'iâ\x80\x99d',
       'wouldnâ\x80\x99t', 'weâ\x80\x99re', 'â\x80\x94', 'wonâ\x80\x99t',
       'whatâ\x80\x99s', 'wasnâ\x80\x99t', 'havenâ\x80\x99t', 'abstract',
       'hi', 'letâ\x80\x99s', 'couldnâ\x80\x99t'], dtype='<U830')

Here, we can see that while lemmatization made the baseline model worse, it actually makes tf-idf scaling better. However, the features are not really interpretable here and they seem to consist mostly of stop words.

#### 1.2.4 Using bi-grams, tri-grams and 4-grams

In [106]:
stopwords = stopwords.words('english')
for w in ['no', 'not', 'how', 'why', 'himself', 'yourself', 'you', 'me']:
    stopwords.remove(w)

In [21]:
gram = CountVectorizer(ngram_range=(2, 4), min_df=5, stop_words=stopwords)

X_train_chng = gram.fit_transform(df.body)
X_test_chng = gram.transform(test.body)

In [24]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_chng, y_train)

In [29]:
chng_train_score = lr.score(X_train_chng, y_train)
chng_test_score = lr.score(X_test_chng, y_test)

In [30]:
print('Model with n-grams achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(chng_train_score), 2)))

Model with n-grams achieves a mean of 0.83 ROC-AUC on our training data.


In [31]:
print('Model with n-grams achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(chng_test_score), 2)))

Model with n-grams achieves a mean of 0.71 ROC-AUC on our test data.


In [32]:
bot30 = np.array(gram.get_feature_names())[np.argsort(lr.coef_[0])[:30]]
top30 = np.array(gram.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:30]]

In [33]:
bot30

array(['itâ not', 'thanks ama', 'iâ not', 'donâ know', 'donâ think',
       'you donâ', 'anyone know', 'abstract gt', 'link paper',
       'you recommend', 'link abstract', 'someone explain', 'org content',
       'someone eli5', 'people donâ', 'you canâ', 'iâ sure',
       'stupid question', 'interested see', 'thereâ no', 'hi dr',
       'press release', 'how could', 'thatâ why', 'near future',
       'machine learning', 'speed light', 'remember reading',
       'solar panels', '½ï ½ï'], dtype='<U83')

In [34]:
top30

array(['comment section', 'comments removed', 'fuck you', 'fat shaming',
       'big pharma', 'comments deleted', 'social justice',
       'happened comments', 'affirmative action', 'commit suicide',
       'you saying', 'no shit', 'great tits', 'rick morty', 'well shit',
       'everything removed', 'vote republican', 'oh no', 'finds way',
       'comment removed', 'oh wait', 'white man', 'fat acceptance',
       'call me', 'without reading', 'oh god', '18 years', 'sea turtle',
       'fake news', 'many removed'], dtype='<U83')

As we can see, the test score of n-grams is not so good, despite its training score being pretty high. However, we some other interesting features: some, such as "comments removed", may actually indicate a leak in the data. Others, like "rick morty", are... interesting! 

#### Using all of it: lemmatization, tf-idf scaling, n-grams

In [44]:
allv = CountVectorizer(ngram_range=(2, 4), min_df=5, stop_words='english', tokenizer=LemmaTokenizer())

X_train_all = allv.fit_transform(df.body)
X_test_all = allv.transform(test.body)

In [45]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_all, y_train)

In [46]:
all_train_score = lr.score(X_train_all, y_train)
all_test_score = lr.score(X_test_all, y_test)

In [47]:
print('Model with everything achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(all_train_score), 2)))

Model with everything achieves a mean of 0.69 ROC-AUC on our training data.


In [48]:
print('Model with everything achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(all_test_score), 2)))

Model with everything achieves a mean of 0.68 ROC-AUC on our test data.


In [49]:
bot30 = np.array(allv.get_feature_names())[np.argsort(lr.coef_[0])[:30]]
top30 = np.array(allv.get_feature_names())[np.argsort(lr.coef_[0])[::-1][:30]]

In [50]:
bot30

array(['http :', ') .', 'edit :', ': http', ': http :', '? ,', '. ,',
       "doe n't", '& gt', '& gt ;', 'gt ;', ', doe', ') ,', '. edit',
       ", 's", '. think', 'climate change', '. edit :', '? doe',
       'question :', '. doe', '] (', ') ?', '( http :', '( http',
       '. itâ\x80\x99s', ", n't", '] ( http :', '] ( http', 'doing ama'],
      dtype='<U142')

In [51]:
top30

array(['> <', 'comment removed', '. it\x92s', '! !', 'removed ?',
       '. i\x92m', ', it\x92s', '. don\x92t', "! ''", 'comment removed ?',
       'shit .', 'high school', 'fuck .', '. fuck', '? !',
       'comment section', '! ! !', 'gon na', '. wa', 'yeah ,', ', i\x92m',
       "n't want", '... ...', 'big pharma', ', don\x92t',
       'comment deleted', '. <', '. started', '. woman', 'u+0001f602 >'],
      dtype='<U142')

Combining everything actually seems to yield the worst scores so far, which is somewhat surprising. The Lemma Tokenizer probably doesn't really work as we'd intend it to. 

### 1.3 Other features

We'll engineer the following features:
- Length: document size (# of characters)
- Capitalization: percentage of capital characters
- Punctuations: boolean indicating whether the post contained punctuations or not

In [92]:
df.head(2)

Unnamed: 0,body,removed
1,I've always been taught it emerged from the ea...,False
2,"As an ECE, my first feeling as ""HEY THAT'S NOT...",True


##### Length:

In [100]:
df['length'] = df.body.str.len()
test['length'] = test.body.str.len()

##### Upper Case Characters:

In [101]:
df['all_cap'] = np.where(df.body.str.isupper(), 1, 0)
test['all_cap'] = np.where(test.body.str.isupper(), 1, 0)

##### Punctuations:

In [115]:
df['punctuation'] = np.where(df.body.str.contains('!'), 1, 0)
test['punctuation'] = np.where(test.body.str.contains('!'), 1, 0)

##### Scaling:

In [103]:
df['length'] = StandardScaler().fit_transform(df['length'].values.reshape(-1,1))
test['length'] = StandardScaler().fit_transform(test['length'].values.reshape(-1,1))

Adding these to a model.

In [108]:
model = TfidfVectorizer(tokenizer = LemmaTokenizer(), stop_words = stopwords)
X_model = model.fit_transform(df.body)
X_test_model = model.transform(test.body)

# combining the text data with the other features
X_model = hstack((X_model, df[['length', 'all_cap', 'punctuation']].values))
X_test_model = hstack((X_test_model, test[['length', 'all_cap', 'punctuation']].values))

# training the model
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_model, y_train)
model_score = lr.score(X_model, y_train)
model_test_score = lr.score(X_test_model, y_test)

In [111]:
print('Model with extra features achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(model_score), 2)))
print('Model with extra features achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(model_test_score), 2)))

Model with everything achieves a mean of 0.83 ROC-AUC on our training data.
Model with everything achieves a mean of 0.79 ROC-AUC on our test data.


In [112]:
# trying the same thing without the stopwords change
model = TfidfVectorizer(tokenizer = LemmaTokenizer())

X_model = model.fit_transform(df.body)
X_test_model = model.transform(test.body)

X_model = hstack((X_model, df[['length', 'all_cap', 'punctuation']].values))
X_test_model = hstack((X_test_model, test[['length', 'all_cap', 'punctuation']].values))

lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_model, y_train)
model_score = lr.score(X_model, y_train)
model_test_score = lr.score(X_test_model, y_test)

In [112]:
print('Model with extra features achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(model_score), 2)))
print('Model with extra features achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(model_test_score), 2)))

Model with everything achieves a mean of 0.83 ROC-AUC on our training data.
Model with everything achieves a mean of 0.79 ROC-AUC on our test data.


## Task 2: Word Vectors

In [117]:
w = models.KeyedVectors.load_word2vec_format('V:/word_vectors/GoogleNews-vectors-negative300.bin', binary=True)

Vectorizing our text body and the test set. 

In [123]:
vect_w2v = CountVectorizer(vocabulary=w.index2word)
vect_w2v.fit(df.body)

docs = vect_w2v.inverse_transform(vect_w2v.transform(df.body))
X_train_body = []
for doc in docs:
    if len(doc) > 0:
        X_train_body.append(np.mean(w[doc], axis=0))
    else:
        X_train_body.append(np.zeros(300))
X_train_body = np.vstack(X_train_body)

In [157]:
# repeating the above for the test set
docs_test = vect_w2v.inverse_transform(vect_w2v.transform(test.body))
X_test_body = []
for doc in docs_test:
    if len(doc) > 0:
        X_test_body.append(np.mean(w[doc], axis=0))
    else:
        X_test_body.append(np.zeros(300))
X_test_body = np.vstack(X_test_body)

Testing the model. 

In [160]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_body, y_train)

w2v_train_score = lr.score(X_train_body, y_train)
w2v_test_score = lr.score(X_test_body, y_test)

In [161]:
print('Model w/ W2V achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(w2v_train_score), 2)))
print('Model w/ W2V achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(w2v_test_score), 2)))

Model w/ W2V achieves a mean of 0.73 ROC-AUC on our training data.
Model w/ W2V achieves a mean of 0.73 ROC-AUC on our test data.


What if we incorporate the other features? Including one that indicates that there were no vocab words. 

In [192]:
docs_series = pd.Series(docs)
df['v_length'] = docs_series.apply(lambda x: len(x)) # finds the document length
df['v_empty'] = np.where(df.v_length == 0.0, 1, 0)   # maps empty docs to 1 and others to 0

# repeat the above for test
docs_series = pd.Series(docs_test)
test['v_length'] = docs_series.apply(lambda x: len(x)) # finds the document length
test['v_empty'] = np.where(test.v_length == 0.0, 1, 0)   # maps empty docs to 1 and others to 0

In [214]:
X_train_body2 = np.concatenate((X_train_body, df[['length', 'all_cap', 'v_empty', 'punctuation']].values), axis=1)
X_test_body2 = np.concatenate((X_test_body, test[['length', 'all_cap', 'v_empty', 'punctuation']].values), axis=1)

In [215]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_body2, y_train)

w2v_train_score2 = lr.score(X_train_body2, y_train)
w2v_test_score2 = lr.score(X_test_body2, y_test)

In [216]:
print('Second model w/ W2V achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(w2v_train_score), 2)))
print('Second model w/ W2V achieves a mean of {} ROC-AUC on our test data.'.format(np.round(np.mean(w2v_test_score), 2)))

Second model w/ W2V achieves a mean of 0.73 ROC-AUC on our training data.
Second model w/ W2V achieves a mean of 0.73 ROC-AUC on our test data.
