In [16]:
import pandas as pd
import numpy as np

# plots
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# processing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

# modeling
from sklearn.linear_model import LogisticRegressionCV

# others
import warnings
warnings.filterwarnings('ignore')

In [20]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\coren\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\coren\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

# Homework 4

In [2]:
df = pd.read_csv('data/reddit_200k_train.csv', encoding = 'latin-1', index_col='Unnamed: 0')
test = pd.read_csv('data/reddit_200k_test.csv', encoding = 'latin-1', index_col='Unnamed: 0')

# subset the columns
df['removed'] = df.REMOVED
df = df[['body', 'removed']]

test['removed'] = test.REMOVED
test = test[['body', 'removed']]

In [3]:
df.head(2)

Unnamed: 0,body,removed
1,I've always been taught it emerged from the ea...,False
2,"As an ECE, my first feeling as ""HEY THAT'S NOT...",True


# Task 1: Bag of Words and Simple Features

### 1.1 Baseline Model

In [9]:
cv = CountVectorizer()
X_train_base = cv.fit_transform(df.body)
y_train = np.where(df.removed, 1, 0)

X_test_base = cv.fit_transform(test.body)
y_test = np.where(test.removed, 1, 0)

In [10]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_train_base, y_train)
baseline_train_score = lr.scores_
baseline_test_score = lr.score(X_test_base, y_test)

In [11]:
print('Baseline model achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(baseline_train_score), 2)))

Baseline model achieves a mean of 0.73 ROC-AUC on our training data.


In [None]:
print('Baseline model achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(baseline_test_score), 2)))

### 1.2 Processing

#### 1.2.1 Using stemming

We want to try using lemmatization with the count vectorizer, which will help reduce the number of features. 

In [21]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

lem = CountVectorizer(tokenizer = LemmaTokenizer())
X_train_lem = lem.fit_transform(df.body)
X_test_lem = lem.fit_transform()

In [23]:
lr = LogisticRegressionCV(cv=5, scoring='roc_auc', solver='sag').fit(X_lem, y)
lem_score = lr.scores_

In [25]:
print('Model with stemming achieves a mean of {} ROC-AUC on our training data.'.format(np.round(np.mean(baseline_score), 2)))

0.7146171498354021

#### 1.2.2 Using tf-idf scaling

In [None]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(df.body)

In [None]:
tfidf_score = cross_val_score(RidgeClassifier(solver='sag'), X_tfidf, y, cv=5, scoring='roc_auc')