In [1]:
TRAIN_SET = "./data/reddit_200k_train_utf-8.csv"
TEST_SET = "./data/reddit_200k_test_utf-8.csv"

In [2]:
import numpy as np
import pandas as pd

In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

##  Task 1 Bag of Words and simple Features
### 1.1 Create a baseline model using a bag-of-words approach and a linear model.

In [4]:
corpus_train = pd.read_csv(TRAIN_SET, encoding='utf-8')
corpus_test = pd.read_csv(TEST_SET, encoding='utf-8')

In [5]:
print("The size of training dataset is {}".format(corpus_train.shape))
print("The size of testing dataset is {}".format(corpus_test.shape))

The size of training dataset is (167529, 8)
The size of testing dataset is (55843, 8)


For this homework, only use the reddit_200k training and test set, and only use the “body” and “removed” columns. <BR>
Pick an appropriate evaluation metric for imbalanced binary classification.

In [6]:
train_set = corpus_train[['body', 'REMOVED']]
test_set = corpus_test[['body', 'REMOVED']]

In [7]:
print("The size of training dataset is {}".format(train_set.shape))
print("The size of testing dataset is {}".format(test_set.shape))

The size of training dataset is (167529, 2)
The size of testing dataset is (55843, 2)


In [8]:
X_train, y_train = train_set['body'], train_set['REMOVED']
print(f"X_train has the type of {type(X_train)}, and has the size of {X_train.shape}")
print(f"y_train has the type of {type(y_train)}, and has the size of {y_train.shape}")

X_test, y_test = test_set['body'], test_set['REMOVED']
print(f"X_test has the type of {type(X_test)}, and has the size of {X_test.shape}")
print(f"y_test has the type of {type(y_test)}, and has the size of {y_test.shape}")

X_train has the type of <class 'pandas.core.series.Series'>, and has the size of (167529,)
y_train has the type of <class 'pandas.core.series.Series'>, and has the size of (167529,)
X_test has the type of <class 'pandas.core.series.Series'>, and has the size of (55843,)
y_test has the type of <class 'pandas.core.series.Series'>, and has the size of (55843,)


In [9]:
# tokenization and building a vocabulary
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect.fit(X_train)
print(f"Builded vocabulary size: {len(vect.vocabulary_)}")

Builded vocabulary size: 115231


In [10]:
X_train_baseline = vect.transform(X_train)
print("bag_of_words: {}".format(repr(X_train_baseline)))

bag_of_words: <167529x115231 sparse matrix of type '<class 'numpy.int64'>'
	with 5025953 stored elements in Compressed Sparse Row format>


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train_baseline, y_train)

In [48]:
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

X_test_baseline = vect.transform(X_test)
print("Test score: {:.2f}".format(grid.score(X_test_baseline, y_test)))

Test score: 0.71


### 1.2 Try using n-grams, characters, tf-idf rescaling and possibly other ways to tune the BoW model. Be aware that you might need to adjust the (regularization of the) linear model for different feature sets.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(TfidfVectorizer(min_df=5), LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10],
              'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
              'tfidfvectorizer__min_df': [1, 2, 3, 4, 5]}

grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)
print(f"Best cross-validation score: {:}")

In [None]:
import spacy
print(f"SpaCy version: {spacy.__version__}")

In [None]:
import re
regexp = re.compile('(?u)\\b\\w\\w+\\b')
en_nlp = spacy.load('en', disable=['parser', 'ner'])
old_tokenizer = en_nlp.tokenizer
en_nlp.tokenizer = lambda string: old_tokenizer.tokens_from_list(regexp.findall(string))

def custom_tokenizer(document):
    doc_spacy = en_nlp(document)
    return [token.lemma_ for token in doc_spacy]


lemma_vect = TfidfVectorizer(tokenizer=custom_tokenizer)

### 1.3 Explore other features you can derive from the text, such as html, length, punctuation, capitalization or other features you deem important from exploring the dataset

## Task 2 Word Vectors

### Use a pretrained word-embedding (word2vec, glove or fasttext) instead of the bag-of-words model. Does this improve classification?