In [1]:
import pandas as pd
from collections import Counter
from itertools import chain

# Data loading

In [2]:
# Load the Reddit training set from a path relative to the working directory.
df = pd.read_csv("data/reddit_train.csv")

In [3]:
# Preview the schema by printing the first record.
first_example = df.iloc[0]
print("Data format:\n")
print(first_example)

Data format:

id                                                            0
comments      Honestly, Buffalo is the correct answer. I rem...
subreddits                                               hockey
Name: 0, dtype: object


In [4]:
# Class balance: number of comments carrying each subreddit label.
examples_per_subreddit = df["subreddits"].value_counts()
print("Number of examples per subreddit:\n")
print(examples_per_subreddit)

Number of examples per subreddit:

wow                3500
movies             3500
AskReddit          3500
nba                3500
conspiracy         3500
GlobalOffensive    3500
canada             3500
soccer             3500
anime              3500
worldnews          3500
Music              3500
europe             3500
Overwatch          3500
leagueoflegends    3500
funny              3500
hockey             3500
nfl                3500
baseball           3500
trees              3500
gameofthrones      3500
Name: subreddits, dtype: int64


# Corpus vocabulary (no preprocessing)

In [5]:
# Whitespace-tokenise every comment and flatten the pieces into one list of
# raw tokens (no lower-casing or punctuation stripping at this stage).
corpus = list(chain.from_iterable(comment.split() for comment in df.comments))
print("Total words in corpus: ", len(corpus))

Total words in corpus:  2955954


In [6]:
# Frequency table of the raw whitespace tokens.
vocab = Counter(corpus)
n_unique_tokens = len(vocab)
print("Number of unique 'words' in corpus: ", n_unique_tokens)

Number of unique 'words' in corpus:  187629


In [7]:
# Show the 1000 highest-frequency raw tokens.
# most_common(n) is the documented way to ask for the top n entries; it yields
# the same pairs as most_common()[:n] without building the full ranked list
# of the whole vocabulary just to slice it.
print("1000 most common words: \n")
for word, count in vocab.most_common(1000):
    print(word, ": ", count)

1000 most common words: 

the :  114884
to :  72190
a :  68406
and :  59579
of :  51613
I :  48315
is :  38504
in :  38503
that :  35374
you :  29315
it :  27053
for :  26018
was :  21762
on :  18850
have :  18722
be :  18107
with :  18061
but :  17498
are :  17369
not :  16668
this :  15899
as :  14494
they :  14364
just :  12751
like :  12659
he :  12530
or :  12418
if :  11219
at :  11197
The :  9991
so :  9563
your :  9497
my :  9208
about :  9149
would :  9089
from :  8950
can :  8811
all :  8586
more :  8259
an :  8231
his :  8074
get :  8001
don't :  7995
one :  7707
has :  7673
people :  7622
think :  7417
it's :  7336
because :  7256
their :  7179
do :  7004
what :  6988
when :  6850
will :  6749
up :  6703
I'm :  6646
by :  6645
we :  6639
out :  6633
than :  6116
who :  5836
- :  5765
how :  5598
some :  5524
me :  5486
no :  5436
had :  5431
really :  5414
only :  5379
been :  5194
even :  5128
were :  5115
any :  5040
them :  4948
If :  4925
there :  4899
You :  4686
good 

In [8]:
# Tail of the frequency ranking: the last 1000 entries of the full
# most-to-least-frequent ordering.
rarest_raw = vocab.most_common()[-1000:]
print("1000 least common words: \n")
for word, count in rarest_raw:
    print(word, ": ", count)

1000 least common words: 

Beast. :  1
autoaim :  1
strong). :  1
overpowered" :  1
https://en.wikipedia.org/wiki/Lawrence_v._Texas :  1
^89590 :  1
Sugou :  1
grapes) :  1
grown, :  1
foodstuffs :  1
Parma, :  1
Reggio :  1
Modena, :  1
Bologna, :  1
Mantua :  1
Parmesan. :  1
delicacies :  1
$23.27 :  1
kg, :  1
$49.11 :  1
kg. :  1
https://www.parmashop.com/english/parmigiano-reggiano.html :  1
winemakers :  1
Shiraz, :  1
Riesling, :  1
Mueller-Thurgau, :  1
Dornfelder, :  1
Chardonnay :  1
Barbaresco :  1
Chianti, :  1
Rioja :  1
Reserva, :  1
sommeliers :  1
palatable. :  1
remember!) :  1
trees: :  1
treason): :  1
mother-bear :  1
media*. :  1
"Prop :  1
friendly" :  1
Ulting :  1
Sejuani... :  1
Numminen :  1
Teppo, :  1
got: :  1
proclaimed. :  1
below... :  1
bicyclist/columnist :  1
Shaplro. :  1
Hurvitz :  1
Mitzvah :  1
Hurvltz... :  1
Counsel, :  1
Advocacy :  1
committee. :  1
i, :  1
Loulse.... :  1
Shimmel's :  1
shlmmel... :  1
HanAholeSolo :  1
purveyed :  1
circle-

# Data preprocessing with spaCy

In [9]:
import spacy
# tqdm.auto picks the notebook widget when it is available and falls back to
# the console bar otherwise; the old `tqdm_notebook` alias is deprecated.
from tqdm.auto import tqdm

In [10]:
# Load spaCy's small English pipeline. The model is installed separately from
# the spacy package; if this raises OSError [E050], install it first with
#     python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
# Run the spaCy pipeline over every comment in batches, with a tqdm progress
# bar wrapped around the input texts.
# `n_threads` was removed from the call: per the spaCy docs it is deprecated
# and has no effect since v2.1, and Language.pipe in v3 no longer accepts it.
# (Newer spaCy versions expose `n_process` for multiprocessing instead.)
tokenized_comments = list(nlp.pipe(tqdm(df.comments.to_list()), batch_size=10000))

In [None]:
def filter_token(t):
    """Decide whether token ``t`` is kept for the cleaned vocabulary.

    A token passes only when none of the exclusion flags listed below is
    set (brackets, currency symbols, digits, punctuation, quotes,
    whitespace, stop words, URL-like, e-mail-like or number-like text)
    and the token is purely alphabetic.

    Args:
        t: an object exposing the boolean lexical attributes read below
            (in this notebook, a spaCy token).

    Returns:
        A truthy value if the token should be kept, a falsy one otherwise.
    """
    exclusion_flags = (
        t.is_bracket,
        t.is_currency,
        t.is_digit,
        t.is_left_punct,
        t.is_right_punct,
        t.is_punct,
        t.is_quote,
        t.is_space,
        t.is_stop,
        t.like_url,
        t.like_email,
        t.like_num,
    )
    return not any(exclusion_flags) and t.is_alpha

In [None]:
# For each parsed comment, keep only the tokens accepted by filter_token and
# store their lower-cased lemma.
preprocessed_comments = []
for doc in tqdm(tokenized_comments):
    kept_lemmas = [tok.lemma_.lower() for tok in doc if filter_token(tok)]
    preprocessed_comments.append(kept_lemmas)

In [None]:
# Spot-check one example (index 9): its filtered, lower-cased lemmas.
preprocessed_comments[9]

In [None]:
# The spaCy-parsed comment at index 9 before any filtering, for comparison
# with preprocessed_comments[9].
tokenized_comments[9]

In [None]:
# Store each cleaned comment as a single space-separated string in a new column.
df["preprocessed_comments"] = list(map(" ".join, preprocessed_comments))

In [None]:
# Frequency table of the cleaned lemmas, accumulated one comment at a time.
preprocessed_vocab = Counter()
for comment_lemmas in preprocessed_comments:
    preprocessed_vocab.update(comment_lemmas)
print("Number of unique 'words' in corpus: ", len(preprocessed_vocab))

In [None]:
# Show the 1000 highest-frequency lemmas after preprocessing.
# most_common(n) is the documented way to ask for the top n entries; it yields
# the same pairs as most_common()[:n] without building the full ranked list
# of the whole vocabulary just to slice it.
print("1000 most common words: \n")
for word, count in preprocessed_vocab.most_common(1000):
    print(word, ": ", count)

In [None]:
# Tail of the frequency ranking: the last 1000 entries of the full
# most-to-least-frequent ordering of the cleaned lemmas.
rarest_lemmas = preprocessed_vocab.most_common()[-1000:]
print("1000 least common words: \n")
for word, count in rarest_lemmas:
    print(word, ": ", count)

In [None]:
# Persist the frame (original columns plus preprocessed_comments) without
# writing the row index as an extra column.
# NOTE(review): a comment whose tokens were all filtered out is saved as an
# empty string, which pandas.read_csv parses as NaN by default — confirm the
# downstream loader handles that.
df.to_csv("data/reddit_spacy_train.csv", index=False)