In [91]:
import pandas as pd
from collections import Counter
from itertools import chain

# Data loading

In [2]:
# Load the training set; the first row printed below shows the columns
# `id`, `comments` and `subreddits`.
df = pd.read_csv("data/reddit_train.csv")

In [3]:
# Show the first row so the column layout of the raw data is visible.
print("Data format:\n", df.iloc[0], sep="\n")

Data format:

id                                                            0
comments      Honestly, Buffalo is the correct answer. I rem...
subreddits                                               hockey
Name: 0, dtype: object


In [4]:
# Count the examples per subreddit label.
examples_per_subreddit = df["subreddits"].value_counts()
print("Number of examples per subreddit:\n")
print(examples_per_subreddit)

Number of examples per subreddit:

GlobalOffensive    3500
conspiracy         3500
funny              3500
Music              3500
trees              3500
nfl                3500
movies             3500
hockey             3500
wow                3500
Overwatch          3500
AskReddit          3500
soccer             3500
baseball           3500
anime              3500
gameofthrones      3500
nba                3500
europe             3500
worldnews          3500
leagueoflegends    3500
canada             3500
Name: subreddits, dtype: int64


# Corpus vocabulary (no preprocessing)

In [5]:
# Concatenate every comment into one string, then split on whitespace to get
# the raw (unnormalised) token stream of the whole corpus.
all_comments_text = " ".join(df.comments.to_list())
corpus = all_comments_text.split()
print("Total words in corpus: ", len(corpus))

Total words in corpus:  2955954


In [6]:
# Frequency table of the whitespace-separated tokens; no case folding or
# punctuation stripping has been applied at this point.
vocab = Counter()
vocab.update(corpus)
print("Number of unique 'words' in corpus: ", len(vocab))

Number of unique 'words' in corpus:  187629


In [7]:
# Ask Counter for the top 1000 directly instead of sorting the whole
# vocabulary and slicing; the entries and their order are the same.
print("1000 most common words: \n")
for word, count in vocab.most_common(1000):
    print(word, ": ", count)

1000 most common words: 

the :  114884
to :  72190
a :  68406
and :  59579
of :  51613
I :  48315
is :  38504
in :  38503
that :  35374
you :  29315
it :  27053
for :  26018
was :  21762
on :  18850
have :  18722
be :  18107
with :  18061
but :  17498
are :  17369
not :  16668
this :  15899
as :  14494
they :  14364
just :  12751
like :  12659
he :  12530
or :  12418
if :  11219
at :  11197
The :  9991
so :  9563
your :  9497
my :  9208
about :  9149
would :  9089
from :  8950
can :  8811
all :  8586
more :  8259
an :  8231
his :  8074
get :  8001
don't :  7995
one :  7707
has :  7673
people :  7622
think :  7417
it's :  7336
because :  7256
their :  7179
do :  7004
what :  6988
when :  6850
will :  6749
up :  6703
I'm :  6646
by :  6645
we :  6639
out :  6633
than :  6116
who :  5836
- :  5765
how :  5598
some :  5524
me :  5486
no :  5436
had :  5431
really :  5414
only :  5379
been :  5194
even :  5128
were :  5115
any :  5040
them :  4948
If :  4925
there :  4899
You :  4686
good 

back. :  276
force :  275
problems :  275
Would :  275
food :  275
willing :  274
... :  274
paying :  274
discussion :  273
2. :  273
serious :  273
(I :  273
here, :  272
party :  272
format :  272
ago. :  271
bad. :  271
drop :  270
+ :  270
either. :  269


In [8]:
# most_common() is ordered by descending count, so the last 1000 entries are
# the rarest tokens.
least_common_raw = vocab.most_common()[-1000:]
print("1000 least common words: \n")
for word, count in least_common_raw:
    print(word, ": ", count)

1000 least common words: 

Beast. :  1
autoaim :  1
strong). :  1
overpowered" :  1
https://en.wikipedia.org/wiki/Lawrence_v._Texas :  1
^89590 :  1
Sugou :  1
grapes) :  1
grown, :  1
foodstuffs :  1
Parma, :  1
Reggio :  1
Modena, :  1
Bologna, :  1
Mantua :  1
Parmesan. :  1
delicacies :  1
$23.27 :  1
kg, :  1
$49.11 :  1
kg. :  1
https://www.parmashop.com/english/parmigiano-reggiano.html :  1
winemakers :  1
Shiraz, :  1
Riesling, :  1
Mueller-Thurgau, :  1
Dornfelder, :  1
Chardonnay :  1
Barbaresco :  1
Chianti, :  1
Rioja :  1
Reserva, :  1
sommeliers :  1
palatable. :  1
remember!) :  1
trees: :  1
treason): :  1
mother-bear :  1
media*. :  1
"Prop :  1
friendly" :  1
Ulting :  1
Sejuani... :  1
Numminen :  1
Teppo, :  1
got: :  1
proclaimed. :  1
below... :  1
bicyclist/columnist :  1
Shaplro. :  1
Hurvitz :  1
Mitzvah :  1
Hurvltz... :  1
Counsel, :  1
Advocacy :  1
committee. :  1
i, :  1
Loulse.... :  1
Shimmel's :  1
shlmmel... :  1
HanAholeSolo :  1
purveyed :  1
circle-

75,000 :  1
Toba :  1
"nuclear" :  1
metallurgy, :  1
desolate, :  1
inhospitable :  1
cockroaches. :  1
"Surely :  1
comment!". :  1
Monogamy :  1
Baseball, :  1
Extremadura. :  1
(2008-2014), :  1
Centro :  1
converged :  1
peninsula. :  1
weapon! :  1
4X :  1
animes/SoL :  1
MAL](https://myanimelist.net/animelist/Talonpls) :  1
Launders, :  1
sado :  1
top:D :  1


# Data preprocessing with spaCy

In [9]:
import spacy
from tqdm import tqdm_notebook as tqdm

In [10]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp.pipe` below uses it to
# process every comment.
nlp = spacy.load("en_core_web_sm")

In [80]:
# Run every comment through the spaCy pipeline in large batches, with a tqdm
# progress bar over the input texts.
# `n_threads` was dropped: it is a deprecated no-op since spaCy 2.1 and is not
# accepted by spaCy 3's `Language.pipe`.
tokenized_comments = list(nlp.pipe(tqdm(df.comments.to_list()), batch_size=10000))

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))




In [102]:
def filter_token(t):
    """Decide whether a token should be kept for the cleaned vocabulary.

    Args:
        t: A token exposing the spaCy lexical flags read below (the caller
            passes tokens from the docs produced by ``nlp.pipe``).

    Returns:
        ``False`` if any exclusion flag is set (bracket, currency, digit,
        punctuation, quote, whitespace, stop word, URL-like, e-mail-like or
        number-like); otherwise the token's ``is_alpha`` flag.
    """
    # Flags are checked in a fixed order and evaluation stops at the first hit.
    excluded = (
        t.is_bracket
        or t.is_currency
        or t.is_digit
        or t.is_left_punct
        or t.is_right_punct
        or t.is_punct
        or t.is_quote
        or t.is_space
        or t.is_stop
        or t.like_url
        or t.like_email
        or t.like_num
    )
    if excluded:
        return False
    # Only purely alphabetic tokens survive.
    return t.is_alpha

In [103]:
# For every parsed comment keep only the tokens accepted by filter_token and
# store their lower-cased lemmas.
preprocessed_comments = [
    [token.lemma_.lower() for token in doc if filter_token(token)]
    for doc in tqdm(tokenized_comments)
]

HBox(children=(IntProgress(value=0, max=70000), HTML(value='')))




In [104]:
# Spot-check: the filtered, lemmatised tokens of the comment at index 9.
preprocessed_comments[9]

['right',
 'disruptor',
 'tank',
 'pull',
 'dps',
 'frey',
 'pick',
 'point',
 'reliably',
 'hero',
 'expect',
 'win',
 'heartbreake']

In [105]:
# Spot-check: the parsed spaCy doc of the comment at index 9, i.e. its text
# before filtering and lemmatisation.
tokenized_comments[9]

Right! He was a disruptor tank! Pull the dps out of the frey and pick them off, then get on the point. Now he can't reliably go one on one with any hero and expect to win... it's so heartbreaking..

In [106]:
# Store each comment's kept lemmas as one space-separated string in a new column.
df["preprocessed_comments"] = list(map(" ".join, preprocessed_comments))

In [107]:
# Frequency table over all lemmas that survived preprocessing.
preprocessed_vocab = Counter(
    lemma for lemmas in preprocessed_comments for lemma in lemmas
)
print("Number of unique 'words' in corpus: ", len(preprocessed_vocab))

Number of unique 'words' in corpus:  54326


In [108]:
# Ask Counter for the top 1000 directly instead of sorting the whole
# vocabulary and slicing; the entries and their order are the same.
print("1000 most common words: \n")
for word, count in preprocessed_vocab.most_common(1000):
    print(word, ": ", count)

1000 most common words: 

like :  14480
think :  10130
people :  9412
good :  8823
time :  7522
know :  7049
go :  6982
get :  6870
play :  6458
year :  6131
game :  5688
want :  5237
say :  5088
team :  5018
thing :  4957
post :  4831
way :  4610
look :  4579
need :  4347
gt :  4133
right :  4048
music :  4004
come :  3731
player :  3726
work :  3692
try :  3686
bad :  3627
point :  3584
mean :  3581
guy :  3301
lot :  3295
make :  3107
pretty :  3070
actually :  3025
feel :  3020
see :  2852
well :  2817
sure :  2814
find :  2800
new :  2775
fuck :  2743
watch :  2672
shit :  2668
start :  2664
great :  2635
love :  2632
day :  2630
season :  2531
probably :  2460
use :  2459
happen :  2437
win :  2374
take :  2358
man :  2344
big :  2318
end :  2306
not :  2301
yeah :  2285
high :  2271
question :  2182
world :  2115
read :  2085
talk :  2085
tell :  2069
give :  2060
long :  2042
maybe :  1999
change :  1991
movie :  1958
leave :  1906
let :  1869
kill :  1840
fan :  1828
live :  1

In [109]:
# most_common() is ordered by descending count, so the last 1000 entries are
# the rarest lemmas.
least_common_preprocessed = preprocessed_vocab.most_common()[-1000:]
print("1000 least common words: \n")
for word, count in least_common_preprocessed:
    print(word, ": ", count)

1000 least common words: 

denkt :  1
leuhgenpraat :  1
verkoopen :  1
tweemaal :  1
moeten :  1
denken :  1
cattengehspuy :  1
tyd :  1
opstel :  1
stuur :  1
opdracht :  1
naar :  1
geheyman :  1
samenstel :  1
verspieder :  1
vloerduyven :  1
verspreid :  1
republiek :  1
verenigde :  1
nederlanden :  1
gevolgd :  1
kunt :  1
zich :  1
voorbereiden :  1
rabaut :  1
bedroevenden :  1
klyne :  1
leven :  1
noemt :  1
weg :  1
vaagen :  1
tyden :  1
zyn :  1
wyzen :  1
doden :  1
slecht :  1
bloten :  1
handen :  1
alleen :  1
veelomvattend :  1
geoefend :  1
ongewapenden :  1
krygskunst :  1
alsmede :  1
voltallige :  1
arsenaal :  1
watergeuzen :  1
myner :  1
beschikke :  1
benutten :  1
uwer :  1
lamlendigen :  1
achtereinde :  1
vastenland :  1
vagen :  1
klynen :  1
schobbejak :  1
geweten :  1
voor :  1
eene :  1
goddelooze :  1
vergelde :  1
geestige :  1
teweeg :  1
zou :  1
brengen :  1
misschien :  1
uwen :  1
tong :  1
gebeten :  1
betalen :  1
smeerkani :  1
furie :  1
sch

In [112]:
# Write the frame to a new CSV file; index=False leaves the row index out of
# the file.
df.to_csv("data/reddit_spacy_train.csv", index=False)