# Natural Language Processing Challenge

## Model: Bag of words

In [1]:
# Optional (disabled): append the Colab sample_data folder to the module search
# path -- presumably so that `helper_data` can be imported from there.
#import sys
#sys.path.append('/content/sample_data')

In [2]:
# Imports used throughout the notebook.
import importlib
import helper_data as myhelp
# Re-execute the already imported helper module so that edits made to
# helper_data.py are picked up without restarting the kernel.
importlib.reload(myhelp)

import pandas as pd
import nltk


In [3]:
# Optional (disabled, Google Colab only): mount Google Drive at /content/drive.
#from google.colab import drive
#drive.mount('/content/drive')

## Load the data

The data will be loaded from a .csv file. The columns are separated with tabs.

In [4]:
# Load the training data; the tab character is passed explicitly as separator.
# (Commented line: same call with the path relative to a ../dataset folder.)
#data = myhelp.load_data("../dataset/training_data_lowercase.csv", '\t')
data = myhelp.load_data("training_data_lowercase.csv", '\t')

# Load the unlabeled testing data. No separator is passed here, so the helper's
# default applies -- NOTE(review): the default is not visible in this notebook;
# confirm it matches the file's delimiter.
#data_out = myhelp.load_data("../dataset/testing_data_lowercase_nolabels.csv")
data_out = myhelp.load_data("testing_data_lowercase_nolabels.csv")

## Initial visualization of the data

Let's get familiar with the data by starting to visualize the content.

In [5]:
# Print head(), info() and shape of the training frame (see the output below).
myhelp.display_data(data)

Data head():
   0                                                  1
0  0  donald trump sends out embarrassing new year‚s...
1  0  drunk bragging trump staffer started russian c...
2  0  sheriff david clarke becomes an internet joke ...
3  0  trump is so obsessed he even has obama‚s name ...
4  0  pope francis just called out donald trump duri...

Data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34152 entries, 0 to 34151
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       34152 non-null  int64 
 1   1       34152 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.8+ KB
None

Data shape:
(34152, 2)


In [6]:
# Print head(), info() and shape of the testing frame (see the output below).
myhelp.display_data(data_out)

Data head():
   0                                                  1
0  2  copycat muslim terrorist arrested with assault...
1  2  wow! chicago protester caught on camera admits...
2  2   germany's fdp look to fill schaeuble's big shoes
4  2  u.n. seeks 'massive' aid boost amid rohingya '...

Data info():
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9984 entries, 0 to 9983
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       9984 non-null   object
 1   1       9984 non-null   object
dtypes: object(2)
memory usage: 156.1+ KB
None

Data shape:
(9984, 2)


## Set names for the columns

In [7]:
# Both frames have the same two-column layout: the label first, then the text.
# The assignment runs left to right, so `data` is renamed before `data_out`.
data.columns = data_out.columns = ['label', 'article']

## Data preprocessing

Observations:
- The data is already lower-case.

### Stopwords

In [8]:
import nltk
# Variant without arguments; try this if you need some other package.
#nltk.download()
# Google Colab: download the stop-word corpus into /root/nltk_data.
nltk.download('stopwords', download_dir='/root/nltk_data')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
from nltk.corpus import stopwords

# Show the English stop-word list that is removed from the articles later on.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

### Tokenization

In [10]:
# Google Colab: download the `punkt_tab` tokenizer data into /root/nltk_data,
# the same target directory used by the other download cells.
# A single call is enough; downloading the same package again is a no-op
# ("Package punkt_tab is already up-to-date!").
nltk.download('punkt_tab', download_dir='/root/nltk_data')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:

# Training set: tokenize each article and drop punctuation.
# The helper is handed to `apply` directly; `args` supplies the column name
# that follows the row argument.
data['token'] = data.apply(
    myhelp.tokenizer_and_remove_punctuation,
    axis=1,
    args=('article',),
)
data.head()

Unnamed: 0,label,article,token
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru..."


In [12]:

# Testing set: tokenize each article and drop punctuation.
# The helper is handed to `apply` directly; `args` supplies the column name
# that follows the row argument.
data_out['token'] = data_out.apply(
    myhelp.tokenizer_and_remove_punctuation,
    axis=1,
    args=('article',),
)
data_out.head()

Unnamed: 0,label,article,token
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,..."
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an..."


### Stemming

In [13]:
# Stemming of the training articles, row by row, based on the 'token' column.
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# The English Snowball stemmer is the one in use; the Porter stemmer is kept
# below as a disabled alternative.
#stemmer_porter   = PorterStemmer()
stemmer = SnowballStemmer('english')

data['stem'] = data.apply(
    lambda rec: myhelp.stemming(stemmer, rec, 'token'), axis=1
)
data.head()

Unnamed: 0,label,article,token,stem
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,..."


In [14]:

# Testing set: stem the 'token' column with the stemmer created for the
# training set.
data_out['stem'] = data_out.apply(
    lambda rec: myhelp.stemming(stemmer, rec, 'token'), axis=1
)
data_out.head()

Unnamed: 0,label,article,token,stem
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ..."
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,..."


### Lemmatization

In [15]:
#import nltk
# Variants without an explicit download_dir; try these if you need the packages elsewhere.
#nltk.download('averaged_perceptron_tagger_eng')
#nltk.download('wordnet') # wordnet is the most well known lemmatizer for english
#nltk.download('omw-1.4')

# Google Colab: download the tagger and WordNet packages into /root/nltk_data.
# The value displayed by the cell is the return value of the last call.
nltk.download('averaged_perceptron_tagger_eng', download_dir='/root/nltk_data')
nltk.download('wordnet', download_dir='/root/nltk_data') # wordnet is the most well known lemmatizer for english
nltk.download('omw-1.4', download_dir='/root/nltk_data')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [16]:
# Training set: lemmatize the 'token' column with WordNet.
# The helper's name indicates that part-of-speech information is used
# (see helper_data for the details).
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

data['lemma'] = data.apply(
    lambda rec: myhelp.lemmatizer_with_pos(lemmatizer, rec, 'token'), axis=1
)
data.head()


Unnamed: 0,label,article,token,stem,lemma
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ..."
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump..."


In [17]:

# Testing set: lemmatize the 'token' column with the lemmatizer created for
# the training set.
data_out['lemma'] = data_out.apply(
    lambda rec: myhelp.lemmatizer_with_pos(lemmatizer, rec, 'token'), axis=1
)
data_out.head()

Unnamed: 0,label,article,token,stem,lemma
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,..."
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,..."


### Remove stop words

In [18]:
# Training set: drop stop words from the plain tokens.
# NOTE(review): in the displayed result the remaining tokens no longer follow
# the order of the 'token' column, which suggests a set-based removal in the
# helper -- confirm that token order and duplicates are not needed downstream.
data['token_no_sw'] = data.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'token'), axis=1
)
data.head()


Unnamed: 0,label,article,token,stem,lemma,token_no_sw
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]"
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,..."


In [19]:
# Training set: drop stop words from the stemmed tokens.
data['stem_no_sw'] = data.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'stem'), axis=1
)
data.head()


Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin...","[trump, donald, send, eve, disturb, embarrass,..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,...","[brag, staffer, trump, russian, investig, star..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh...","[internet, joke, becom, threaten, poke, peopl,..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]","[trump, obsess, code, websit, imag]"
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,...","[pope, donald, trump, speech, christma, franci..."


In [20]:
# Training set: drop stop words from the lemmatized tokens.
data['lemma_no_sw'] = data.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'lemma'), axis=1
)
data.head()


Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin...","[trump, donald, send, eve, disturb, embarrass,...","[trump, donald, message, eve, sends, disturb, ..."
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,...","[brag, staffer, trump, russian, investig, star...","[staffer, trump, bragging, russian, start, col..."
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh...","[internet, joke, becom, threaten, poke, peopl,...","[internet, clarke, joke, poke, threaten, sheri..."
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]","[trump, obsess, code, websit, imag]","[image, trump, cod, obsess, website]"
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,...","[pope, donald, trump, speech, christma, franci...","[pope, donald, trump, speech, francis, christm..."


In [21]:
# Testing set: drop stop words from the plain tokens.
# NOTE(review): the displayed result does not keep the order of the 'token'
# column -- confirm that token order and duplicates are not needed downstream.
data_out['token_no_sw'] = data_out.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'token'), axis=1
)
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]"
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]"


In [22]:
# Testing set: drop stop words from the stemmed tokens.
data_out['stem_no_sw'] = data_out.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'stem'), axis=1
)
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist...","[muslim, arrest, copycat, terrorist, weapon, a..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent...","[activ, camera, gon, protest, caught, chicago,..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]","[germani, fill, big, shoe, schaeubl, fdp]"
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ...","[send, packet, wear, flag, back, welcom, warn,..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]","[seek, amid, rohingya, aid, boost, emerg]"


In [23]:
# Testing set: drop stop words from the lemmatized tokens.
data_out['lemma_no_sw'] = data_out.apply(
    lambda rec: myhelp.remove_stopwords(stopwords, rec, 'lemma'), axis=1
)
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist...","[muslim, arrest, copycat, terrorist, weapon, a...","[muslim, arrest, copycat, terrorist, weapon, a..."
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent...","[activ, camera, gon, protest, caught, chicago,...","[camera, gon, caught, admits, chicago, violent..."
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]","[germani, fill, big, shoe, schaeubl, fdp]","[schaeuble, fill, big, shoe, germany, fdp]"
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ...","[send, packet, wear, flag, back, welcom, warn,...","[packet, wear, flag, sends, back, welcome, war..."
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]","[seek, amid, rohingya, aid, boost, emerg]","[seek, amid, rohingya, aid, boost, emergency]"


### Merge preprocessed data to suit classification

---



In [24]:
# Put the cleaning steps together.
# Training set: join the stop-word-free tokens of every row into a single
# space-separated string (see the 'token_blob' column in the output).
# The helper is handed to `apply` directly; `args` supplies the column name.
data['token_blob'] = data.apply(
    myhelp.re_blob,
    axis=1,
    args=('token_no_sw',),
)
data.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin...","[trump, donald, send, eve, disturb, embarrass,...","[trump, donald, message, eve, sends, disturb, ...",trump donald message eve sends disturbing emba...
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,...","[brag, staffer, trump, russian, investig, star...","[staffer, trump, bragging, russian, start, col...",staffer trump bragging russian collusion inves...
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh...","[internet, joke, becom, threaten, poke, peopl,...","[internet, clarke, joke, poke, threaten, sheri...",internet clarke joke threatening poke sheriff ...
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]","[trump, obsess, code, websit, imag]","[image, trump, cod, obsess, website]",trump obsessed coded images website
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,...","[pope, donald, trump, speech, christma, franci...","[pope, donald, trump, speech, francis, christm...",pope donald trump speech called francis christmas


In [25]:
# Training set: join the stop-word-free stems of every row into a single
# space-separated string.
data['stem_blob'] = data.apply(
    myhelp.re_blob,
    axis=1,
    args=('stem_no_sw',),
)
data.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob,stem_blob
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin...","[trump, donald, send, eve, disturb, embarrass,...","[trump, donald, message, eve, sends, disturb, ...",trump donald message eve sends disturbing emba...,trump donald send eve disturb embarrass messag
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,...","[brag, staffer, trump, russian, investig, star...","[staffer, trump, bragging, russian, start, col...",staffer trump bragging russian collusion inves...,brag staffer trump russian investig start coll...
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh...","[internet, joke, becom, threaten, poke, peopl,...","[internet, clarke, joke, poke, threaten, sheri...",internet clarke joke threatening poke sheriff ...,internet joke becom threaten poke peopl clark ...
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]","[trump, obsess, code, websit, imag]","[image, trump, cod, obsess, website]",trump obsessed coded images website,trump obsess code websit imag
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,...","[pope, donald, trump, speech, christma, franci...","[pope, donald, trump, speech, francis, christm...",pope donald trump speech called francis christmas,pope donald trump speech christma franci dure ...


In [26]:
# Training set: join the stop-word-free lemmas of every row into a single
# space-separated string.
data['lemma_blob'] = data.apply(
    myhelp.re_blob,
    axis=1,
    args=('lemma_no_sw',),
)
data.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob,stem_blob,lemma_blob
0,0,donald trump sends out embarrassing new year‚s...,"[donald, trump, sends, out, embarrassing, new,...","[donald, trump, send, out, embarrass, new, eve...","[donald, trump, sends, out, embarrass, new, ev...","[trump, donald, message, eve, sends, disturbin...","[trump, donald, send, eve, disturb, embarrass,...","[trump, donald, message, eve, sends, disturb, ...",trump donald message eve sends disturbing emba...,trump donald send eve disturb embarrass messag,trump donald message eve sends disturb embarrass
1,0,drunk bragging trump staffer started russian c...,"[drunk, bragging, trump, staffer, started, rus...","[drunk, brag, trump, staffer, start, russian, ...","[drunk, bragging, trump, staffer, start, russi...","[staffer, trump, bragging, russian, collusion,...","[brag, staffer, trump, russian, investig, star...","[staffer, trump, bragging, russian, start, col...",staffer trump bragging russian collusion inves...,brag staffer trump russian investig start coll...,staffer trump bragging russian start collusion...
2,0,sheriff david clarke becomes an internet joke ...,"[sheriff, david, clarke, becomes, an, internet...","[sheriff, david, clark, becom, an, internet, j...","[sheriff, david, clarke, becomes, an, internet...","[internet, clarke, joke, threatening, poke, sh...","[internet, joke, becom, threaten, poke, peopl,...","[internet, clarke, joke, poke, threaten, sheri...",internet clarke joke threatening poke sheriff ...,internet joke becom threaten poke peopl clark ...,internet clarke joke poke threaten sheriff david
3,0,trump is so obsessed he even has obama‚s name ...,"[trump, is, so, obsessed, he, even, has, name,...","[trump, is, so, obsess, he, even, has, name, c...","[trump, be, so, obsess, he, even, have, name, ...","[trump, obsessed, coded, images, website]","[trump, obsess, code, websit, imag]","[image, trump, cod, obsess, website]",trump obsessed coded images website,trump obsess code websit imag,image trump cod obsess website
4,0,pope francis just called out donald trump duri...,"[pope, francis, just, called, out, donald, tru...","[pope, franci, just, call, out, donald, trump,...","[pope, francis, just, call, out, donald, trump...","[pope, donald, trump, speech, called, francis,...","[pope, donald, trump, speech, christma, franci...","[pope, donald, trump, speech, francis, christm...",pope donald trump speech called francis christmas,pope donald trump speech christma franci dure ...,pope donald trump speech francis christmas call


In [27]:
# Testing set: join the stop-word-free tokens of every row into a single
# space-separated string.
data_out['token_blob'] = data_out.apply(
    myhelp.re_blob,
    axis=1,
    args=('token_no_sw',),
)
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist...","[muslim, arrest, copycat, terrorist, weapon, a...","[muslim, arrest, copycat, terrorist, weapon, a...",muslim weapons copycat arrested terrorist assault
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent...","[activ, camera, gon, protest, caught, chicago,...","[camera, gon, caught, admits, chicago, violent...",camera gon caught admits chicago violent activ...
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]","[germani, fill, big, shoe, schaeubl, fdp]","[schaeuble, fill, big, shoe, germany, fdp]",schaeuble fill big shoes germany fdp
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ...","[send, packet, wear, flag, back, welcom, warn,...","[packet, wear, flag, sends, back, welcome, war...",wearing kids packet flag sends warning back we...
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]","[seek, amid, rohingya, aid, boost, emerg]","[seek, amid, rohingya, aid, boost, emergency]",amid aid rohingya boost seeks emergency


In [28]:
# Test data: build the 'stem_blob' text column from the stop-word-free stems.
# re_blob lives in helper_data; judging by the displayed output it turns the
# list in the named column into one space-separated string.
# `args` forwards the column name to re_blob as its second positional argument.
data_out['stem_blob'] = data_out.apply(myhelp.re_blob, axis=1, args=('stem_no_sw',))
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob,stem_blob
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist...","[muslim, arrest, copycat, terrorist, weapon, a...","[muslim, arrest, copycat, terrorist, weapon, a...",muslim weapons copycat arrested terrorist assault,muslim arrest copycat terrorist weapon assault
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent...","[activ, camera, gon, protest, caught, chicago,...","[camera, gon, caught, admits, chicago, violent...",camera gon caught admits chicago violent activ...,activ camera gon protest caught chicago violen...
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]","[germani, fill, big, shoe, schaeubl, fdp]","[schaeuble, fill, big, shoe, germany, fdp]",schaeuble fill big shoes germany fdp,germani fill big shoe schaeubl fdp
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ...","[send, packet, wear, flag, back, welcom, warn,...","[packet, wear, flag, sends, back, welcome, war...",wearing kids packet flag sends warning back we...,send packet wear flag back welcom warn school kid
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]","[seek, amid, rohingya, aid, boost, emerg]","[seek, amid, rohingya, aid, boost, emergency]",amid aid rohingya boost seeks emergency,seek amid rohingya aid boost emerg


In [29]:
# Test data: build the 'lemma_blob' text column from the stop-word-free lemmas.
# re_blob lives in helper_data; judging by the displayed output it turns the
# list in the named column into one space-separated string.
# `args` forwards the column name to re_blob as its second positional argument.
data_out['lemma_blob'] = data_out.apply(myhelp.re_blob, axis=1, args=('lemma_no_sw',))
data_out.head()

Unnamed: 0,label,article,token,stem,lemma,token_no_sw,stem_no_sw,lemma_no_sw,token_blob,stem_blob,lemma_blob
0,2,copycat muslim terrorist arrested with assault...,"[copycat, muslim, terrorist, arrested, with, a...","[copycat, muslim, terrorist, arrest, with, ass...","[copycat, muslim, terrorist, arrest, with, ass...","[muslim, weapons, copycat, arrested, terrorist...","[muslim, arrest, copycat, terrorist, weapon, a...","[muslim, arrest, copycat, terrorist, weapon, a...",muslim weapons copycat arrested terrorist assault,muslim arrest copycat terrorist weapon assault,muslim arrest copycat terrorist weapon assault
1,2,wow! chicago protester caught on camera admits...,"[wow, chicago, protester, caught, on, camera, ...","[wow, chicago, protest, caught, on, camera, ad...","[wow, chicago, protester, caught, on, camera, ...","[camera, gon, caught, admits, chicago, violent...","[activ, camera, gon, protest, caught, chicago,...","[camera, gon, caught, admits, chicago, violent...",camera gon caught admits chicago violent activ...,activ camera gon protest caught chicago violen...,camera gon caught admits chicago violent activ...
2,2,germany's fdp look to fill schaeuble's big shoes,"[germany, fdp, look, to, fill, schaeuble, big,...","[germani, fdp, look, to, fill, schaeubl, big, ...","[germany, fdp, look, to, fill, schaeuble, big,...","[schaeuble, fill, big, shoes, germany, fdp]","[germani, fill, big, shoe, schaeubl, fdp]","[schaeuble, fill, big, shoe, germany, fdp]",schaeuble fill big shoes germany fdp,germani fill big shoe schaeubl fdp,schaeuble fill big shoe germany fdp
3,2,mi school sends welcome back packet warning ki...,"[mi, school, sends, welcome, back, packet, war...","[mi, school, send, welcom, back, packet, warn,...","[mi, school, sends, welcome, back, packet, war...","[wearing, kids, packet, flag, sends, warning, ...","[send, packet, wear, flag, back, welcom, warn,...","[packet, wear, flag, sends, back, welcome, war...",wearing kids packet flag sends warning back we...,send packet wear flag back welcom warn school kid,packet wear flag sends back welcome warn schoo...
4,2,u.n. seeks 'massive' aid boost amid rohingya '...,"[seeks, aid, boost, amid, rohingya, within, an...","[seek, aid, boost, amid, rohingya, within, an,...","[seek, aid, boost, amid, rohingya, within, an,...","[amid, aid, rohingya, boost, seeks, emergency]","[seek, amid, rohingya, aid, boost, emerg]","[seek, amid, rohingya, aid, boost, emergency]",amid aid rohingya boost seeks emergency,seek amid rohingya aid boost emerg,seek amid rohingya aid boost emergency


## Split the training dataset into train and validation sets

In [30]:
from sklearn.model_selection import train_test_split

# Tokenization variant: hold out 10% of the labelled rows for validation.
# The fixed random_state makes the split reproducible across runs.
(X_train_text_token, X_val_text_token,
 y_train_token, y_val_token) = train_test_split(
    data['token_blob'], data['label'], test_size=0.1, random_state=42
)

In [31]:
# Stemming variant: hold out 10% of the labelled rows for validation.
# The fixed random_state makes the split reproducible across runs.
(X_train_text_stem, X_val_text_stem,
 y_train_stem, y_val_stem) = train_test_split(
    data['stem_blob'], data['label'], test_size=0.1, random_state=42
)

In [32]:
# Lemmatization variant: hold out 10% of the labelled rows for validation.
# The fixed random_state makes the split reproducible across runs.
(X_train_text_lemma, X_val_text_lemma,
 y_train_lemma, y_val_lemma) = train_test_split(
    data['lemma_blob'], data['label'], test_size=0.1, random_state=42
)

In [33]:
# No preprocessing: split the untouched article text, holding out 10% of the
# labelled rows for validation. The fixed random_state makes it reproducible.
(X_train_text_raw, X_val_text_raw,
 y_train_raw, y_val_raw) = train_test_split(
    data['article'], data['label'], test_size=0.1, random_state=42
)

## Build bag-of-words features

In [34]:
# Tokenization variant: bag-of-words counts, vocabulary capped at 1000 terms.
from sklearn.feature_extraction.text import CountVectorizer

token_bow_vect = CountVectorizer(max_features=1000)

# The vocabulary is learned from the training split only; the validation
# split is encoded with that same vocabulary. Both become dense arrays.
X_train_token = token_bow_vect.fit_transform(X_train_text_token).toarray()
X_val_token = token_bow_vect.transform(X_val_text_token).toarray()


In [35]:
# Stemming variant: bag-of-words counts, vocabulary capped at 1000 terms.
stem_bow_vect = CountVectorizer(max_features=1000)

# The vocabulary is learned from the training split only; the validation
# split is encoded with that same vocabulary. Both become dense arrays.
X_train_stem = stem_bow_vect.fit_transform(X_train_text_stem).toarray()
X_val_stem = stem_bow_vect.transform(X_val_text_stem).toarray()

In [36]:
# Lemmatization variant: bag-of-words counts, vocabulary capped at 1000 terms.
lemma_bow_vect = CountVectorizer(max_features=1000)

# The vocabulary is learned from the training split only; the validation
# split is encoded with that same vocabulary. Both become dense arrays.
X_train_lemma = lemma_bow_vect.fit_transform(X_train_text_lemma).toarray()
X_val_lemma = lemma_bow_vect.transform(X_val_text_lemma).toarray()

In [37]:
# No preprocessing: bag-of-words counts over the raw article text,
# vocabulary capped at 1000 terms.
raw_bow_vect = CountVectorizer(max_features=1000)

# The vocabulary is learned from the training split only; the validation
# split is encoded with that same vocabulary. Both become dense arrays.
X_train_raw = raw_bow_vect.fit_transform(X_train_text_raw).toarray()
X_val_raw = raw_bow_vect.transform(X_val_text_raw).toarray()

## Train an SVM model

### No preprocessing

In [39]:
# No preprocessing: linear SVM on the raw-text bag-of-words features.
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): max_iter=200 is set here and for the lemmatization model,
# while the tokenization and stemming models use the library default.
# Confirm the difference is intended before comparing accuracies.
# Build and train in one step; fit() returns the estimator itself.
svm_raw = LinearSVC(max_iter=200, random_state=100).fit(X_train_raw, y_train_raw)

# Predict on the held-out validation split.
y_val_raw_pred = svm_raw.predict(X_val_raw)

# Report overall accuracy, then per-class precision / recall / F1.
accuracy_raw = accuracy_score(y_val_raw, y_val_raw_pred)
print("No preprocessing -> Accuracy:", accuracy_raw)
print(classification_report(y_val_raw, y_val_raw_pred))

No preprocessing -> Accuracy: 0.9276932084309133
              precision    recall  f1-score   support

           0       0.95      0.91      0.93      1733
           1       0.91      0.95      0.93      1683

    accuracy                           0.93      3416
   macro avg       0.93      0.93      0.93      3416
weighted avg       0.93      0.93      0.93      3416



### Tokenization

In [40]:
# Tokenization: linear SVM on the tokenized bag-of-words features.

# Build and train in one step; fit() returns the estimator itself.
svm_token = LinearSVC(random_state=100).fit(X_train_token, y_train_token)

# Predict on the held-out validation split.
y_val_token_pred = svm_token.predict(X_val_token)

# Report overall accuracy, then per-class precision / recall / F1.
accuracy_token = accuracy_score(y_val_token, y_val_token_pred)
print("Tokenization -> Accuracy:", accuracy_token)
print(classification_report(y_val_token, y_val_token_pred))

Tokenization -> Accuracy: 0.8984192037470726
              precision    recall  f1-score   support

           0       0.91      0.89      0.90      1733
           1       0.89      0.90      0.90      1683

    accuracy                           0.90      3416
   macro avg       0.90      0.90      0.90      3416
weighted avg       0.90      0.90      0.90      3416



In [None]:
# Mount Google Drive only when running on Colab. The google.colab package
# does not exist elsewhere, and an unguarded import would abort "Run All"
# before the remaining cells (stemming, lemmatization, prediction) execute.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab: nothing to mount

if drive is not None:
    drive.mount('/content/drive')

### Stemming

In [41]:
# Stemming: linear SVM on the stemmed bag-of-words features.

# Build and train in one step; fit() returns the estimator itself.
svm_stem = LinearSVC(random_state=100).fit(X_train_stem, y_train_stem)

# Predict on the held-out validation split.
y_val_stem_pred = svm_stem.predict(X_val_stem)

# Report overall accuracy, then per-class precision / recall / F1.
accuracy_stem = accuracy_score(y_val_stem, y_val_stem_pred)
print("Stemming -> Accuracy:", accuracy_stem)
print(classification_report(y_val_stem, y_val_stem_pred))

Stemming -> Accuracy: 0.9124707259953162
              precision    recall  f1-score   support

           0       0.92      0.91      0.91      1733
           1       0.91      0.92      0.91      1683

    accuracy                           0.91      3416
   macro avg       0.91      0.91      0.91      3416
weighted avg       0.91      0.91      0.91      3416



### Lemmatization

In [42]:
# Lemmatization: linear SVM on the lemmatized bag-of-words features.

# NOTE(review): max_iter=200 is set here and for the no-preprocessing model,
# while the tokenization and stemming models use the library default.
# Confirm the difference is intended before comparing accuracies.
# Build and train in one step; fit() returns the estimator itself.
svm_lemma = LinearSVC(max_iter=200, random_state=100).fit(X_train_lemma, y_train_lemma)

# Predict on the held-out validation split.
y_val_lemma_pred = svm_lemma.predict(X_val_lemma)

# Report overall accuracy, then per-class precision / recall / F1.
accuracy_lemma = accuracy_score(y_val_lemma, y_val_lemma_pred)
print("Lemmatization -> Accuracy:", accuracy_lemma)
print(classification_report(y_val_lemma, y_val_lemma_pred))

Lemmatization -> Accuracy: 0.9033957845433255
              precision    recall  f1-score   support

           0       0.91      0.90      0.90      1733
           1       0.90      0.91      0.90      1683

    accuracy                           0.90      3416
   macro avg       0.90      0.90      0.90      3416
weighted avg       0.90      0.90      0.90      3416



## Predict labels for the test dataset

In [43]:
# Lemmatization variant: encode the unlabeled test articles with the
# vectorizer already fitted on the training split (transform only, no refit).
# NOTE(review): the no-preprocessing model scored higher on validation
# (about 0.928 vs 0.903 here); confirm the lemma model is the intended choice.
X_test_lemma = lemma_bow_vect.transform(data_out['lemma_blob']).toarray()

# Predict a class for every test article.
y_test_lemma_pred = svm_lemma.predict(X_test_lemma)

# Overwrite the existing 'label' column with the predictions.
data_out['label'] = y_test_lemma_pred



## Save the predictions to an output file

In [44]:
# Write the predicted label and the original article text as a tab-separated
# file with no header row and no index column.
data_out[['label', 'article']].to_csv(
    'testing_data_lowercase_labels.csv',
    sep='\t',
    index=False,
    header=False,
)


## Save the models to output files

In [None]:
from pathlib import Path
import joblib

# Fitted classifiers, one per preprocessing variant.
best_pipes = [
    svm_raw,
    svm_token,
    svm_stem,
    svm_lemma
    ]

# Output file for each classifier, in the same order as best_pipes.
model_paths = [
    "bow_raw_svm.joblib",
    "bow_token_svm.joblib",
    "bow_stem_svm.joblib",
    "bow_lemma_svm.joblib"
    ]

# Each classifier was trained on the columns produced by its own fitted
# CountVectorizer, so a reloaded model can only be applied to new text if
# that vectorizer is persisted too. Same order as best_pipes.
bow_vectorizers = [
    raw_bow_vect,
    token_bow_vect,
    stem_bow_vect,
    lemma_bow_vect
    ]

vectorizer_paths = [
    "bow_raw_vectorizer.joblib",
    "bow_token_vectorizer.joblib",
    "bow_stem_vectorizer.joblib",
    "bow_lemma_vectorizer.joblib"
    ]

# zip() silently truncates to the shortest list; fail loudly instead if an
# entry is added to one list but not the others.
assert len(best_pipes) == len(model_paths) == len(bow_vectorizers) == len(vectorizer_paths)

# Save the classifiers (file names and contents unchanged).
for model_path, best_pipe in zip(model_paths, best_pipes):
    joblib.dump(best_pipe, model_path)

# Save the matching vectorizers alongside them.
for vectorizer_path, bow_vectorizer in zip(vectorizer_paths, bow_vectorizers):
    joblib.dump(bow_vectorizer, vectorizer_path)

print("Models saved.")