# Importing

In [3]:
import pandas as pd
import nltk
import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from sklearn.datasets import make_classification
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianacuppuleri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv("data/WELFake_Dataset.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [6]:
df['label'].value_counts()

1    37106
0    35028
Name: label, dtype: int64

In [7]:
df[df['title'].isna() & df['text'].isna()]

Unnamed: 0,title,text,label


In [8]:
df[df['title'].isna() | df['text'].isna()]

Unnamed: 0,title,text,label
1,,Did they post their votes for Hillary already?,1
43,,True. Hillary needs a distraction and what bet...,1
162,,All eyes on Electoral delegates. The People kn...,1
185,,Cool,1
269,,A leading US senator: US Supporting War in Syr...,1
...,...,...,...
71484,,Another Arab supremacist masturbation fantasy....,1
71521,,I'm sure they drastically changed accounting m...,1
71540,,It's easy to imagine Obama or Kerry pissing hi...,1
71570,,Ever since the powers to be assassinated JFK A...,1


In [9]:
df.dropna(axis=0, how='any', inplace=True)

In [10]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


# Text preprocessing
## With NLTK

### Cleaning text

In [11]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)           # sequences of white spaces
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\s+',' ', text)            # Replacing multiple Spaces with Single Space
    text = re.sub(r'\.{2,}', ' ', text)        # Replacing Two or more dots with one
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\W+',' ', text)            # Replace everything non-alpahnumeric with a space
    return text.strip()

In [12]:
df['text_clean'] = df['text'].map(clean)
df['title_clean'] = df['title'].map(clean)

In [13]:
df["empty_cell_text"] = df['text_clean'].str.contains(r'^\s*$', na=False)
df["empty_cell_title"] = df['title_clean'].str.contains(r'^\s*$', na=False)

In [14]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


### Removing empty cells

In [15]:
df.drop(df.loc[df["empty_cell_text" or "empty_cell_title"]].index, inplace=True)
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [16]:
df.reset_index(drop=True, inplace=True)
df.drop(columns=["empty_cell_text", "empty_cell_title"], inplace=True)

In [17]:
df

Unnamed: 0,title,text,label,text_clean,title_clean
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...
...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...


In [18]:
#df.to_csv("data/df_cleaned.csv")

### Language detection

In [19]:
def detect_lang(text):
    _, _, _, detected_language = cld2.detect(text, returnVectors=True)
    return str(detected_language)

In [20]:
df['text_lang'] = df['text_clean'].map(detect_lang)
df['title_lang'] = df['title_clean'].map(detect_lang)

In [21]:
df['text_lang'].astype(str)
df['title_lang'].astype(str)

0        ((0, 126, 'ENGLISH', 'en'),)
1        ((0, 132, 'ENGLISH', 'en'),)
2        ((0, 103, 'ENGLISH', 'en'),)
3         ((0, 90, 'ENGLISH', 'en'),)
4         ((0, 77, 'ENGLISH', 'en'),)
                     ...             
70776     ((0, 64, 'ENGLISH', 'en'),)
70777     ((0, 77, 'ENGLISH', 'en'),)
70778     ((0, 57, 'ENGLISH', 'en'),)
70779     ((0, 67, 'ENGLISH', 'en'),)
70780     ((0, 52, 'Unknown', 'un'),)
Name: title_lang, Length: 70781, dtype: object

In [22]:
df['text_lang'] = ~df["text_lang"].str.contains('ENGLISH|Unknown', regex=True)

In [23]:
df['title_lang'] = ~df["title_lang"].str.contains('ENGLISH|Unknown')

In [24]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,text_lang,title_lang
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [25]:
df.drop(df.loc[df["text_lang" or "title_lang"]].index, inplace=True)

In [26]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,text_lang,title_lang
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [27]:
df.reset_index(drop=True, inplace=True)

### Tokenizing

In [28]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
df['token_title'] = df.apply(lambda row: nltk.word_tokenize(row['title_clean']), axis=1)

In [29]:
df['token_text'] = df.apply(lambda row: nltk.word_tokenize(row['text_clean']), axis=1)

In [30]:
df.drop(columns=["text_lang", "title_lang"], inplace=True)

In [33]:
df.to_csv("data/df_token.csv")

In [None]:
#df = pd.read_csv("df_token.csv")
#df.drop(columns=["Unnamed: 0"], inplace=True)
#df

In [34]:
df.token_title[0]

['law',
 'enforcement',
 'on',
 'high',
 'alert',
 'following',
 'threats',
 'against',
 'cops',
 'and',
 'whites',
 'on',
 '9',
 '11by',
 'blacklivesmatter',
 'and',
 'fyf911',
 'terrorists',
 'video']

### POS tagging

In [35]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [37]:
df['tag_title'] = df.apply(lambda row: nltk.pos_tag(row['token_title']), axis=1)

In [38]:
df['tag_text'] = df.apply(lambda row: nltk.pos_tag(row['token_text']), axis=1)

KeyboardInterrupt: 

In [39]:
df.tag_title[0]

[('law', 'NN'),
 ('enforcement', 'NN'),
 ('on', 'IN'),
 ('high', 'JJ'),
 ('alert', 'NN'),
 ('following', 'VBG'),
 ('threats', 'NNS'),
 ('against', 'IN'),
 ('cops', 'NNS'),
 ('and', 'CC'),
 ('whites', 'NNS'),
 ('on', 'IN'),
 ('9', 'CD'),
 ('11by', 'CD'),
 ('blacklivesmatter', 'NN'),
 ('and', 'CC'),
 ('fyf911', 'NN'),
 ('terrorists', 'NNS'),
 ('video', 'VBP')]

### Lemmatizing tagged words

In [40]:
lem = WordNetLemmatizer()

In [41]:
def lemmatize(words):
    lemmatized_words = [lem.lemmatize(word) for word in words]
    return lemmatized_words

In [42]:
df['lem_title'] = df.apply(lambda row: lemmatize(row['token_title']), axis=1)

In [43]:
df['lem_text'] = df.apply(lambda row: lemmatize(row['token_text']), axis=1)

In [44]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,token_title,token_text,tag_title,lem_title,lem_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,"[law, enforcement, on, high, alert, following,...","[no, comment, is, expected, from, barack, obam...","[(law, NN), (enforcement, NN), (on, IN), (high...","[law, enforcement, on, high, alert, following,...","[no, comment, is, expected, from, barack, obam..."
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,"[unbelievable, obama, s, attorney, general, sa...","[now, most, of, the, demonstrators, gathered, ...","[(unbelievable, JJ), (obama, NN), (s, NN), (at...","[unbelievable, obama, s, attorney, general, sa...","[now, most, of, the, demonstrator, gathered, l..."
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,"[bobby, jindal, raised, hindu, uses, story, of...","[a, dozen, politically, active, pastors, came,...","[(bobby, NN), (jindal, NN), (raised, VBD), (hi...","[bobby, jindal, raised, hindu, us, story, of, ...","[a, dozen, politically, active, pastor, came, ..."
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,"[satan, 2, russia, unvelis, an, image, of, its...","[the, rs, 28, sarmat, missile, dubbed, satan, ...","[(satan, JJ), (2, CD), (russia, NN), (unvelis,...","[satan, 2, russia, unvelis, an, image, of, it,...","[the, r, 28, sarmat, missile, dubbed, satan, 2..."
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,"[about, time, christian, group, sues, amazon, ...","[all, we, can, say, on, this, one, is, it, s, ...","[(about, IN), (time, NN), (christian, JJ), (gr...","[about, time, christian, group, sue, amazon, a...","[all, we, can, say, on, this, one, is, it, s, ..."
...,...,...,...,...,...,...,...,...,...,...
70680,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,"[russians, steal, research, on, trump, in, hac...","[washington, reuters, hackers, believed, to, b...","[(russians, NNS), (steal, VBP), (research, NN)...","[russian, steal, research, on, trump, in, hack...","[washington, reuters, hacker, believed, to, be..."
70681,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,"[watch, giuliani, demands, that, democrats, ap...","[you, know, because, in, fantasyland, republic...","[(watch, NN), (giuliani, NN), (demands, VBZ), ...","[watch, giuliani, demand, that, democrat, apol...","[you, know, because, in, fantasyland, republic..."
70682,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,"[migrants, refuse, to, leave, train, at, refug...","[migrants, refuse, to, leave, train, at, refug...","[(migrants, NNS), (refuse, VBP), (to, TO), (le...","[migrant, refuse, to, leave, train, at, refuge...","[migrant, refuse, to, leave, train, at, refuge..."
70683,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,"[trump, tussle, gives, unpopular, mexican, lea...","[mexico, city, reuters, donald, trump, s, comb...","[(trump, NN), (tussle, NN), (gives, VBZ), (unp...","[trump, tussle, give, unpopular, mexican, lead...","[mexico, city, reuters, donald, trump, s, comb..."


In [45]:
df.to_csv("data/df_lemmatized.csv")

for word, tag in enumerate(df['tag_title']):
         wntag = tag[0][0][0].lower()
         wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
         lemma = lem.lemmatize(word, wntag) if wntag else word
         print (lemma)

### Stopwords

In [46]:
stop_words = list(stopwords.words('english')) 
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [47]:
for i in range(len(stop_words)):
    stop_words[i] = re.sub(r"\s*'\s*\w*","",stop_words[i])

In [48]:
df["stop_title"] = df["lem_title"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [49]:
df["stop_text"] = df["lem_text"].apply(lambda x: ' '.join([word for word in x if word not in (stop_words)]))

In [50]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,token_title,token_text,tag_title,lem_title,lem_text,stop_title,stop_text
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,"[law, enforcement, on, high, alert, following,...","[no, comment, is, expected, from, barack, obam...","[(law, NN), (enforcement, NN), (on, IN), (high...","[law, enforcement, on, high, alert, following,...","[no, comment, is, expected, from, barack, obam...",law enforcement high alert following threat co...,comment expected barack obama member fyf911 fu...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,"[unbelievable, obama, s, attorney, general, sa...","[now, most, of, the, demonstrators, gathered, ...","[(unbelievable, JJ), (obama, NN), (s, NN), (at...","[unbelievable, obama, s, attorney, general, sa...","[now, most, of, the, demonstrator, gathered, l...",unbelievable obama attorney general say charlo...,demonstrator gathered last night exercising co...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,"[bobby, jindal, raised, hindu, uses, story, of...","[a, dozen, politically, active, pastors, came,...","[(bobby, NN), (jindal, NN), (raised, VBD), (hi...","[bobby, jindal, raised, hindu, us, story, of, ...","[a, dozen, politically, active, pastor, came, ...",bobby jindal raised hindu us story christian c...,dozen politically active pastor came private d...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,"[satan, 2, russia, unvelis, an, image, of, its...","[the, rs, 28, sarmat, missile, dubbed, satan, ...","[(satan, JJ), (2, CD), (russia, NN), (unvelis,...","[satan, 2, russia, unvelis, an, image, of, it,...","[the, r, 28, sarmat, missile, dubbed, satan, 2...",satan 2 russia unvelis image terrifying new su...,r 28 sarmat missile dubbed satan 2 replace 18 ...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,"[about, time, christian, group, sues, amazon, ...","[all, we, can, say, on, this, one, is, it, s, ...","[(about, IN), (time, NN), (christian, JJ), (gr...","[about, time, christian, group, sue, amazon, a...","[all, we, can, say, on, this, one, is, it, s, ...",time christian group sue amazon splc designati...,say one time someone sued southern poverty law...
...,...,...,...,...,...,...,...,...,...,...,...,...
70680,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,"[russians, steal, research, on, trump, in, hac...","[washington, reuters, hackers, believed, to, b...","[(russians, NNS), (steal, VBP), (research, NN)...","[russian, steal, research, on, trump, in, hack...","[washington, reuters, hacker, believed, to, be...",russian steal research trump hack u democratic...,washington reuters hacker believed working rus...
70681,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,"[watch, giuliani, demands, that, democrats, ap...","[you, know, because, in, fantasyland, republic...","[(watch, NN), (giuliani, NN), (demands, VBZ), ...","[watch, giuliani, demand, that, democrat, apol...","[you, know, because, in, fantasyland, republic...",watch giuliani demand democrat apologize trump...,know fantasyland republican never questioned c...
70682,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,"[migrants, refuse, to, leave, train, at, refug...","[migrants, refuse, to, leave, train, at, refug...","[(migrants, NNS), (refuse, VBP), (to, TO), (le...","[migrant, refuse, to, leave, train, at, refuge...","[migrant, refuse, to, leave, train, at, refuge...",migrant refuse leave train refugee camp hungary,migrant refuse leave train refugee camp hungar...
70683,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,"[trump, tussle, gives, unpopular, mexican, lea...","[mexico, city, reuters, donald, trump, s, comb...","[(trump, NN), (tussle, NN), (gives, VBZ), (unp...","[trump, tussle, give, unpopular, mexican, lead...","[mexico, city, reuters, donald, trump, s, comb...",trump tussle give unpopular mexican leader muc...,mexico city reuters donald trump combative sty...


In [51]:
df.to_csv("data/df_stopwords.csv")

### BOW with countvec

In [52]:
def create_vectorizer(sentences):
    vectorizer = CountVectorizer(max_features=100)
    X = vectorizer.fit_transform(sentences)
    return (vectorizer, X)

In [53]:
(vectorizer, X) = create_vectorizer(df.stop_title)

In [54]:
print(vectorizer.get_feature_names())

['2016', 'america', 'american', 'anti', 'attack', 'back', 'ban', 'bill', 'black', 'border', 'breaking', 'breitbart', 'call', 'campaign', 'chief', 'china', 'clinton', 'cnn', 'comment', 'congress', 'could', 'court', 'day', 'deal', 'democrat', 'donald', 'election', 'email', 'eu', 'fbi', 'first', 'former', 'get', 'give', 'go', 'gop', 'government', 'gun', 'ha', 'hillary', 'house', 'iran', 'korea', 'law', 'leader', 'life', 'like', 'make', 'man', 'may', 'medium', 'million', 'muslim', 'new', 'news', 'north', 'obama', 'official', 'one', 'party', 'people', 'plan', 'police', 'president', 'putin', 'report', 'republican', 'right', 'russia', 'russian', 'sander', 'say', 'senate', 'senator', 'show', 'speech', 'state', 'supporter', 'syria', 'take', 'talk', 'tax', 'tell', 'time', 'top', 'trump', 'tweet', 'video', 'vote', 'voter', 'wa', 'want', 'war', 'watch', 'white', 'win', 'woman', 'world', 'year', 'york']


In [55]:
print(X)

  (0, 43)	1
  (0, 94)	1
  (0, 87)	1
  (1, 87)	1
  (1, 56)	1
  (1, 71)	1
  (1, 76)	1
  (1, 55)	1
  (2, 0)	1
  (3, 68)	1
  (3, 53)	1
  (3, 97)	1
  (3, 79)	1
  (4, 83)	1
  (6, 87)	1
  (6, 6)	1
  (6, 74)	1
  (6, 2)	1
  (6, 46)	1
  (6, 1)	1
  (8, 35)	1
  (8, 73)	1
  (8, 67)	1
  (9, 49)	1
  (9, 28)	2
  :	:
  (70676, 53)	1
  (70676, 83)	1
  (70676, 85)	1
  (70676, 99)	1
  (70677, 43)	1
  (70677, 37)	1
  (70678, 74)	1
  (70678, 16)	2
  (70678, 27)	1
  (70679, 87)	1
  (70679, 83)	1
  (70679, 75)	1
  (70679, 5)	1
  (70680, 85)	1
  (70680, 59)	1
  (70680, 69)	1
  (70681, 85)	1
  (70681, 93)	1
  (70681, 24)	1
  (70683, 85)	1
  (70683, 33)	1
  (70683, 44)	1
  (70684, 16)	1
  (70684, 63)	1
  (70684, 39)	1


In [56]:
denseX = X.todense()

In [57]:
denseX[0]

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]])