In [1]:
import pandas as pd
import nltk
import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.datasets import make_classification
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
df = pd.read_csv("df_token.csv")
df.drop(columns=["Unnamed: 0", "token_text", "token_title"], inplace=True)
df

Unnamed: 0,title,text,label,text_clean,title_clean
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...
...,...,...,...,...,...
70680,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...
70681,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...
70682,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...
70683,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...


In [7]:
df["text_clean"] = df.text_clean.astype(str)
df["title_clean"] = df.title_clean.astype(str)

In [48]:
# Building a corpus - titles

#titles = []
corpus_titles = []
for title in df['title_clean']:
    #titles.append(title) # turning column rows into a list
    corpus_titles.append(title) 

In [49]:
corpus_titles

['law enforcement on high alert following threats against cops and whites on 9 11by blacklivesmatter and fyf911 terrorists video',
 'unbelievable obama s attorney general says most charlotte rioters were peaceful protesters in her home state of north carolina video',
 'bobby jindal raised hindu uses story of christian conversion to woo evangelicals for potential 2016 bid',
 'satan 2 russia unvelis an image of its terrifying new supernuke western world takes notice',
 'about time christian group sues amazon and splc for designation as hate group',
 'dr ben carson targeted by the irs i never had an audit until i spoke at the national prayer breakfast',
 'sports bar owner bans nfl games will show only true american sports i d like to speak for rural america video',
 'latest pipeline leak underscores dangers of dakota access pipeline',
 'gop senator just smacked down the most punchable alt right nazi on the internet',
 'may brexit offer would hurt cost eu citizens eu parliament',
 'schumer

In [17]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [42]:
len(corpus_titles)

70685

In [44]:
# Tokenizing + POS tagging the corpus - titles

def tag(x):
    text = word_tokenize(x)
    tags = nltk.pos_tag(text)
    return tags

In [53]:
tag(str(corpus_titles))

[('[', 'JJ'),
 ("'law", 'CD'),
 ('enforcement', 'NN'),
 ('on', 'IN'),
 ('high', 'JJ'),
 ('alert', 'NN'),
 ('following', 'VBG'),
 ('threats', 'NNS'),
 ('against', 'IN'),
 ('cops', 'NNS'),
 ('and', 'CC'),
 ('whites', 'NNS'),
 ('on', 'IN'),
 ('9', 'CD'),
 ('11by', 'CD'),
 ('blacklivesmatter', 'NN'),
 ('and', 'CC'),
 ('fyf911', 'NN'),
 ('terrorists', 'NNS'),
 ('video', 'VBP'),
 ("'", "''"),
 (',', ','),
 ("'unbelievable", 'JJ'),
 ('obama', 'NN'),
 ('s', 'NN'),
 ('attorney', 'NN'),
 ('general', 'JJ'),
 ('says', 'VBZ'),
 ('most', 'JJS'),
 ('charlotte', 'JJ'),
 ('rioters', 'NNS'),
 ('were', 'VBD'),
 ('peaceful', 'JJ'),
 ('protesters', 'NNS'),
 ('in', 'IN'),
 ('her', 'PRP$'),
 ('home', 'NN'),
 ('state', 'NN'),
 ('of', 'IN'),
 ('north', 'JJ'),
 ('carolina', 'JJ'),
 ('video', 'NN'),
 ("'", "''"),
 (',', ','),
 ("'bobby", "''"),
 ('jindal', 'NN'),
 ('raised', 'VBD'),
 ('hindu', 'JJ'),
 ('uses', 'NNS'),
 ('story', 'NN'),
 ('of', 'IN'),
 ('christian', 'JJ'),
 ('conversion', 'NN'),
 ('to', 'TO'),
 (

In [54]:
corpus_titles

['law enforcement on high alert following threats against cops and whites on 9 11by blacklivesmatter and fyf911 terrorists video',
 'unbelievable obama s attorney general says most charlotte rioters were peaceful protesters in her home state of north carolina video',
 'bobby jindal raised hindu uses story of christian conversion to woo evangelicals for potential 2016 bid',
 'satan 2 russia unvelis an image of its terrifying new supernuke western world takes notice',
 'about time christian group sues amazon and splc for designation as hate group',
 'dr ben carson targeted by the irs i never had an audit until i spoke at the national prayer breakfast',
 'sports bar owner bans nfl games will show only true american sports i d like to speak for rural america video',
 'latest pipeline leak underscores dangers of dakota access pipeline',
 'gop senator just smacked down the most punchable alt right nazi on the internet',
 'may brexit offer would hurt cost eu citizens eu parliament',
 'schumer