# Importing

In [1]:
import pandas as pd
import nltk
import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.datasets import make_classification
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("WELFake_Dataset.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [4]:
df['label'].value_counts()

1    37106
0    35028
Name: label, dtype: int64

In [5]:
df[df['title'].isna() & df['text'].isna()]

Unnamed: 0,title,text,label


In [6]:
df[df['title'].isna() | df['text'].isna()]

Unnamed: 0,title,text,label
1,,Did they post their votes for Hillary already?,1
43,,True. Hillary needs a distraction and what bet...,1
162,,All eyes on Electoral delegates. The People kn...,1
185,,Cool,1
269,,A leading US senator: US Supporting War in Syr...,1
...,...,...,...
71484,,Another Arab supremacist masturbation fantasy....,1
71521,,I'm sure they drastically changed accounting m...,1
71540,,It's easy to imagine Obama or Kerry pissing hi...,1
71570,,Ever since the powers to be assassinated JFK A...,1


In [7]:
df.dropna(axis=0, how='any', inplace=True)

In [8]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


# Text preprocessing
## With NLTK

### Cleaning text

In [9]:
def clean(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)           # sequences of white spaces
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\s+',' ', text)            # Replacing multiple Spaces with Single Space
    text = re.sub(r'\.{2,}', ' ', text)        # Replacing Two or more dots with one
    text = re.sub(r'[^\x00-\x7F]+',' ', text)  # Removing all the non ASCII characters
    text = re.sub(r'\W+',' ', text)            # Replace everything non-alpahnumeric with a space
    return text.strip()

In [10]:
df['text_clean'] = df['text'].map(clean)
df['title_clean'] = df['title'].map(clean)

In [11]:
df["empty_cell_text"] = df['text_clean'].str.contains(r'^\s*$', na=False)
df["empty_cell_title"] = df['title_clean'].str.contains(r'^\s*$', na=False)

In [12]:
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
5,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1,dr ben carson tells the story of what happened...,dr ben carson targeted by the irs i never had ...,False,False
6,HOUSE INTEL CHAIR On Trump-Russia Fake Story: ...,,1,,house intel chair on trump russia fake story n...,True,False
7,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1,the owner of the ringling bar located south of...,sports bar owner bans nfl games will show only...,False,False
8,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1,file in this sept 15 2005 file photo the marke...,latest pipeline leak underscores dangers of da...,False,False
9,GOP Senator Just Smacked Down The Most Puncha...,The most punchable Alt-Right Nazi on the inter...,1,the most punchable alt right nazi on the inter...,gop senator just smacked down the most punchab...,False,False


### Removing empty cells

In [18]:
df.drop(df.loc[df["empty_cell_text" or "empty_cell_title"]].index, inplace=True)
df

Unnamed: 0,title,text,label,text_clean,title_clean,empty_cell_text,empty_cell_title
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...,False,False
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...,False,False
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...,False,False
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...,False,False
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...,False,False
...,...,...,...,...,...,...,...
71532,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...,False,False
71533,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...,False,False
71534,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,False,False
71535,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...,False,False


In [25]:
df.reset_index(drop=True, inplace=True)
df.drop(columns=["empty_cell_text", "empty_cell_title"], inplace=True)

KeyError: "['empty_cell_text' 'empty_cell_title'] not found in axis"

In [24]:
df

Unnamed: 0,title,text,label,text_clean,title_clean
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1,no comment is expected from barack obama membe...,law enforcement on high alert following threat...
1,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1,now most of the demonstrators gathered last ni...,unbelievable obama s attorney general says mos...
2,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0,a dozen politically active pastors came here f...,bobby jindal raised hindu uses story of christ...
3,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1,the rs 28 sarmat missile dubbed satan 2 will r...,satan 2 russia unvelis an image of its terrify...
4,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1,all we can say on this one is it s about time ...,about time christian group sues amazon and spl...
...,...,...,...,...,...
70776,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0,washington reuters hackers believed to be work...,russians steal research on trump in hack of u ...
70777,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1,you know because in fantasyland republicans ne...,watch giuliani demands that democrats apologiz...
70778,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...
70779,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0,mexico city reuters donald trump s combative s...,trump tussle gives unpopular mexican leader mu...


In [26]:
df.to_csv("df_cleaned.csv")

### Language detection

In [27]:
def detect_lang(text):
    _, _, _, detected_language = cld2.detect(text, returnVectors=True)
    return str(detected_language)

In [28]:
df['text_lang'] = df['text_clean'].map(detect_lang)
df['title_lang'] = df['title_clean'].map(detect_lang)

In [38]:
.as_type("string")

AttributeError: 'DataFrame' object has no attribute 'as_type'

In [31]:
df.query("text_lang" == "ENGLISH",)

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (573983797.py, line 1)

### Tokenizing

In [None]:
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
df['token_title'] = df.apply(lambda row: nltk.word_tokenize(row['title_clean']), axis=1)

In [None]:
df['token_text'] = df.apply(lambda row: nltk.word_tokenize(row['text_clean']), axis=1)

In [None]:
df

In [None]:
#pd.df.to_csv("df_token.csv")

### Lemmatizing

In [None]:
nltk.download('omw-1.4')
lem = WordNetLemmatizer()