# Importing

In [17]:
import pandas as pd
import nltk
import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.datasets import make_classification
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odelia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("WELFake_Dataset.csv")
df.drop(columns=["Unnamed: 0"], inplace=True)
df

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,,Did they post their votes for Hillary already?,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...
72129,Russians steal research on Trump in hack of U....,WASHINGTON (Reuters) - Hackers believed to be ...,0
72130,WATCH: Giuliani Demands That Democrats Apolog...,"You know, because in fantasyland Republicans n...",1
72131,Migrants Refuse To Leave Train At Refugee Camp...,Migrants Refuse To Leave Train At Refugee Camp...,0
72132,Trump tussle gives unpopular Mexican leader mu...,MEXICO CITY (Reuters) - Donald Trump’s combati...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72134 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   71576 non-null  object
 1   text    72095 non-null  object
 2   label   72134 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [4]:
df['label'].value_counts()

1    37106
0    35028
Name: label, dtype: int64

In [5]:
df[df['title'].isna() & df['text'].isna()]

Unnamed: 0,title,text,label


# Text preprocessing
## With NLTK

In [16]:
# cleaning text

df=df.applymap(lambda x: str(x).lower()) #lowercase
df = df.applymap(lambda x: str(x).replace(r'[^\x00-\x7F]+',' ')) # Removing all the non ASCII characters
df = df.applymap(lambda x: str(x).replace(r'\s+',' ')) # Replacing multiple Spaces with Single Space
df = df.applymap(lambda x: str(x).replace(r'\.{2,}', ' ')) # Replacing Two or more dots with one

### add removing non alphanumeric

df.head(50)

Unnamed: 0,title,text,label
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1
1,,did they post their votes for hillary already?,1
2,unbelievable! obama’s attorney general says mo...,"now, most of the demonstrators gathered last ...",1
3,"bobby jindal, raised hindu, uses story of chri...",a dozen politically active pastors came here f...,0
4,satan 2: russia unvelis an image of its terrif...,"the rs-28 sarmat missile, dubbed satan 2, will...",1
...,...,...,...
72129,russians steal research on trump in hack of u....,washington (reuters) - hackers believed to be ...,0
72130,watch: giuliani demands that democrats apolog...,"you know, because in fantasyland republicans n...",1
72131,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,0
72132,trump tussle gives unpopular mexican leader mu...,mexico city (reuters) - donald trump’s combati...,0


def clean_text 
        
    for ti in df["title"]
    for te in df["text"]

In [None]:
# Creating a corpus

In [34]:
# Tokenizing
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
#df['tokenized_title'] = 
df['token_title'] = df.apply(lambda row: nltk.word_tokenize(row['title']), axis=1)
df

Unnamed: 0,title,text,label,token_title
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1,"[law, enforcement, on, high, alert, following,..."
1,,did they post their votes for hillary already?,1,[nan]
2,unbelievable! obama’s attorney general says mo...,"now, most of the demonstrators gathered last ...",1,"[unbelievable, !, obama, ’, s, attorney, gener..."
3,"bobby jindal, raised hindu, uses story of chri...",a dozen politically active pastors came here f...,0,"[bobby, jindal, ,, raised, hindu, ,, uses, sto..."
4,satan 2: russia unvelis an image of its terrif...,"the rs-28 sarmat missile, dubbed satan 2, will...",1,"[satan, 2, :, russia, unvelis, an, image, of, ..."
...,...,...,...,...
72129,russians steal research on trump in hack of u....,washington (reuters) - hackers believed to be ...,0,"[russians, steal, research, on, trump, in, hac..."
72130,watch: giuliani demands that democrats apolog...,"you know, because in fantasyland republicans n...",1,"[watch, :, giuliani, demands, that, democrats,..."
72131,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,0,"[migrants, refuse, to, leave, train, at, refug..."
72132,trump tussle gives unpopular mexican leader mu...,mexico city (reuters) - donald trump’s combati...,0,"[trump, tussle, gives, unpopular, mexican, lea..."


In [37]:
df['token_text'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

KeyboardInterrupt: 

In [38]:
df

Unnamed: 0,title,text,label,token_title,token_text
0,law enforcement on high alert following threat...,no comment is expected from barack obama membe...,1,"[law, enforcement, on, high, alert, following,...","[no, comment, is, expected, from, barack, obam..."
1,,did they post their votes for hillary already?,1,[nan],"[did, they, post, their, votes, for, hillary, ..."
2,unbelievable! obama’s attorney general says mo...,"now, most of the demonstrators gathered last ...",1,"[unbelievable, !, obama, ’, s, attorney, gener...","[now, ,, most, of, the, demonstrators, gathere..."
3,"bobby jindal, raised hindu, uses story of chri...",a dozen politically active pastors came here f...,0,"[bobby, jindal, ,, raised, hindu, ,, uses, sto...","[a, dozen, politically, active, pastors, came,..."
4,satan 2: russia unvelis an image of its terrif...,"the rs-28 sarmat missile, dubbed satan 2, will...",1,"[satan, 2, :, russia, unvelis, an, image, of, ...","[the, rs-28, sarmat, missile, ,, dubbed, satan..."
...,...,...,...,...,...
72129,russians steal research on trump in hack of u....,washington (reuters) - hackers believed to be ...,0,"[russians, steal, research, on, trump, in, hac...","[washington, (, reuters, ), -, hackers, believ..."
72130,watch: giuliani demands that democrats apolog...,"you know, because in fantasyland republicans n...",1,"[watch, :, giuliani, demands, that, democrats,...","[you, know, ,, because, in, fantasyland, repub..."
72131,migrants refuse to leave train at refugee camp...,migrants refuse to leave train at refugee camp...,0,"[migrants, refuse, to, leave, train, at, refug...","[migrants, refuse, to, leave, train, at, refug..."
72132,trump tussle gives unpopular mexican leader mu...,mexico city (reuters) - donald trump’s combati...,0,"[trump, tussle, gives, unpopular, mexican, lea...","[mexico, city, (, reuters, ), -, donald, trump..."


In [None]:
pd.df.to_csv("df_token.csv")