# Importing

In [2]:
import pandas as pd
import nltk
import spacy
from nltk import ngrams
from nltk.tokenize import word_tokenize 
from nltk.stem import SnowballStemmer
from sklearn.datasets import make_classification
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
import re
import pycld2 as cld2
from langdetect import detect
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianacuppuleri/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the WELFake dataset and discard the "Unnamed: 0" column
# (presumably a row index saved with the CSV -- confirm against the file).
df = pd.read_csv("WELFake_Dataset.csv").drop(columns=["Unnamed: 0"])
df

In [None]:
# Print a summary of the frame: columns, dtypes and non-null counts.
df.info()

In [None]:
# Class balance: number of rows for each distinct value of `label`.
df['label'].value_counts()

In [None]:
# Rows where BOTH the title and the text are missing.
df[df[['title', 'text']].isna().all(axis=1)]

In [None]:
# Rows where the title, the text, or both are missing.
df[df[['title', 'text']].isna().any(axis=1)]

In [None]:
# Remove, in place, every row that has at least one missing value
# (dropna defaults: axis=0, how='any').
df.dropna(inplace=True)

In [None]:
# Renumber the rows in place; drop=True discards the old index instead of
# keeping it as a column.
df.reset_index(drop=True, inplace=True)
df

# Text preprocessing
## With NLTK

### Cleaning text

In [None]:
def clean(text):
    """Normalize raw text to lowercase ASCII words separated by single spaces.

    The value is converted with ``str()`` (so ``None``/NaN become ``'none'``/
    ``'nan'``), lower-cased, and every run of characters outside
    ``[a-z0-9_]`` -- whitespace, punctuation, dots and all non-ASCII
    characters -- is collapsed into one space. Leading and trailing spaces
    are stripped.

    Args:
        text: Any object; it is stringified before cleaning.

    Returns:
        The cleaned string (possibly empty).
    """
    lowered = str(text).lower()
    # A single ASCII-mode \W+ pass gives the same result as the former chain of
    # substitutions (whitespace, non-ASCII twice, repeated dots, \W+): under
    # re.ASCII, \W also matches non-ASCII characters, and each earlier pass
    # only produced spaces that this final pass collapsed anyway.
    return re.sub(r'\W+', ' ', lowered, flags=re.ASCII).strip()

In [None]:
# Add a cleaned copy of each raw column: text -> text_clean, title -> title_clean.
for column in ('text', 'title'):
    df[f'{column}_clean'] = df[column].map(clean)

In [None]:
# Flag rows whose cleaned value is empty or whitespace-only.
# na=False makes missing values count as "not empty".
for column in ("text", "title"):
    df[f"empty_cell_{column}"] = df[f"{column}_clean"].str.contains(r'^\s*$', na=False)

In [None]:
# Display the frame to inspect the cleaned columns and the empty-cell flags.
df

### Removing empty cells

In [None]:
# Drop every row whose cleaned text OR cleaned title is empty.
# The two boolean flag columns must be combined element-wise with `|`.
# The previous `df["empty_cell_text" or "empty_cell_title"]` evaluated the
# Python expression `"a" or "b"` first, which is just "empty_cell_text",
# so rows with an empty title were never removed.
is_empty_row = df["empty_cell_text"] | df["empty_cell_title"]
df.drop(df[is_empty_row].index, inplace=True)
df

In [None]:
# Renumber the remaining rows in place, then remove the two helper flag
# columns, which are no longer needed once the empty rows are gone.
df.reset_index(drop=True, inplace=True)
df.drop(columns=["empty_cell_text", "empty_cell_title"], inplace=True)

In [None]:
# Display the frame after the empty rows and helper columns were removed.
df

In [None]:
#df.to_csv("df_cleaned.csv")

### Language detection

In [None]:
def detect_lang(text):
    """Run CLD2 language detection on ``text`` and return the result as a string.

    ``cld2.detect`` is called with ``returnVectors=True`` and only the fourth
    element of its result is kept. Per the pycld2 documentation that element
    is the per-chunk "vectors" tuple (offset, length, language name, language
    code) rather than the overall summary -- TODO confirm against the
    installed pycld2 version.

    Args:
        text: The string to analyse.

    Returns:
        ``str()`` of the fourth value returned by ``cld2.detect``.
    """
    _is_reliable, _bytes_found, _details, vectors = cld2.detect(
        text, returnVectors=True
    )
    return str(vectors)

In [None]:
# Store the stringified detection result for each cleaned column:
# text_clean -> text_lang, title_clean -> title_lang.
for column in ('text', 'title'):
    df[f'{column}_lang'] = df[f'{column}_clean'].map(detect_lang)

In [None]:
# Make sure both language columns hold strings before the .str.contains
# checks below. astype() returns a new Series and does not modify the frame,
# so the result has to be assigned back (previously it was discarded).
df['text_lang'] = df['text_lang'].astype(str)
df['title_lang'] = df['title_lang'].astype(str)

In [None]:
# Overwrite text_lang with a boolean flag: True when the detection string
# contains neither 'ENGLISH' nor 'Unknown' (the match is negated with ~).
# NOTE(review): not re-runnable -- after this cell the column is boolean and
# .str no longer applies.
df['text_lang'] = ~df["text_lang"].str.contains('ENGLISH|Unknown', regex=True)

In [None]:
# Overwrite title_lang with a boolean flag: True when the detection string
# contains neither 'ENGLISH' nor 'Unknown' (the match is negated with ~).
# NOTE(review): not re-runnable -- after this cell the column is boolean and
# .str no longer applies.
df['title_lang'] = ~df["title_lang"].str.contains('ENGLISH|Unknown')

In [None]:
# Display the frame to inspect the boolean language flags.
df

In [None]:
# Drop every row flagged as non-English in its text OR its title.
# text_lang / title_lang are boolean flags at this point and must be combined
# element-wise with `|`. The previous `df["text_lang" or "title_lang"]`
# evaluated `"a" or "b"` to "text_lang" alone, so the title flag was ignored.
is_foreign_row = df["text_lang"] | df["title_lang"]
df.drop(df[is_foreign_row].index, inplace=True)

In [None]:
# Display the frame after the language filter.
df

In [None]:
# Renumber the rows in place after the language filter removed some of them.
df.reset_index(drop=True, inplace=True)

### Tokenizing

In [None]:
# Loads the English Punkt model. NOTE(review): `tokenizer` is not used in this
# cell; it is kept only in case another cell relies on it.
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
# Tokenize each cleaned title into a list of word tokens. Mapping over the one
# column that is needed avoids building a full row object per record, which
# the previous row-wise df.apply(..., axis=1) did.
df['token_title'] = df['title_clean'].map(nltk.word_tokenize)

In [None]:
# Tokenize each cleaned article body into a list of word tokens. Mapping over
# the single column replaces the slower row-wise df.apply(..., axis=1).
df['token_text'] = df['text_clean'].map(nltk.word_tokenize)

In [None]:
# Display the frame to inspect the token columns.
df

In [None]:
# Remove the two language flag columns in place now that filtering is done.
df.drop(columns=["text_lang", "title_lang"], inplace=True)

In [None]:
# Persist the tokenized frame to disk.
# NOTE(review): the row index is written as well (no index=False), which is
# why the reload cell drops an "Unnamed: 0" column. List-valued columns are
# written as their string representation.
df.to_csv("df_token.csv")

In [None]:
import ast

# Reload the tokenized frame. CSV stores the token lists as text such as
# "['a', 'b']"; without a converter they come back as plain strings, so they
# are parsed back into real Python lists with ast.literal_eval.
df = pd.read_csv(
    "df_token.csv",
    converters={"token_title": ast.literal_eval, "token_text": ast.literal_eval},
)
# "Unnamed: 0" is the row index that to_csv wrote alongside the data.
df.drop(columns=["Unnamed: 0"], inplace=True)
df

### Lemmatizing

In [None]:
from nltk import pos_tag, word_tokenize,pos_tag_sents

In [None]:
# Shared WordNet lemmatizer instance used by the lemmatization cells.
lem = WordNetLemmatizer()

In [None]:
# Demo: POS-aware lemmatization of one sample sentence.
# (The pasted interpreter prompts `>>>` / `...` were removed; they made this
# cell a SyntaxError.)
sent = 'These sentences involves some horsing around'

# First letter of the Penn Treebank tag -> WordNet POS code. Adjective tags
# are JJ/JJR/JJS, so 'J' must map to 'a'; lower-casing the first letter and
# testing for 'a' never matched adjectives.
penn_to_wordnet = {'J': 'a', 'R': 'r', 'N': 'n', 'V': 'v'}

for word, tag in pos_tag(word_tokenize(sent)):
    wntag = penn_to_wordnet.get(tag[0])
    # Words whose tag has no WordNet equivalent are left unchanged.
    lemma = lem.lemmatize(word, wntag) if wntag else word
    print(lemma)

In [None]:
# POS-tag every cleaned title.
# str() guards against non-string cells (e.g. NaN after the CSV round trip)
# so that word_tokenize always receives a string.
titles = [str(value) for value in df['title_clean'].tolist()]

# One list of (token, tag) pairs per title.
tagged_texts = pos_tag_sents(map(word_tokenize, titles))
df["POS_title"] = tagged_texts

In [None]:
# POS-tag every cleaned article body.
# str() guards against non-string cells (e.g. NaN after the CSV round trip)
# so that word_tokenize always receives a string.
texts = [str(value) for value in df['text_clean'].tolist()]

# One list of (token, tag) pairs per article body.
tagged_texts = pos_tag_sents(map(word_tokenize, texts))
df["POS_text"] = tagged_texts

In [None]:
# Display the frame to inspect the POS-tag columns.
df

### Stopwords

### Frequency