# Imports

In [None]:
# pip install -U nltk

In [None]:
# pip install -U textstat

In [None]:
# pip install langdetect

In [None]:
import re, string, gc
import numpy as np
import pandas as pd 

import nltk
from nltk import word_tokenize
from nltk.tokenize import SyllableTokenizer
from nltk.corpus import words
from nltk.corpus import stopwords
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import f_classif, SelectKBest
from scipy import sparse

from textstat import sentence_count
from langdetect import detect, detect_langs

from collections import Counter

# Fetch the NLTK resources this notebook relies on.
# Tokenizer models used by word_tokenize.
nltk.download('punkt')
# English word list backing the nltk.corpus.words import.
nltk.download('words')
# Stop-word lists read via stopwords.words('english').
nltk.download('stopwords')
# Tagger model used by pos_tag.
nltk.download('averaged_perceptron_tagger')
# Tag mapping required for pos_tag(..., tagset='universal').
nltk.download('universal_tagset')

In [None]:
# Base directory that the dataset paths below are built from.
# NOTE(review): relative "gdrive/MyDrive/..." path — presumably a mounted Google Drive; confirm.
root_dir = "gdrive/MyDrive/Uczelnia/Magisterka/Datasets"

# Helpers

In [None]:
def display_all(toPrint):
    """Display ``toPrint`` without pandas truncating rows or cell contents.

    The row limit and the column-width limit are lifted only for the
    duration of the call; both options are restored to their previous
    values afterwards, even if rendering raises.

    Args:
        toPrint: Object handed to the notebook's ``display`` function
            (typically a DataFrame or Series).
    """
    with pd.option_context(
        "display.max_colwidth", None,
        "display.max_rows", None,
    ):
        display(toPrint)

# Preprocessing

In [None]:
def remove_numbers(s):
    """Return ``s`` with every numeric token removed.

    A numeric token starts at a digit and extends over any directly
    following digits, dots, commas and hyphens (e.g. ``3,000.50`` or
    ``2016-17``). Punctuation glued to the end of a number (``2016.``) is
    therefore removed together with it. Surrounding whitespace is kept.

    Args:
        s: Text to clean.

    Returns:
        The text without numeric tokens.
    """
    return re.sub(r'[0-9][0-9.,-]*', '', s)

def remove_stop_words(example_sent):
    """Return ``example_sent`` with English stop words removed.

    The text is tokenized with ``word_tokenize`` and the surviving tokens
    are joined back with single spaces, so the original spacing is not
    preserved.

    Args:
        example_sent: Text to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))
    # The NLTK stop-word list is lowercase, so compare on the lowercased
    # token; otherwise capitalised stop words ("The", "And") slip through.
    kept_tokens = [
        token
        for token in word_tokenize(example_sent)
        if token.lower() not in stop_words
    ]
    return " ".join(kept_tokens)

def pos_parts(text_words):
    """Return the share of tokens carrying each tracked universal POS tag.

    Args:
        text_words: Sequence of word tokens.

    Returns:
        A list of ten fractions in the order ADJ, ADP, ADV, CONJ, DET,
        NOUN, NUM, PRON, PRT, VERB. Each fraction is the tag's token count
        divided by the number of tagged tokens; tokens with other tags
        count towards the denominator only. All zeros for empty input.
    """
    tracked_tags = ('ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB')

    # Nothing to tag: avoid dividing by a zero total.
    if len(text_words) == 0:
        return [0.0] * len(tracked_tags)

    # Count tags per token. Converting the (word, tag) pairs to a dict first
    # would keep only one entry per distinct word and skew the shares.
    tag_counts = Counter(tag for _, tag in pos_tag(text_words, tagset='universal'))
    total = sum(tag_counts.values())

    return [tag_counts[tag] / total for tag in tracked_tags]


def counts(text):
    """Compute surface-level and part-of-speech statistics for ``text``.

    Args:
        text: Raw document text.

    Returns:
        A list of 17 numbers: sentence count, word count, syllable count,
        punctuation-character count, uppercase-character count,
        lowercase-character count, the fraction of words that are English
        stop words, followed by the ten POS fractions from ``pos_parts``.
        For a text without any words the stop-word fraction, the syllable
        count and all POS fractions are 0.
    """
    number_of_punctuation = sum(1 for c in text if c in string.punctuation)
    number_of_capitals = sum(1 for c in text if c.isupper())
    number_of_lowers = sum(1 for c in text if c.islower())

    # Words are taken from the punctuation-free text; sentences from the raw text.
    text_words = word_tokenize(remove_punctuation(text))
    number_of_sentences = sentence_count(text) * 1.0
    number_of_words = len(text_words) * 1.0

    if text_words:
        # Load the stop-word list once per text instead of once per token.
        # NOTE(review): matching is case-sensitive, so "The" is not counted.
        stop_words = set(stopwords.words('english'))
        number_of_stopwords = sum(1 for w in text_words if w in stop_words)
        stopword_share = number_of_stopwords / number_of_words

        poss = pos_parts(text_words)

        syllable_tokenizer = SyllableTokenizer()
        number_of_syllables = sum(
            len(syllable_tokenizer.tokenize(word)) for word in text_words
        ) * 1.0
    else:
        # No words: skip the ratios, which would otherwise divide by zero.
        stopword_share = 0.0
        poss = [0.0] * 10  # one slot per POS fraction returned by pos_parts
        number_of_syllables = 0.0

    values = [
        number_of_sentences,
        number_of_words,
        number_of_syllables,
        number_of_punctuation,
        number_of_capitals,
        number_of_lowers,
        stopword_share,
    ]
    values.extend(poss)
    return values

def remove_punctuation(s):
    """Return ``s`` with every character that is neither a word character
    nor whitespace removed.

    Underscores count as word characters (regex ``\\w``) and are kept.

    Args:
        s: Text to clean.

    Returns:
        The text without punctuation and other symbol characters.
    """
    # Raw string: "\w" and "\s" are invalid escapes in a normal literal.
    return re.sub(r'[^\w\s]', "", s)

def average_syllables_per_word(number_of_syllable, number_of_words):
    """Return the syllable count divided by the word count.

    Works on plain numbers as well as on objects supporting element-wise
    division such as pandas Series.
    """
    ratio = number_of_syllable / number_of_words
    return ratio

def flesch_kincaid_grade(number_of_sentences, number_of_words, number_of_syllables):
    """Return the Flesch-Kincaid grade level.

    Computed as ``0.39 * words/sentences + 11.8 * syllables/words - 15.59``.
    Accepts plain numbers or objects supporting element-wise arithmetic.
    """
    words_per_sentence = number_of_words / number_of_sentences
    syllables_per_word = number_of_syllables / number_of_words
    return 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59
    
def preproc(s):
    """Replace each numeric token in ``s`` with a placeholder and lowercase the result.

    A numeric token is a digit followed by any run of digits, dots, commas
    and hyphens. The placeholder is inserted with a space on either side
    and ends up lowercased along with the rest of the text.
    """
    with_placeholders = re.sub(r'[0-9][0-9.,-]*', ' NUMBERSPECIALTOKEN ', s)
    return with_placeholders.lower()

In [None]:
class DataSet:
    """Texts, their labels and a table of per-text features.

    Attributes are plain public fields:
      * ``texts``  - the texts as passed in.
      * ``labels`` - the labels as passed in.
      * ``counts`` - DataFrame with one row per text holding the values
        returned by ``counts()`` (computed on the text with numbers
        removed) plus the derived columns 'avg syllables per word' and
        'F-K grade'.
    """

    # Column names for the values returned by counts(), in the same order.
    _COUNT_COLUMNS = [
        '# sentences', '# words', '# syllables', "# punctuation",
        "# capitals", "# lowers", "% stopwords",
        '% ADJ', '% ADP', '% ADV', '% CONJ', '% DET',
        '% NOUN', '% NUM', '% PRON', '% PRT', '% VERB',
    ]

    def __init__(self, texts, labels):
        """Compute the feature table for ``texts``.

        Args:
            texts: Sequence of raw text strings.
            labels: Sequence of labels aligned with ``texts``.
        """
        self.texts = texts
        self.labels = labels

        rows = [counts(remove_numbers(text)) for text in texts]
        # reshape keeps the column count correct even when there are no texts.
        d = np.array(rows, dtype=float).reshape(-1, len(self._COUNT_COLUMNS))

        self.counts = pd.DataFrame(d, columns=self._COUNT_COLUMNS)
        self.counts['avg syllables per word'] = average_syllables_per_word(self.counts['# syllables'], self.counts['# words'])
        self.counts['F-K grade'] = flesch_kincaid_grade(self.counts['# sentences'], self.counts['# words'], self.counts['# syllables'])

    def save_preprocessed(self, file_name):
        """Write texts, labels and all feature columns to ``file_name`` as CSV.

        The file has the columns 'text' and 'label' followed by every
        column of ``self.counts``; no index column is written.
        """
        initial = pd.DataFrame({"text": self.texts, "label": self.labels})
        pd.concat([initial, self.counts], axis=1).to_csv(file_name, index=False)

## ISOT Fake News Dataset

In [None]:
def remove_header(text):
    """Strip a leading Reuters dateline such as ``"WASHINGTON (Reuters) -"``.

    The dateline is recognised when the text has more than two
    whitespace-separated tokens and the second one is ``"(Reuters)"``. In
    that case everything up to and including the first hyphen on the first
    line is removed. Any other text is returned unchanged.

    Args:
        text: Article body.

    Returns:
        The text without the dateline, or the original text when no
        dateline is recognised.
    """
    # Only the first two tokens matter, so stop splitting after them.
    leading_tokens = text.split(maxsplit=2)
    if len(leading_tokens) > 2 and leading_tokens[1] == "(Reuters)":
        return re.sub(r'^.*?-', '', text)
    # Without this the function returned None and wiped headerless texts.
    return text

In [None]:
# Load both halves of the ISOT dataset and tag every row with its class.
fakes = pd.read_csv(root_dir + "/Dataset/Fake.csv", index_col=False)
fakes['label'] = "fake"

truths = pd.read_csv(root_dir + "/Dataset/True.csv", index_col=False)
truths['label'] = "true"

# Drop rows without body text and renumber, so each frame has a gap-free
# index (the previous reset_index call discarded its result).
fakes = fakes.dropna(subset=['text']).reset_index(drop=True)
truths = truths.dropna(subset=['text']).reset_index(drop=True)

# Apply remove_header to every article in the "true" half.
truths['text'] = truths['text'].map(remove_header)

# ignore_index gives the combined frame a fresh 0..n-1 index.
isot = pd.concat((fakes, truths), ignore_index=True)

In [None]:
# Collect the index labels of all rows whose text is not detected as English.
# NOTE(review): langdetect results can vary between runs unless
# DetectorFactory.seed is set — consider seeding for reproducibility.
non_english = []
for row_label, text in isot['text'].items():
    try:
        is_english = detect(text) == 'en'
    except Exception:
        # Texts langdetect cannot analyse (e.g. empty or non-string values)
        # are treated as non-English. Catching Exception rather than using a
        # bare except keeps Ctrl-C / kernel interrupts working.
        is_english = False
    if not is_english:
        non_english.append(row_label)

# Labels come from the frame itself, so this is correct for any index.
isot.drop(index=non_english, inplace=True)

In [None]:
# Compute the features for the filtered ISOT articles and save them as CSV.
isot_output_path = root_dir + "/isot_dataset_preprocessed.csv"
isot_dataset = DataSet(texts=isot['text'].values, labels=isot['label'].values)
isot_dataset.save_preprocessed(isot_output_path)

## Kaggle Dataset

In [None]:
# Read the Kaggle dataset once, with every column cast to str. (An earlier
# read restricted to three columns was immediately overwritten and has been
# dropped.)
kaggles = pd.read_csv(root_dir + "/kaggle_dataset/kaggle_news_dataset.csv").astype('U')

# Collect the index labels of all rows whose content is not detected as English.
non_english = []
for row_label, text in kaggles['content'].items():
    try:
        is_english = detect(text) == 'en'
    except Exception:
        # Content langdetect cannot analyse is treated as non-English.
        # Catching Exception rather than using a bare except keeps
        # Ctrl-C / kernel interrupts working.
        is_english = False
    if not is_english:
        non_english.append(row_label)

kaggles.drop(index=non_english, inplace=True)
# NOTE(review): after astype('U') missing values are presumably the string
# "nan", in which case this dropna removes nothing — confirm.
kaggles.dropna(inplace=True)

# Compute the features for the remaining articles and save them as CSV.
kaggle_dataset = DataSet(kaggles['content'].values, kaggles['label'].values)
kaggle_dataset.save_preprocessed(root_dir + "/kaggle_dataset_preprocessed.csv")