# Part 1: Data Processing

### Importing data

In [None]:
import pandas as pd

#set to True to re-run the data cleaning on the raw file and write a new CSV copy;
#False loads the previously saved copy instead
new_copy = False

if not new_copy:
    data_fake_news = pd.read_csv("C:\\Users\\Maher\\Documents\\Data_Copy.csv", low_memory=False)
else:
    raw_data_fake_news = pd.read_csv("C:\\Users\\Maher\\Documents\\995,000_rows.csv", low_memory=False)
    #work on a copy restricted to the columns the later cells read
    data_fake_news = raw_data_fake_news.copy()[[
        'domain', 'type', 'content', 'title',
        'authors', 'meta_description', 'meta_keywords',
    ]]


### Cleaning

In [None]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re

#compile regular expressions once; tokenize_text applies them to every document
num_pattern = re.compile(r"(\d+)")
#numeric dates such as 1/2/2020, 01.02.20 or 1-2-2020
date_pattern = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b')
#the top-level-domain class is letters only: inside [...] a "|" is a literal pipe, not alternation
email_pattern = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def tokenize_text(text: str):
    """Normalise a raw document into lower-case text with placeholder tags.

    The input is converted with str() first, so non-string values are accepted.
    Steps, in order: lower-case, collapse whitespace runs to one space, replace
    dates / e-mail addresses / URLs with <DATE> / <EMAIL> / <URL>, remove the
    remaining punctuation, and replace digit runs with <NUM>.

    Returns the cleaned string.
    """
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = date_pattern.sub("<DATE>", text)
    text = email_pattern.sub("<EMAIL>", text)
    text = url_pattern.sub("<URL>", text)
    #strip punctuation but keep the tags inserted above intact, so they stay
    #consistent with <NUM>; the text is already lower-case, so an upper-case
    #tag can only be one of ours and never part of the document itself
    text = re.sub(r'(<(?:DATE|EMAIL|URL)>)|[^\w\s]', lambda match: match.group(1) or '', text)
    text = num_pattern.sub("<NUM>", text)
    return text

#English stopword set and one stemmer instance, built once and reused for every document
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def stemming_no_stopwords(text: str):
    """Drop stopwords from a whitespace-separated string and stem what is left.

    The surviving words are stemmed with the module-level stemmer and joined
    back together with single spaces.
    """
    kept_stems = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return ' '.join(kept_stems)

def df_type_binary(df):
    """Return a copy of df whose "type" column is a boolean reliability label.

    "reliable", "political" and "clickbait" become True; every other value,
    including a missing one, becomes False. All rows are kept, and the
    DataFrame passed in is left unmodified.
    """
    reliable_type = {"reliable", "political", "clickbait"}
    labelled = df.copy()
    #isin is vectorised and reports a missing value as "not a member", i.e. False
    labelled["type"] = labelled["type"].isin(reliable_type)
    return labelled

In [None]:
#clean the raw article text with tokenize_text (only when rebuilding the copy)
if new_copy:
    data_fake_news['content'] = data_fake_news['content'].apply(tokenize_text)

In [None]:
#stem the cleaned text and remove stopwords (only when rebuilding the copy)
if new_copy:
    data_fake_news['content'] = (
        data_fake_news['content'].apply(stemming_no_stopwords)
    )

In [None]:
#write the cleaned data to disk so later runs can load it instead of cleaning again
if new_copy:
    data_fake_news.to_csv(
        "C:\\Users\\Maher\\Documents\\Data_Copy.csv",
        index=False,
    )

In [None]:
import json

#Importing BBC data
with open("C:\\Users\\Maher\\Documents\\GitHub\\Fake-News-Project\\BBC.json", 'r') as file:
    #for every article, take element [2] of the first value in its mapping
    #and keep it unless it is None
    BBC_data = [
        text
        for text in (next(iter(article.values()))[2] for article in json.load(file))
        if text is not None
    ]

# Part 2: Simple Model

### Preparing data for model creation

In [None]:
#remove rows with a missing type or content and create two type classes: False (fake) and True (reliable)
#dropna(subset=...) removes the incomplete rows themselves; assigning a dropna() result back
#onto the two columns would re-align on the index and leave those rows in place as NaN
complete_fake_news = data_fake_news.dropna(subset=['type', 'content']).copy()
complete_fake_news = df_type_binary(complete_fake_news)

#saving 10% of articles (excluding BBC) for X_test: the first 90% of the rows form the training sample
complete_len = len(complete_fake_news)
sample_len = round(complete_len * 0.9)
sample_fake_news = complete_fake_news.head(sample_len)
#shuffling data; the fixed seed keeps the reported scores reproducible between runs
sample_fake_news = sample_fake_news.sample(frac=1, random_state=0).reset_index(drop=True)

#BBC articles get the same cleaning as the article content and are all labelled True (reliable);
#ignore_index gives the combined series one clean 0..n-1 index instead of two overlapping ones
X_extend = pd.concat([pd.Series(BBC_data).apply(tokenize_text).apply(stemming_no_stopwords),
                      sample_fake_news['content']], ignore_index=True)
y_extend = pd.concat([pd.Series([True] * len(BBC_data)),
                      sample_fake_news['type']], ignore_index=True)

### Testing data on simple model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#shared encoder; handle_unknown='ignore' makes transform tolerate values not seen during fit
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

def str_encoder(X: pd.Series):
    """One-hot encode a Series, treating each whole value as one category.

    The shared module-level encoder is re-fitted on X on every call.

    Returns the encoded feature matrix produced by the encoder's fit_transform.
    """
    #reshape to a single-column 2-D array: one row per sample
    column = X.values.reshape(-1, 1)
    return one_hot_encoder.fit_transform(column)

def logistic_model(X, y):
    """Fit a logistic regression on one-hot encoded X and score it on a hold-out split.

    X is encoded with str_encoder and split 90/10 (stratified on y, fixed seed).
    Returns [f1, accuracy] measured on the 10% validation part.
    """
    features = str_encoder(X)
    train_X, val_X, train_y, val_y = train_test_split(
        features, y, train_size=0.9, stratify=y, random_state=0
    )
    classifier = LogisticRegression()
    classifier.fit(train_X, train_y)
    val_predictions = classifier.predict(val_X)
    f1 = f1_score(val_y, val_predictions)
    accuracy = accuracy_score(val_y, val_predictions)
    return [f1, accuracy]

In [None]:
from tabulate import tabulate

#fit one logistic model per input column, plus one on the BBC-extended content, and tabulate the scores
print(tabulate(
    [
        [label] + logistic_model(sample_fake_news[column], sample_fake_news['type'])
        for label, column in (
            ("domain", 'domain'),
            ("title", 'title'),
            ("author", 'authors'),
            ("meta description", 'meta_description'),
            ("meta keywords", 'meta_keywords'),
            ("content", 'content'),
        )
    ] + [["content BBC extended"] + logistic_model(X_extend, y_extend)],
    headers=["Train data", "f1", "Accuracy"],
    tablefmt='orgtbl',
))

# Part 3: Advanced Model

### Creating three advanced models

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import HashingVectorizer

def _tf_pipeline(vectorizer, classifier):
    """Build the vectorizer -> TfidfTransformer(use_idf=False) -> classifier pipeline shared by the advanced models."""
    return Pipeline([
        ('Vectorizer', vectorizer),
        ('Transformer', TfidfTransformer(use_idf=False)),
        ('Model', classifier),
        ])

def _holdout_scores(model, X, y):
    """Fit model on a 90% stratified split of (X, y) and return [f1, accuracy] on the remaining 10%."""
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, stratify=y, random_state=0)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    return [f1_score(y_val, predictions), accuracy_score(y_val, predictions)]

def naive_bayes_model(X, y):
    """Train a multinomial naive Bayes model on count-vectorized text.

    Returns [f1, accuracy] measured on a 10% stratified validation split.
    """
    model = _tf_pipeline(CountVectorizer(), MultinomialNB())
    return _holdout_scores(model, X, y)

def stochastic_gradient_descent_model(X, y):
    """Train an SGD classifier (hinge loss, l2 penalty) on count-vectorized text.

    Returns [f1, accuracy] measured on a 10% stratified validation split.
    """
    model = _tf_pipeline(CountVectorizer(),
                         SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-4, random_state=0))
    return _holdout_scores(model, X, y)

def stochastic_gradient_descent_hash_model(X, y):
    """Train an SGD classifier (hinge loss, l2 penalty) on hash-vectorized text.

    Returns [f1, accuracy] measured on a 10% stratified validation split.
    """
    model = _tf_pipeline(HashingVectorizer(),
                         SGDClassifier(loss='hinge', penalty='l2',
                                       alpha=1e-4, random_state=0))
    return _holdout_scores(model, X, y)

In [None]:
#score the three advanced models on the sampled article content and tabulate the results
print(tabulate(
    [
        [label] + evaluate(sample_fake_news['content'], sample_fake_news['type'])
        for label, evaluate in (
            ("Naive Bayes with count vectorizer", naive_bayes_model),
            ("SDC with count vectorizer", stochastic_gradient_descent_model),
            ("SDC with hashing vectorizer", stochastic_gradient_descent_hash_model),
        )
    ],
    headers=["Model", "f1", "Accuracy"],
    tablefmt='orgtbl',
))

# Part 4: Evaluation

### Train model

In [None]:
#simple model: logistic regression on one-hot encoded content, fitted on the whole training sample
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
simple_model = LogisticRegression()
simple_model.fit(
    one_hot_encoder.fit_transform(sample_fake_news['content'].values.reshape(-1, 1)),
    sample_fake_news['type'],
)

#advanced model: the count-vectorizer + SGD classifier pipeline (here with max_iter=5 and tol=None),
#fitted on the same training sample
advanced_model = Pipeline(steps=[
    ('Vectorizer', CountVectorizer()),
    ('Transformer', TfidfTransformer(use_idf=False)),
    ('Model', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-4,
        random_state=0,
        max_iter=5,
        tol=None,
    )),
])

advanced_model.fit(sample_fake_news['content'], sample_fake_news['type'])

### LIAR dataset 

In [None]:
#read the tab-separated file without a header row and keep only columns 1 and 2
liar_data = pd.read_csv(
    "C:\\Users\\Maher\\Documents\\GitHub\\Fake-News-Project\\test.tsv",
    sep='\t',
    header=None,
)[[1, 2]]

def binary_clean(text: str):
    """Map a label to True (reliable), False (fake) or None (neither group).

    "true" and "mostly-true" give True, "false" and "pants-fire" give False,
    and any other value gives None.
    """
    reliable_labels = ("true", "mostly-true")
    fake_labels = ("false", "pants-fire")
    if text in reliable_labels:
        return True
    if text in fake_labels:
        return False
    return None

#map the labels in column 1 to booleans and clean the text in column 2
liar_data[1] = liar_data[1].apply(binary_clean)
liar_data[2] = liar_data[2].apply(tokenize_text).apply(stemming_no_stopwords)
#labels that binary_clean mapped to None are dropped here, together with any missing text
liar_data.dropna(subset=[1, 2], inplace=True)

#creating liar test X and y data
X_liar, y_liar = liar_data[2], liar_data[1].tolist()

### Test advanced model

In [None]:
#Creating X and y from the held-out tail of the complete data (the rows not used for training)
X_test = complete_fake_news['content'].tail(complete_len - sample_len)
y_test = complete_fake_news["type"].tail(complete_len - sample_len)

#transform only: re-fitting the encoder here would rebuild its columns from the test data,
#and they would no longer line up with the features simple_model was trained on
simple_predictions_test = simple_model.predict(one_hot_encoder.transform(X_test.values.reshape(-1, 1)))
simple_predictions_liar = simple_model.predict(one_hot_encoder.transform(X_liar.values.reshape(-1, 1))).tolist()

advanced_predictions_test = advanced_model.predict(X_test)
advanced_predictions_liar = advanced_model.predict(X_liar).tolist()

#one row per (model, test set) pair
print(tabulate([
    ["Simple", "X_test", f1_score(y_test, simple_predictions_test), accuracy_score(y_test, simple_predictions_test)],
    ["Simple", "X_liar", f1_score(y_liar, simple_predictions_liar), accuracy_score(y_liar, simple_predictions_liar)],
    ["Advanced", "X_test", f1_score(y_test, advanced_predictions_test), accuracy_score(y_test, advanced_predictions_test)],
    ["Advanced", "X_liar", f1_score(y_liar, advanced_predictions_liar), accuracy_score(y_liar, advanced_predictions_liar)],
    ], headers=["model", "test data", "f1", "Accuracy"], tablefmt='orgtbl'))