# Part 1: Data Processing

Importing data

In [38]:
import pandas as pd

raw_data_fake_news = pd.read_csv("C:\\Users\\Maher\\Documents\\Data_Copy.csv", low_memory=False)
#raw_data_fake_news = pd.read_csv("C:\\Users\\Maher\\Documents\\995,000_rows.csv", low_memory=False)

data_fake_news = raw_data_fake_news.copy()[ #keeping only the relevant collumns
    ['domain', 'type', 'content', 'title', 'authors', 'meta_description', 'meta_keywords']].head(1000)

data_fake_news.to_csv("C:\\Users\\Maher\\Documents\\Data_Copy.csv", index=False) #save copy

### Cleaning

In [39]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from tabulate import tabulate

#compile regular expressions
num_pattern = re.compile(r"(\d+)")
date_pattern = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b')
email_pattern = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-z|A-Z]{2,}\b')
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

def clean_text(text: str):
    """cleans raw data"""
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(date_pattern, "<DATE>", text)
    text = re.sub(email_pattern, "<EMAIL>", text)
    text = re.sub(url_pattern, "<URL>", text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(num_pattern, "<NUM>", text)
    return text

#compile stopwords and initialize stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def stemming_no_stopwords_data(text: str):
    """returns stemmed data where stopwords are removed from string"""
    return ' '.join([stemmer.stem(word) for word in text.split() if word not in stop_words])

In [40]:
#cleaning raw text using clean_text function on all elements
data_fake_news['content'] = data_fake_news['content'].apply(clean_text)

In [41]:
#stemming and removing stopwords, while calculating the reduction in vocabulary
data_fake_news['content'] = data_fake_news['content'].apply(stemming_no_stopwords_data)

# Part 2: Simple Model

In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

one_hot_encoder = OneHotEncoder(handle_unknown='ignore') #initialize encoder
def str_encoder(X: str):
    """Transform X string-data using OneHotEncoder"""
    return one_hot_encoder.fit_transform(X.values.reshape(-1, 1))

def logistic_model(X, y):
    """Create a logistic model and return accuracy"""
    X_encoded = str_encoder(X)
    X_train, X_val, y_train, y_val = train_test_split(X_encoded, y, train_size=0.9, stratify=y, random_state=0)
    model_logistic = LogisticRegression()
    model_logistic.fit(X_train, y_train)
    return accuracy_score(y_val, model_logistic.predict(X_val))

def df_type_binary(df):
    """Return the dataframe where the types grouped into reliable or fake"""
    reliable_type = {"reliable", "political", "clickbait"}
    df["type"] = df["type"].apply(lambda x: x in reliable_type)
    return df[~df["type"].isin(['fake', 'satire', 'bias', 'conspiracy', 'junksci'])]

def naive_bayes_model(X, y):
    """Create a classification matrix model and return accuracy"""
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, stratify=y, random_state=0)
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', MultinomialNB()),
        ])
    text_clf.fit(X_train, y_train)
    return accuracy_score(y_val, text_clf.predict(X_val))

def sdc_model(X, y):
    """Create a classification matrix model and return accuracy"""
    X_train, X_val, y_train, y_val = train_test_split(X, y, train_size=0.9, stratify=y, random_state=0)
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer(use_idf=False)),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=0,
                              max_iter=5, tol=None)),
        ])
    text_clf.fit(X_train, y_train)
    return accuracy_score(y_val, text_clf.predict(X_val))

In [105]:
#prepareing y data
binary_fake_news = df_type_binary(data_fake_news.copy())
y = binary_fake_news['type']

#random guessing from porportion of reliable to fake data
fake_count, reliable_count = binary_fake_news['type'].value_counts()
random_acc = fake_count / (fake_count + reliable_count)

#creating models for different types of X data
print(tabulate([
    ["random", "none", random_acc],
    ["logistic", "domain", logistic_model(binary_fake_news['domain'], y)], 
    ["logistic", "content", logistic_model(binary_fake_news['content'], y)],
    ["logistic", "title", logistic_model(binary_fake_news['title'], y)],
    ["logistic", "meta description", logistic_model(binary_fake_news['meta_description'], y)],
    ["logistic", "meta keywords", logistic_model(binary_fake_news['meta_keywords'], y)],
    ["pipeline", "content", naive_bayes_model(binary_fake_news['content'], y)],
    ["pipeline improved", "content", sdc_model(binary_fake_news['content'], y)],
    ], headers=["model", "X data", "Accuracy"], tablefmt='orgtbl'))

| model             | X data           |   Accuracy |
|-------------------+------------------+------------|
| random            | none             |       0.58 |
| logistic          | domain           |       0.92 |
| logistic          | content          |       0.63 |
| logistic          | title            |       0.62 |
| logistic          | meta description |       0.58 |
| logistic          | meta keywords    |       0.74 |
| pipeline          | content          |       0.67 |
| pipeline improved | content          |       0.73 |




describe what importan parameters to use in function


BBC with or without

# Part 3: Advanced Model

# Part 4: Evaluation

# Part 5: Conclusions