# Part 1: Data Processing

Importing data

In [38]:
import pandas as pd

# Load the working copy of the dataset; the full dump is kept below for reference.
raw_data_fake_news = pd.read_csv(
    "C:\\Users\\Maher\\Documents\\Data_Copy.csv",
    low_memory=False,
)
#raw_data_fake_news = pd.read_csv("C:\\Users\\Maher\\Documents\\995,000_rows.csv", low_memory=False)

# Keep only the relevant columns and the first 1000 rows.
relevant_columns = ['domain', 'type', 'content', 'title', 'authors', 'meta_description', 'meta_keywords']
data_fake_news = raw_data_fake_news.copy()[relevant_columns].head(1000)

# Save the reduced frame; note this writes to the same file that was read above.
data_fake_news.to_csv(
    "C:\\Users\\Maher\\Documents\\Data_Copy.csv",
    index=False,
)

### Cleaning

In [39]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from tabulate import tabulate

#compile regular expressions
num_pattern = re.compile(r"(\d+)")
date_pattern = re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b')
# The TLD part accepts letters only ('|' inside a character class is a literal pipe, not alternation).
email_pattern = re.compile(r'\b[a-zA-Z0-9._-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b')
# NOTE: '$-_' inside the class is a character *range*; it is what lets '/', ':', '?' and '='
# match, so the pattern is deliberately left unchanged.
url_pattern = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Matches either an already-inserted placeholder (captured in group 1) or one punctuation character.
punct_pattern = re.compile(r'(<(?:DATE|EMAIL|URL)>)|[^\w\s]')

def clean_text(text: str):
    """Normalise raw text and replace dates, e-mails, URLs and numbers with placeholder tokens.

    Steps, in order:
      1. convert to ``str`` and lowercase;
      2. collapse every whitespace run to a single space;
      3. replace dates, e-mail addresses and URLs with ``<DATE>``, ``<EMAIL>`` and ``<URL>``;
      4. remove punctuation, keeping the placeholders from step 3 intact;
      5. replace every remaining digit run with ``<NUM>``.

    Parameters:
        text: the raw value; anything that is not a string is passed through ``str()`` first.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)
    text = date_pattern.sub("<DATE>", text)
    text = email_pattern.sub("<EMAIL>", text)
    text = url_pattern.sub("<URL>", text)
    # The text is lowercase at this point, so an upper-case <DATE>/<EMAIL>/<URL> can only be a
    # placeholder inserted above. Keep those verbatim and drop every other punctuation character,
    # so all four placeholder kinds end up in the same <TOKEN> form.
    text = punct_pattern.sub(lambda match: match.group(1) or '', text)
    text = num_pattern.sub("<NUM>", text)
    return text

#Porter stemmer and English stopword set, built once and shared by the helper below
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stemming_no_stopwords_data(text: str):
    """Drop stopwords from a whitespace-tokenised string and stem the remaining tokens.

    The surviving tokens are stemmed with the shared ``stemmer`` and joined back
    together with single spaces.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return ' '.join(stemmer.stem(token) for token in kept_tokens)

In [40]:
#apply clean_text to every entry of the 'content' column
#NOTE: the column is overwritten, so re-running this cell cleans already-cleaned text
data_fake_news['content'] = data_fake_news['content'].apply(clean_text)

In [41]:
#remove stopwords from and stem every entry of the 'content' column (overwrites the column)
#NOTE: this cell only transforms the text; it does not compute any vocabulary-reduction figure
data_fake_news['content'] = data_fake_news['content'].apply(stemming_no_stopwords_data)

# Part 2: Simple Model

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def create_logistic_model(X, y, train_size=0.9, random_state=0):
    """Fit a logistic regression on a stratified split and return its validation accuracy.

    Parameters:
        X: feature matrix with one row per sample (this notebook passes one-hot encoded matrices).
        y: target labels, one per row of ``X``; also used to stratify the split.
        train_size: share of the samples used for training (default 0.9, as before).
        random_state: seed for the split, so repeated calls are reproducible (default 0, as before).

    Returns:
        Accuracy of the fitted model on the held-out validation part.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=train_size, stratify=y, random_state=random_state
    )
    model_logistic = LogisticRegression()
    model_logistic.fit(X_train, y_train)  # train on the training part only
    return accuracy_score(y_val, model_logistic.predict(X_val))  # score on the held-out part

def df_type_binary(df):
    """Return a copy of ``df`` whose ``type`` column is a boolean "reliable" label.

    ``type`` becomes ``True`` for the labels "reliable", "political" and "clickbait"
    and ``False`` for every other value.

    The input frame is left untouched; a new frame with the same rows is returned.

    NOTE(review): no rows are dropped. The earlier row filter on the fake-type labels
    compared against the already-boolean column, so it never removed anything. If rows
    with other labels should be excluded, filter them out before calling this function.
    """
    reliable_type = {"reliable", "political", "clickbait"}
    # assign() builds a new frame, so the caller's DataFrame is not mutated.
    return df.assign(type=df["type"].isin(reliable_type))

one_hot_encoder = OneHotEncoder(handle_unknown='ignore') #initialize encoder
def transform_str_encoder(X: "pd.Series"):
    """One-hot encode a single column of string data.

    Parameters:
        X: a pandas Series (one DataFrame column); its values are reshaped to a
           single-feature 2-D array before encoding.

    Returns:
        The encoded matrix produced by ``OneHotEncoder.fit_transform``.

    Note:
        Every call re-fits the shared module-level ``one_hot_encoder``, so after a call
        the encoder only holds the categories of the most recently encoded column.
    """
    return one_hot_encoder.fit_transform(X.values.reshape(-1, 1))


In [47]:
#preparing y data
binary_fake_news = df_type_binary(data_fake_news.copy())
y = binary_fake_news['type']


#preparing X data: each column is one-hot encoded on its own
X_domain = transform_str_encoder(binary_fake_news['domain'])
X_content = transform_str_encoder(binary_fake_news['content'])
X_title = transform_str_encoder(binary_fake_news['title'])
X_meta_desc = transform_str_encoder(binary_fake_news['meta_description'])
X_meta_key = transform_str_encoder(binary_fake_news['meta_keywords'])

#create pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('content', one_hot_encoder, ['content']),
        ('title', one_hot_encoder, ['title']),
        ('meta_desc', one_hot_encoder, ['meta_description']),
        ('meta_key', one_hot_encoder, ['meta_keywords'])
    ])

# Define pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])

#NOTE: fitted on all rows; this pipeline is not scored in this cell
pipeline.fit(binary_fake_news[['content', 'title', 'meta_description', 'meta_keywords']], y)


#baseline: accuracy of always predicting the more frequent class
#counts are taken from the boolean labels directly, because value_counts() orders its
#result by frequency and so cannot be unpacked into (fake, reliable) reliably
reliable_count = int(y.sum())
fake_count = int(len(y) - reliable_count)
random_acc = max(fake_count, reliable_count) / (fake_count + reliable_count)

#creating logistic models for different types of X data
print(tabulate([
    ["randomly quessing", random_acc],
    ["domain", create_logistic_model(X_domain, y)],
    ["content", create_logistic_model(X_content, y)],
    ["title", create_logistic_model(X_title, y)],
    ["meta description", create_logistic_model(X_meta_desc, y)],
    ["meta keywords", create_logistic_model(X_meta_key, y)]
    ], headers=["X data", "Accuracy"], tablefmt='orgtbl'))

| X data                     |   Accuracy |
|----------------------------+------------|
| randomly quessing          |       0.58 |
| domain                     |       0.92 |
| content                    |       0.63 |
| title                      |       0.62 |
| number of unique words     |       0.63 |
| percentage of unique words |       0.58 |
| meta description           |       0.58 |
| meta keywords              |       0.74 |


  random_acc = binary_fake_news['type'].value_counts()[0] / (fake_count + reliable_count)


TODO: describe which important parameters to use in the function


BBC with or without

# Part 3: Advanced Model

# Part 4: Evaluation

# Part 5: Conclusions