In [None]:
'''
Fake News Predictor App - Model Training Notebook

This Jupyter notebook loads, preprocesses, and combines multiple news datasets,
trains a logistic regression model to classify news as real or fake, evaluates
the model, and saves the trained model and vectorizer for use in the Streamlit app.
'''

# Import necessary libraries
import os
import pickle

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Download the NLTK resources used by the preprocessing step.
# 'stopwords' provides the word list read via stopwords.words('english').
# 'punkt' / 'punkt_tab' are presumably the tokenizer data word_tokenize relies on;
# which of the two is required depends on the installed NLTK version — TODO confirm.
# The last call is the cell's final expression, so its return value is shown as the cell output.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/driverap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/driverap/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/driverap/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def load_data(data_dir='data', max_text_length=20000):
    '''
    Load, label, combine, and clean the news datasets.

    Reads four CSV files from `data_dir` (Fake.csv, True.csv, news.csv and
    news_fakenewscorpus.csv), sets or normalizes their 'label' column to
    'fake' / 'real', drops columns that are not used, concatenates everything
    into one DataFrame, and filters out unusable rows.

    A row is removed when its 'title' or 'text' is missing, is empty or
    whitespace-only, or when its 'text' is longer than `max_text_length`
    characters.

    Parameters:
        data_dir (str): Directory containing the four CSV files. Defaults to 'data'.
        max_text_length (int): Maximum allowed length of 'text', in characters.
            Defaults to 20000.

    Returns:
        pd.DataFrame: The combined and cleaned DataFrame. The index is not reset
        after filtering, so surviving rows keep their labels from the concatenation.

    Raises:
        FileNotFoundError: If one of the CSV files does not exist (from pd.read_csv).
        KeyError: If a column that is dropped explicitly is absent from its file.
    '''

    # Fake.csv holds only fake articles: discard unused columns and tag every row
    fake_news_df = (
        pd.read_csv(os.path.join(data_dir, 'Fake.csv'))
        .drop(columns=['subject', 'date'])
        .assign(label='fake')
    )

    # True.csv holds only real articles: same treatment with the opposite label
    real_news_df = (
        pd.read_csv(os.path.join(data_dir, 'True.csv'))
        .drop(columns=['subject', 'date'])
        .assign(label='real')
    )

    # news.csv mixes both classes with upper-case labels and carries a saved index column
    more_news_df = pd.read_csv(os.path.join(data_dir, 'news.csv')).drop(columns=['Unnamed: 0'])
    more_news_df['label'] = more_news_df['label'].replace({'FAKE': 'fake', 'REAL': 'real'})

    # The large corpus is concatenated as-is.
    # NOTE(review): this assumes the file already provides 'title', 'text' and a
    # lower-case 'label' column — confirm against the file.
    large_news_df = pd.read_csv(os.path.join(data_dir, 'news_fakenewscorpus.csv'))

    # Combine all datasets into a single DataFrame
    combined_df = pd.concat(
        [fake_news_df, real_news_df, more_news_df, large_news_df],
        ignore_index=True,
    )

    # Drop rows with a missing title or text first. A missing value is not equal
    # to '', so the whitespace checks below would otherwise let it through and
    # later string processing would fail on a non-string value.
    combined_df = combined_df.dropna(subset=['title', 'text'])

    # Keep rows whose title and text are non-blank and whose text is within the length cap
    has_title = combined_df['title'].str.strip() != ''
    has_text = combined_df['text'].str.strip() != ''
    text_fits = combined_df['text'].str.len() <= max_text_length

    # Return the cleaned and combined dataset
    return combined_df[has_title & has_text & text_fits]

In [82]:
# Build the combined news DataFrame, shuffle its rows with a fixed seed,
# and renumber the index from zero so it matches the shuffled order
news_df = (
    load_data()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
news_df

Unnamed: 0,title,text,label
0,Armenian Genocide Archives,The Armenians call it their holocaust – the 19...,fake
1,2008 Hillary Described Herself as a Pro-Gun Ch...,“I disagree with Senator Obama’s assertion tha...,real
2,My Head Spins At All the Choices When It Comes...,There are so many choices when it comes to who...,real
3,Is America On The Brink Of Civil War?,Is America On The Brink Of Civil War? 11/07/20...,fake
4,Russia Quietly Tightens Reins on Web With ‘Blo...,"Under the pressure of a corruption scandal, Tu...",real
...,...,...,...
300368,Today's Economic Events,A day which is a market holiday for many parts...,fake
300369,Corrections,An article on Sept. 27 about Carl P. Paladino ...,real
300370,Fixing the millionaire's amendment,Adam B has the details of the Supreme Court's ...,real
300371,The Daily Caller,‘We will have people going in over time into a...,real


In [83]:
def preprocess_data(df):
    '''
    Preprocess the text data in the DataFrame.

    Lower-cases and tokenizes the 'title' and 'text' columns (whichever are present),
    keeps only alphanumeric tokens that are not English stopwords, and joins the
    remaining tokens back together with single spaces.

    The input DataFrame is not modified; the cleaned data is returned as a copy.
    Missing values become an empty string and other non-string values are
    converted with str() before cleaning, so a stray NaN or number in a column
    does not abort the whole run.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the news data to be preprocessed.

    Returns:
        pd.DataFrame: A new DataFrame with the same rows and columns, with 'title'
        and 'text' replaced by their cleaned versions.
    '''

    # Build the stopword set once; set membership keeps the per-token check cheap
    stop_words = set(stopwords.words('english'))

    # Function to clean an individual cell value
    def clean_text(text):
        if not isinstance(text, str):
            # A missing cell carries no tokens; anything else is cleaned via its string form
            if pd.isna(text):
                return ''
            text = str(text)
        tokens = word_tokenize(text.lower())
        # isalnum() discards punctuation tokens and any token containing a non-alphanumeric character
        return ' '.join(
            token for token in tokens
            if token.isalnum() and token not in stop_words
        )

    # Work on a copy so the caller's DataFrame keeps its original contents
    cleaned_df = df.copy()
    for column in ('title', 'text'):
        if column in cleaned_df.columns:
            cleaned_df[column] = cleaned_df[column].apply(clean_text)

    return cleaned_df

In [None]:
# Run the cleaning function over the shuffled dataset and keep the result under the same name
news_df = news_df.pipe(preprocess_data)
news_df

Unnamed: 0,title,text,label
0,armenian genocide archives,armenians call holocaust 1915 forced deportati...,fake
1,2008 hillary described churchgoer,disagree senator obama assertion people countr...,real
2,head spins choices comes despicable republican...,many choices comes despicable republican senat...,real
3,america brink civil war,america brink civil war pj media valerie obama...,fake
4,russia quietly tightens reins web bloggers law,pressure corruption scandal turkey recently im...,real
...,...,...,...
300368,today economic events,day market holiday many parts around world see...,fake
300369,corrections,article 27 carl paladino hometown buffalo desc...,real
300370,fixing millionaire amendment,adam b details supreme court decision overturn...,real
300371,daily caller,people going time different doors different bu...,real


In [53]:
# Cleaning can leave a title with no tokens at all; keep only rows whose title is still non-blank
title_not_blank = news_df['title'].str.strip().ne('')
news_df = news_df.loc[title_not_blank]
news_df

Unnamed: 0,title,text,label
0,armenian genocide archives,armenians call holocaust 1915 forced deportati...,fake
1,2008 hillary described churchgoer,disagree senator obama assertion people countr...,real
2,head spins choices comes despicable republican...,many choices comes despicable republican senat...,real
3,america brink civil war,america brink civil war pj media valerie obama...,fake
4,russia quietly tightens reins web bloggers law,pressure corruption scandal turkey recently im...,real
...,...,...,...
300368,today economic events,day market holiday many parts around world see...,fake
300369,corrections,article 27 carl paladino hometown buffalo desc...,real
300370,fixing millionaire amendment,adam b details supreme court decision overturn...,real
300371,daily caller,people going time different doors different bu...,real


In [66]:
# Cleaning can leave a text with no tokens at all; keep only rows whose text is still non-blank
text_not_blank = news_df['text'].str.strip().ne('')
news_df = news_df.loc[text_not_blank]
news_df

Unnamed: 0,title,text,label
0,armenian genocide archives,armenians call holocaust 1915 forced deportati...,fake
1,2008 hillary described churchgoer,disagree senator obama assertion people countr...,real
2,head spins choices comes despicable republican...,many choices comes despicable republican senat...,real
3,america brink civil war,america brink civil war pj media valerie obama...,fake
4,russia quietly tightens reins web bloggers law,pressure corruption scandal turkey recently im...,real
...,...,...,...
300368,today economic events,day market holiday many parts around world see...,fake
300369,corrections,article 27 carl paladino hometown buffalo desc...,real
300370,fixing millionaire amendment,adam b details supreme court decision overturn...,real
300371,daily caller,people going time different doors different bu...,real


In [None]:
# Save the preprocessed DataFrame to a CSV file, for use in the Streamlit app.
# index=False leaves the row index out of the file, so only the data columns are written.
news_df.to_csv('data/news_preprocessed.csv', index=False)

In [None]:
def train_model(df):
    '''
    Fit a TF-IDF + logistic regression classifier on preprocessed news data.

    Each document is the article title and text joined by a single space. The
    data is split 80/20 into training and test sets with a fixed seed, the
    vectorizer is fitted on the training documents only, and the classifier is
    evaluated on the held-out documents. The classification report and the
    accuracy are printed.

    Parameters:
        df (pd.DataFrame): Preprocessed news data with 'title', 'text' and 'label' columns.

    Returns:
        model (LogisticRegression): The trained logistic regression model.
        vectorizer (TfidfVectorizer): The TF-IDF vectorizer fitted on the training documents.
    '''

    # Target labels and the documents they belong to
    labels = df['label']
    documents = df['title'] + ' ' + df['text']

    # Hold out 20% of the rows for evaluation
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        documents,
        labels,
        test_size=0.2,
        random_state=42,
    )

    # Fit the vectorizer on the training documents only, then reuse it
    # unchanged to transform the held-out documents
    vectorizer = TfidfVectorizer()
    train_features = vectorizer.fit_transform(docs_train)
    test_features = vectorizer.transform(docs_test)

    # Fit the classifier; max_iter is raised to 1000
    model = LogisticRegression(max_iter=1000).fit(train_features, labels_train)

    # Evaluate on the held-out documents and report the results
    predicted_labels = model.predict(test_features)
    print(classification_report(labels_test, predicted_labels))
    print("Accuracy:", accuracy_score(labels_test, predicted_labels))

    return model, vectorizer

In [None]:
# Train the model and vectorizer on the preprocessed fake news data.
# train_model also prints a classification report and the accuracy for its held-out test split.
model, vectorizer = train_model(news_df)

              precision    recall  f1-score   support

        fake       0.91      0.93      0.92     29116
        real       0.93      0.91      0.92     30707

    accuracy                           0.92     59823
   macro avg       0.92      0.92      0.92     59823
weighted avg       0.92      0.92      0.92     59823

Accuracy: 0.9172224729619043


In [None]:
# Pickle the model and vectorizer, for use in the Streamlit app.
# Create the output directory first: open(..., 'wb') does not create missing
# parent directories, so a checkout without 'models/' would otherwise fail here.
os.makedirs('models', exist_ok=True)

with open('models/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('models/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)