In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pickle

print("All libraries imported successfully!")


# CELL 2: Loading the data
#%%
# Load the fake and real news datasets
df_fake = pd.read_csv('Fake.csv')
df_true = pd.read_csv('True.csv')

# Let's see how the first few rows look
print("------ Fake News Head ------")
df_fake.head()

All libraries imported successfully!
------ Fake News Head ------


Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [2]:
# Add a 'class' column to both dataframes
df_fake['class'] = 0
df_true['class'] = 1

# Let's see the last 5 rows of both tables to confirm
print("---- Fake News Tail ----")
display(df_fake.tail())

print("\n---- True News Tail ----")
display(df_true.tail())

# Now, let's combine them into a single big dataframe
df_merged = pd.concat([df_fake, df_true], ignore_index=True)

# Shuffle the dataframe randomly to mix up fake and real news
df_merged = df_merged.sample(frac=1).reset_index(drop=True)

# Display the first few rows of our new big table
print("\n---- Merged and Shuffled Dataframe Head ----")
display(df_merged.head())

---- Fake News Tail ----


Unnamed: 0,title,text,subject,date,class
23476,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",0
23477,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",0
23478,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",0
23479,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",0
23480,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",0



---- True News Tail ----


Unnamed: 0,title,text,subject,date,class
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017",1
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017",1
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017",1
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017",1
21416,Indonesia to buy $1.14 billion worth of Russia...,JAKARTA (Reuters) - Indonesia will buy 11 Sukh...,worldnews,"August 22, 2017",1



---- Merged and Shuffled Dataframe Head ----


Unnamed: 0,title,text,subject,date,class
0,U.S. drops some claims against Texas voter ID law,WASHINGTON (Reuters) - The U.S. Justice Depart...,politicsNews,"February 27, 2017",1
1,Myanmar corrects state media report on U.N. 'a...,NAYPYITAW (Reuters) - A Myanmar state-run news...,worldnews,"October 28, 2017",1
2,GOP Lawmaker To Trump: Stop Acting Like A ‘Fr...,Angry constituents have inundated town hall me...,News,"February 22, 2017",0
3,PRINCIPAL CAUGHT STEALING From DETERIORATING D...,Detroit teachers make viral video: Blame Repub...,left-news,"Apr 2, 2016",0
4,"Before Putin talks, Trump plays down interfere...",WARSAW (Reuters) - One day before his first me...,politicsNews,"July 6, 2017",1


In [3]:
# First, let's check for any missing values in our data
print("---- Checking for Missing Values ----")
print(df_merged.isnull().sum())
print("\n")

# We see some columns like subject and date are not useful for prediction.
# We will only keep the 'text' and 'class' columns.
# Let's create a final dataframe.
df = df_merged.drop(['title', 'subject', 'date'], axis=1)

# Now let's define a function to clean the text
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Initialize the tools
port_stem = PorterStemmer()
nltk.download('stopwords') # This might be needed only once

def preprocess_text(content):
    # Remove anything that is not a letter, and convert to lowercase
    stemmed_content = re.sub('[^a-zA-Z]',' ', str(content))
    stemmed_content = stemmed_content.lower()
    
    # Split the text into a list of words
    stemmed_content = stemmed_content.split()
    
    # Remove stopwords (like 'a', 'the', 'is') and apply stemming
    # Stemming reduces words to their root form (e.g., 'running' -> 'run')
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    
    # Join the words back into a single string
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

# Now, apply this function to our 'text' column. This might take a few seconds.
print("---- Starting Text Preprocessing... ----")
df['text'] = df['text'].apply(preprocess_text)
print("---- Text Preprocessing Complete! ----\n")


# Let's see how the cleaned data looks
print("---- Cleaned Dataframe Head ----")
display(df.head())

---- Checking for Missing Values ----
title      0
text       0
subject    0
date       0
class      0
dtype: int64




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


---- Starting Text Preprocessing... ----


KeyboardInterrupt: 

In [4]:
# Separating the data (text) and the label (class)
X = df['text'].values
y = df['class'].values

# Now, we will convert the textual data to numerical data (vectors)
# We will use TF-IDF Vectorizer for this.
# It gives importance to words that are more frequent in one document but rare in others.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Now, let's split the dataset into training and testing data
# We'll use 80% for training and 20% for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

# Let's check the shape of our data
print("Shape of our full data (X):", X.shape)
print("Shape of our training data (X_train):", X_train.shape)
print("Shape of our testing data (X_test):", X_test.shape)

Shape of our full data (X): (44898, 122002)
Shape of our training data (X_train): (35918, 122002)
Shape of our testing data (X_test): (8980, 122002)


In [5]:
# Now it's time to train our model.
# We will use a simple and effective model called Logistic Regression.

model = LogisticRegression()

# Let's fit the model with our training data.
print("---- Model Training Started... ----")
model.fit(X_train, y_train)
print("---- Model Training Complete! ----")

---- Model Training Started... ----
---- Model Training Complete! ----


In [6]:
# Now let's evaluate the model's performance on the training data first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

print('Accuracy score on the training data : ', training_data_accuracy)


# Now, let's test the model on the test data (which it has never seen before).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)

print('Accuracy score on the test data : ', test_data_accuracy)

Accuracy score on the training data :  0.9913970711064091
Accuracy score on the test data :  0.9861915367483296


In [7]:
# We need to save two things:
# 1. The trained model
# 2. The vectorizer (so we can convert new text into numbers in the same way)

# Saving the trained model
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Saving the vectorizer
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model and Vectorizer saved successfully!")

Model and Vectorizer saved successfully!
