## Welcome

In [1]:
# Welcome to my venture into fake news detection using Python

In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
# Extra, dataset-specific words to ignore on top of scikit-learn's built-in
# English stop-word set.
custom_stop_words = [
    'hillary',
    'watch',
    'kerry',
    'bernie',
    'battle',
    'new',
    'york',
]
# Merge both collections and materialise the result as a list
# (presumably because downstream consumers expect a list rather than a frozenset -- confirm).
all_stop_words = [*ENGLISH_STOP_WORDS.union(custom_stop_words)]

In [4]:
# Read the data
# The CSV location can be overridden by setting the FAKE_NEWS_CSV environment
# variable, so the notebook is not tied to one machine's directory layout.
# When the variable is unset, the original path is used unchanged.
NEWS_CSV_PATH = os.environ.get(
    'FAKE_NEWS_CSV',
    '/Users/brocktbennett/GitHub/FakeNewsDetection/DSFake_News/news.csv',
)
df = pd.read_csv(NEWS_CSV_PATH)

In [5]:
# Drop unnecessary columns
# 'Unnamed: 0' is removed by rebinding `df` to the result rather than mutating
# in place; errors='ignore' keeps this cell safe to re-run after the column is
# already gone (a second run would otherwise raise KeyError).
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Data Exploration
# Each entry pairs a heading with the value printed after it: overall size,
# a preview of the first rows, the per-label counts and per-column null counts.
exploration_report = (
    ("Shape of the dataset:", df.shape),
    ("\nFirst 5 records:\n", df.head()),
    ("\nLabel Distribution:\n", df['label'].value_counts()),
    ("\nChecking for missing values:\n", df.isnull().sum()),
)
for heading, value in exploration_report:
    print(heading, value)

Shape of the dataset: (6335, 3)

First 5 records:
                                                title  \
0                       You Can Smell Hillary’s Fear   
1  Watch The Exact Moment Paul Ryan Committed Pol...   
2        Kerry to go to Paris in gesture of sympathy   
3  Bernie supporters on Twitter erupt in anger ag...   
4   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  

Label Distribution:
 REAL    3171
FAKE    3164
Name: label, dtype: int64

Checking for missing values:
 title    0
text     0
label    0
dtype: int64


In [7]:
# Displaying words removed from the first 5 records
# NOTE(review): tokens come from a plain whitespace split and are compared
# case-sensitively, so capitalised or punctuated forms (e.g. "The", "it.") are
# not reported here -- confirm whether this preview should mirror the
# tokenisation used by the vectoriser.
for index, row in df.head().iterrows():
    original_words = set(row['text'].split())
    # Words that survive stop-word filtering.
    words_after_stop_words = original_words.difference(all_stop_words)
    # Words present in both the record and the stop-word list.
    removed_words = original_words.intersection(all_stop_words)
    # Sort before printing: iteration order of a set of strings changes between
    # interpreter sessions, which would make this output non-reproducible.
    print(f"\nRemoved stop words from record {index}: {sorted(removed_words)}")


Removed stop words from record 0: {'been', 'at', 'about', 'such', 'new', 'up', 'of', 'than', 'never', 'with', 'whose', 'last', 'her', 'else', 'now', 'whether', 'by', 'or', 'ever', 'why', 'through', 'down', 'into', 'how', 'were', 'on', 'the', 'he', 'she', 'be', 'over', 'often', 'in', 'while', 'is', 'him', 'have', 'much', 'and', 'may', 'if', 'had', 'but', 'even', 'against', 'no', 'an', 'same', 'its', 'it', 'from', 'own', 'too', 'where', 'under', 'out', 'all', 'well', 'for', 'throughout', 'here', 'whatever', 'off', 'was', 'around', 'being', 'when', 'who', 'as', 'are', 'has', 'would', 'your', 'that', 'only', 'other', 'whole', 'can', 'two', 'one', 'this', 'you', 'do', 'what', 'might', 'most', 'everyone', 'could', 'also', 'go', 'us', 'them', 'to', 'his', 'not', 'any', 'during', 'anywhere', 'enough', 'so', 'they', 'then', 'front', 'a', 'still', 'back', 'after', 'between', 'already', 'more', 'without', 'will', 'their', 'nothing'}

Removed stop words from record 1: {'he', 'before', 'every', 'v

In [8]:
# Target column: the 'label' Series of the frame (values such as FAKE / REAL).
labels = df['label']

In [9]:
# Split the article text and its labels into training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    labels,
    test_size=0.2,    # fraction of rows held out for evaluation
    random_state=7,   # fixed seed so the split is repeatable
)