In [None]:
import re               # regular expression (searching text)

import matplotlib.pyplot as plt       # For plotting graphs (showing data trends)
import nltk             # For downloading NLTK corpora (the stop-word list)
import numpy as np      # For numerical operations (arrays, random, etc.)
import pandas as pd     # For handling datasets (reading CSV, data manipulation)
import seaborn as sns   # For data visualization (heatmaps, distribution plots)
from nltk.corpus import stopwords     # For removing common words (e.g., "the", "is") to improve text analysis
from nltk.stem.porter import PorterStemmer              # For stemming words (reducing words to root form, e.g., "running" → "run")
from sklearn.feature_extraction.text import TfidfVectorizer  # For converting text into numerical form (TF-IDF scores)
from sklearn.linear_model import LogisticRegression          # For classifying news as real or fake (supervised learning model)
from sklearn.metrics import accuracy_score                   # For evaluating model performance (accuracy calculation)
from sklearn.model_selection import train_test_split    # For splitting dataset into training & testing sets

# ***Load the dataset***

In [None]:
# Read train.csv into a DataFrame; the location is kept in one named constant.
DATA_PATH = "/content/train.csv"
news_dataset = pd.read_csv(DATA_PATH)

# ***Data Preprocessing***

In [None]:
# Analyse the dataset: preview the first rows, then count missing values per column.
# Only the final expression of a cell is rendered automatically, so the preview is
# printed explicitly; a bare `news_dataset.head()` on a non-final line is discarded.

print(news_dataset.head())
news_dataset.isnull().sum()

Unnamed: 0,0
id,0
title,558
author,1957
text,39
label,0


In [None]:
# Replace every missing value with a single-space string so the text columns
# can be concatenated later without producing NaN.
news_dataset = news_dataset.fillna(value=' ')

# Re-count the nulls per column to confirm that none remain.
news_dataset.isna().sum()

Unnamed: 0,0
id,0
title,0
author,0
text,0
label,0


In [None]:
# Build a single `content` feature from `author` and `title`.

# Combining both fields gives the model more context per article than either
# field alone, and leaves one text column to preprocess.
author_col = news_dataset['author']
title_col = news_dataset['title']
news_dataset['content'] = author_col + ' ' + title_col
print(news_dataset['content'])

0        Darrell Lucus House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com Why the Truth Might Get You...
3        Jessica Purkiss 15 Civilians Killed In Single ...
4        Howard Portnoy Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams Macy...
20798    Alex Ansary NATO, Russia To Hold Parallel Exer...
20799              David Swanson What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object


In [None]:
# Download the NLTK English stop-word corpus and print the word list;
# these words are filtered out during the stemming step.
# (`nltk` is imported together with the other dependencies in the first cell.)

nltk.download('stopwords')
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# STEMMING PROCEDURE

# DEFINITION - Stemming is the process of reducing a word to its root form
# EXAMPLE - "killed" --> "kill", "civilians" --> "civilian"
# BENEFIT - reduces the number of unique words, making text processing faster


port_stem = PorterStemmer()

# Build the stop-word collection once. Calling stopwords.words('english') inside
# the per-token filter rebuilds the whole list for every word and makes each
# membership test a linear scan; a frozenset is built once and looked up in O(1).
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def stemming(content):
    """Normalise a text string into space-separated, stemmed, lowercase tokens.

    Steps: replace every non-letter character with a space, lowercase the text,
    split on whitespace, drop English stop words, Porter-stem what is left and
    join the stems back together with single spaces.

    Args:
        content: The raw text (a ``str``) to normalise.

    Returns:
        A ``str`` containing the stemmed tokens separated by single spaces
        (empty when no token survives the filtering).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', content)    # keep alphabets only; digits/punctuation become spaces
    tokens = letters_only.lower().split()               # "Hello World" -> ["hello", "world"]
    stemmed_tokens = [
        port_stem.stem(word)
        for word in tokens
        if word not in ENGLISH_STOPWORDS                # stop words are removed before stemming
    ]
    return ' '.join(stemmed_tokens)


# Stem the merged author + title text and store the result in `content`.
# The input is rebuilt from the raw `author` and `title` columns instead of being
# read back from `content`, so re-running this cell never stems text that was
# already stemmed (Porter stemming is not idempotent: "hous" is cut again to "hou").
news_dataset['content'] = (news_dataset['author'] + ' ' + news_dataset['title']).apply(stemming)
print(news_dataset['content'])

0        darrel lucu hou dem aid even see comey letter ...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exerci b...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [None]:
# Separating the features and the label
content_col = news_dataset['content']
label_col = news_dataset['label']
X, Y = content_col.values, label_col.values   # .values exposes each Series as an array

print(X)    # independent variable (features): the stemmed text
print(Y)    # dependent (target) variable: the label

(X.shape, Y.shape)

['darrel lucu hou dem aid even see comey letter jason chaffetz tweet'
 'daniel j flynn flynn hillari clinton big woman campu breitbart'
 'consortiumnew com truth might get fire' ...
 'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time'
 'alex ansari nato russia hold parallel exerci balkan'
 'david swanson keep f aliv']
[1 0 1 ... 0 1 1]


((20800,), (20800,))

In [None]:
# Converting the textual data to numerical data
# ML models cannot process raw text directly; they require numerical input.

# Tool used: TfidfVectorizer

# TF-IDF → Term Frequency - Inverse Document Frequency
# TF  : how often a word occurs in a document; frequent words get more weight.
# IDF : down-weights words that occur in many documents, since they say little
#       about any single document.
# Together they yield one feature vector per document: a list of numbers that
# expresses how important each vocabulary word is in that document.

# NOTE(review): the vocabulary and IDF weights are learned here from ALL rows,
# including the rows that later become the test set. For a leakage-free
# evaluation, split the raw text first and fit the vectorizer on the training
# portion only.


vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)     # learn vocabulary + IDF and vectorize in one pass (same result as fit then transform)

# ***Splitting the Dataset into Train & Test Data***

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the class
# proportions the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

# ***Training the Model: Logistic Regression***

In [None]:
# Fit a logistic-regression classifier (default hyper-parameters) on the training part.
model = LogisticRegression()
model.fit(X=X_train, y=Y_train)

# ***Evaluation for Logistic Regression***

In [None]:
# accuracy score on training data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.

train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, train_prediction)
print("Accuracy Score: ", training_data_accuracy)

Accuracy Score:  0.9873798076923077


In [133]:
# accuracy score on testing data
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions second.

test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, test_prediction)
print("Accuracy Score: ", test_data_accuracy)

Accuracy Score:  0.9752403846153846


In [None]:
# predictive system: classify one held-out sample and report the verdict
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real news, any other label as fake.
verdict = 'The news is Real' if prediction[0] == 0 else 'The news is Fake'
print(verdict)

[0]
The news is Real
