In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import re

import joblib
import string

In [2]:
fake = pd.read_csv('Fake.csv')
true = pd.read_csv('True.csv')

In [3]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [4]:
true

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
...,...,...,...,...
21412,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,worldnews,"August 22, 2017"
21413,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",worldnews,"August 22, 2017"
21414,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,worldnews,"August 22, 2017"
21415,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,worldnews,"August 22, 2017"


In [5]:
fake['class'] = 0
true['class'] = 1

In [6]:
data = pd.concat([fake, true], axis=0)

In [7]:
data.sample(5)

Unnamed: 0,title,text,subject,date,class
13625,Meeting on North Korea crisis to be in Canada ...,OTTAWA (Reuters) - A planned meeting of foreig...,worldnews,"November 28, 2017",1
16811,IS LONDON About To Elect Its First MUSLIM Mayo...,London is about to find out why putting politi...,Government News,"Jan 22, 2016",0
7028,Trump close to naming congressman Price as hea...,(Reuters) - U.S. President-elect Donald Trump ...,politicsNews,"November 29, 2016",1
18002,Merkel tells Rajoy of support for unity of Spain,BERLIN (Reuters) - German Chancellor Angela Me...,worldnews,"October 9, 2017",1
1936,Republican’s Excuse For Cutting Food Stamps: ...,Republican lawmakers love using the Bible to d...,News,"March 31, 2017",0


In [8]:
data = data.drop(['title', 'subject', 'date'], axis=1)

In [9]:
data.reset_index(inplace=True)

In [10]:
data.drop(['index'], axis=1, inplace=True)

In [11]:
data.sample(5)

Unnamed: 0,text,class
24606,WASHINGTON/NEW YORK (Reuters) - The Trump admi...,1
11362,"Tucker Carlson asks Robin Bronk, CEO of the Cr...",0
19668,Watch:Here s the truth about the Clinton Found...,0
34885,ISTANBUL (Reuters) - A Turkish judge ordered f...,1
22756,The world of personal digital devices is movin...,0


In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [13]:
data["text"] = data["text"].apply(clean_text)

In [14]:
x=data['text']
y=data['class']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:
# Yes, training a model allows it to learn patterns from the training data,
# which helps it make predictions on unseen data (like x_test).
# The effectiveness depends on how well the model generalizes.
print(f"Train Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")
# If test accuracy is close to train accuracy, the model is generalizing well.

In [15]:
vectorizer = TfidfVectorizer()
xv_train = vectorizer.fit_transform(x_train)
xv_test = vectorizer.transform(x_test)

In [16]:
rf = RandomForestClassifier()
rf.fit(xv_train, y_train)
rf_predictions = rf.predict(xv_test)
print(rf.score(xv_test, y_test))

0.9907349665924277


In [17]:
lr = LogisticRegression()
lr.fit(xv_train, y_train)
lr_predictions = lr.predict(xv_test)
print(lr.score(xv_test, y_test))

0.9858351893095768


In [18]:

train_score = lr.score(xv_train, y_train)
test_score = lr.score(xv_test, y_test)
print(f"Train Accuracy: {train_score:.4f}")
print(f"Test Accuracy: {test_score:.4f}")

Train Accuracy: 0.9904
Test Accuracy: 0.9858


In [19]:
print(classification_report(y_test,lr_predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      5895
           1       0.98      0.99      0.99      5330

    accuracy                           0.99     11225
   macro avg       0.99      0.99      0.99     11225
weighted avg       0.99      0.99      0.99     11225



In [None]:
Your precision, recall, f1-score, and support are high because your model is performing very well on the test set. This can happen for several reasons:

1. **Clean and Separable Data:** If your fake and true news samples are very different in terms of text, the model can easily distinguish between them, leading to high scores.
2. **Large Dataset:** You have a large dataset (44,898 samples), which helps the model generalize better and reduces the risk of overfitting.
3. **Effective Preprocessing:** Your text cleaning and TF-IDF vectorization may have removed noise and highlighted important features.

**Is it overfitting?**
- Overfitting occurs when the model performs well on the training data but poorly on unseen data.
- In your case, the train accuracy is `0.9904` and test accuracy is `0.9858`, which are very close. This suggests **no significant overfitting**.
- If the test accuracy was much lower than the train accuracy, that would indicate overfitting.

**Summary:**  
Your high scores are likely due to good data quality and preprocessing, not overfitting. The similar train and test accuracy values confirm this.

In [20]:
joblib.dump(vectorizer, 'vectorizer.jb')
joblib.dump(lr, 'lr_model.jb')
joblib.dump(rf, 'rf_model.jb')

['rf_model.jb']