In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [8]:
# Load the two raw CSV files (fake and real news articles).
# Paths are relative to the notebook's working directory rather than an
# absolute, machine-specific location, so the notebook runs on any checkout.
# NOTE(review): assumes the notebook lives one level below the project root,
# the same layout the "../models/..." paths later in this notebook rely on —
# confirm before merging.
RAW_DATA_DIR = "../data/raw"

fake_df = pd.read_csv(f"{RAW_DATA_DIR}/Fake.csv")
true_df = pd.read_csv(f"{RAW_DATA_DIR}/True.csv")


In [9]:
# Attach the binary target to each source frame before they are merged:
# 0 marks rows from the fake-news file, 1 marks rows from the real-news file.
for frame, class_id in ((fake_df, 0), (true_df, 1)):
    frame["label"] = class_id


In [10]:
# Stack both sources into one frame, then shuffle every row with a fixed seed
# so the two classes are interleaved in a repeatable order. The index is
# rebuilt after shuffling so it runs 0..n-1 again.
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [11]:
# Keep only the article body and the target; every other column is dropped.
df = df.loc[:, ["text", "label"]]

# Preview the first rows as the cell's displayed value.
df.head()


Unnamed: 0,text,label
0,"21st Century Wire says Ben Stein, reputable pr...",0
1,WASHINGTON (Reuters) - U.S. President Donald T...,1
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,1
3,"On Monday, Donald Trump once again embarrassed...",0
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",1


In [12]:
# Schema and null check (info() prints its report directly).
df.info()

# Class balance, shown as the cell's displayed value.
label_counts = df["label"].value_counts()
label_counts



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    44898 non-null  object
 1   label   44898 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 701.7+ KB


label
0    23481
1    21417
Name: count, dtype: int64

## Train-test split


In [14]:
# Features are the raw article text; the target is the 0/1 label column.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


The TF-IDF model.

In [15]:
# TF-IDF vectorizer: drop English stop words and ignore terms whose document
# frequency exceeds 70% of the corpus.
tfidf = TfidfVectorizer(stop_words="english", max_df=0.7)

# Fit on the training split only, then reuse the fitted vectorizer on the test
# split so nothing from the held-out data influences the vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [16]:
# Baseline classifier: logistic regression on the TF-IDF features, allowing
# up to 1000 solver iterations. fit() returns the estimator itself, so it can
# be chained onto the constructor.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Display the fitted estimator as the cell's output.
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [17]:
# Score the logistic-regression baseline on the held-out test split.
# NOTE(review): in the rows previewed by df.head() earlier in this notebook,
# every label-1 text contains "(Reuters)" and no label-0 text does. The
# near-perfect scores may partly reflect that source marker rather than
# content — worth confirming before trusting these numbers.
y_pred = model.predict(X_test_tfidf)

lr_accuracy = accuracy_score(y_test, y_pred)
lr_report = classification_report(y_test, y_pred)

print("Accuracy:", lr_accuracy)
print(lr_report)


Accuracy: 0.984966592427617
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      4696
           1       0.98      0.99      0.98      4284

    accuracy                           0.98      8980
   macro avg       0.98      0.99      0.98      8980
weighted avg       0.98      0.98      0.98      8980



In [18]:
import os

import joblib

# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there). Without this
# the cell fails on a fresh checkout that has no models directory yet.
os.makedirs("../models", exist_ok=True)

# Persist the fitted vectorizer and the classifier as separate artifacts.
# The last dump call's return value (the list of written files) is the
# cell's displayed output.
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")
joblib.dump(model, "../models/logistic_regression.pkl")


['../models/logistic_regression.pkl']

In [19]:
# Round-trip check: read both artifacts back from disk and run one headline
# through the full vectorize -> predict path.
loaded_tfidf, loaded_model = map(
    joblib.load,
    ("../models/tfidf_vectorizer.pkl", "../models/logistic_regression.pkl"),
)

sample_text = ["Breaking: Government announces free money scheme"]

# The reloaded vectorizer maps the text into the feature space the model was
# trained on; the prediction is the cell's displayed value.
sample_vec = loaded_tfidf.transform(sample_text)
loaded_model.predict(sample_vec)


array([0])

**Linear SVM**

In [20]:
from sklearn.svm import LinearSVC

# Second model: a linear SVM trained on the same TF-IDF features.
# random_state is pinned to the seed used elsewhere in this notebook (42):
# LinearSVC shuffles the data during dual coordinate descent, so without a
# seed the fitted weights and reported metrics can vary between runs. The
# seed has no effect if the solver ends up in primal mode.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)


0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [21]:
# Score the linear SVM on the same held-out split used for the
# logistic-regression baseline, so the two reports are directly comparable.
svm_pred = svm_model.predict(X_test_tfidf)

svm_accuracy = accuracy_score(y_test, svm_pred)
svm_report = classification_report(y_test, svm_pred)

print("SVM Accuracy:", svm_accuracy)
print(svm_report)


SVM Accuracy: 0.9938752783964365
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4696
           1       0.99      0.99      0.99      4284

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [22]:
# Save the fitted linear SVM alongside the other model artifacts. The returned
# list of written files is the cell's displayed output.
joblib.dump(value=svm_model, filename="../models/linear_svm.pkl")


['../models/linear_svm.pkl']