Install the required packages.

In [None]:
!pip install scikit-learn nltk joblib



In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from google.colab import files
import zipfile
import os

- Upload the downloaded dataset as a zip file, which contains two separate files:
1.   Fake.csv (23502 fake news articles)
2.   True.csv (21417 true news articles)
- The dataset was downloaded from Kaggle.

In [21]:
# Prompt for a file upload through the Colab widget; the result is keyed by file name.
uploaded = files.upload()

# Treat every uploaded file as a zip archive and unpack it into the "data" directory.
for archive_name in uploaded:
    with zipfile.ZipFile(archive_name, mode='r') as archive:
        archive.extractall(path="data")

# Check contents
print(os.listdir("data"))


Saving archive.zip to archive (1).zip
['True.csv', 'Fake.csv']


Load and merge data

In [22]:
# Read the two CSV files extracted into the "data" directory.
fake = pd.read_csv("data/Fake.csv")
true = pd.read_csv("data/True.csv")

# Add labels: 0 marks rows from Fake.csv, 1 marks rows from True.csv.
fake["label"] = 0
true["label"] = 1

# Combine and shuffle.
# The shuffle is seeded so that the row order -- and therefore the train/test
# split and every metric computed from it -- is identical on each
# Restart & Run All. Without a seed the downstream results change every run.
df = pd.concat([fake, true], ignore_index=True)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only the model input (article text) and the target label.
df = df[["text", "label"]]

# Check for NaNs
print(df.isnull().sum())


text     0
label    0
dtype: int64


Split data

In [23]:
# Model input is the article text column; the target is the label column.
X, y = df['text'], df['label']

# Hold out 20% of the rows for evaluation. The fixed seed makes the split
# deterministic for a given row order of `df`.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Vectorize the text with TF-IDF

In [24]:
# TF-IDF vectorizer: drops English stop words and ignores terms whose document
# frequency exceeds 70% (max_df=0.7).
tfidf = TfidfVectorizer(max_df=0.7, stop_words='english')

# Fit on the training text only, then apply the already-fitted vectorizer to the
# test text so the held-out data does not influence what the vectorizer learns.
x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)




Train and evaluate the model using logistic regression

In [25]:
# Logistic regression classifier with the solver's iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# Train on the TF-IDF features of the training split.
model.fit(X=x_train_tfidf, y=y_train)

In [26]:

# Predict labels for the held-out test split.
y_pred = model.predict(x_test_tfidf)

# Compute each metric against the true test labels first, then report them.
accuracy = accuracy_score(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", matrix)
print("\nClassification Report:\n", report)


Accuracy: 0.9866369710467706

Confusion Matrix:
 [[4631   70]
 [  50 4229]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4701
           1       0.98      0.99      0.99      4279

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



Save the trained model and the fitted TF-IDF vectorizer, then download the saved files

In [27]:
# Persist the trained classifier and the fitted vectorizer; both files are needed
# together to run predictions on new text later.
# NOTE: joblib files are pickle-based -- only load them from sources you trust.
joblib.dump(value=model, filename='model.pkl')
joblib.dump(value=tfidf, filename='tfidf.pkl')

['tfidf.pkl']

In [28]:
# Trigger a browser download (via the Colab helper) for each saved artifact.
for artifact in ('model.pkl', 'tfidf.pkl'):
    files.download(artifact)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>