In [144]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [145]:
# Load both CSV files from the working directory: fake articles first, real ones second.
fake_data, true_data = (pd.read_csv(csv_path) for csv_path in ("Fake.csv", "True.csv"))

In [146]:
# Attach the class label as a constant column: fake -> 1, real -> 0.
fake_data = fake_data.assign(label=1)
true_data = true_data.assign(label=0)

In [147]:
# Stack the two frames row-wise; ignore_index discards the per-file row
# numbers and gives the result a fresh 0..n-1 index.
data = pd.concat([fake_data, true_data], ignore_index=True)

In [148]:
# Randomly permute every row (frac=1 keeps them all) so fake and real samples
# are interleaved; the fixed seed makes the permutation reproducible.
# Note: this overwrites `data`, so re-running the cell on its own permutes the
# already-shuffled frame again.
data = (
    data
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [149]:
# Use the 'text' column as model input and the 'label' column as target.
# pd.read_csv parses empty fields as NaN by default, and TfidfVectorizer
# raises on NaN documents, so missing texts are replaced with empty strings
# here rather than crashing later at vectorization time.
X, y = data["text"].fillna(""), data["label"]

In [150]:
# Hold out 20% of the rows for evaluation; the seed keeps the partition stable
# across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [151]:
# Turn raw text into TF-IDF features. English stop words are removed and
# max_df=0.7 is passed as the document-frequency cap.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words="english")

# Fit on the training texts only, then reuse the fitted vectorizer for the
# test texts so the test split does not influence the learned features.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [152]:
# Fit a linear support-vector classifier on the TF-IDF training matrix.
# The seed is fixed for reproducibility.
clf = LinearSVC(random_state=42)
clf.fit(X=X_train_vectorized, y=y_train)

In [153]:
# Score the classifier on the held-out split and report it to four decimals.
# NOTE(review): the recorded score is very close to 1.0 — worth confirming the
# texts do not contain source-specific markers that give the label away.
accuracy = clf.score(X=X_test_vectorized, y=y_test)
print(f"Model accuracy: {accuracy:.4f}")

Model accuracy: 0.9927


In [154]:
# Classify one unseen document stored in mytest.txt.
with open("mytest.txt", encoding="utf-8") as handle:
    text = handle.read()

# The vectorizer works on a collection of documents, hence the one-element
# list; the already-fitted vectorizer is reused (transform, not fit).
vectorized_text = vectorizer.transform([text])
prediction = clf.predict(vectorized_text)

In [155]:
# Map the predicted class back to a readable verdict (1 -> FAKE, otherwise REAL).
verdict = "FAKE" if prediction[0] == 1 else "REAL"
print(f"The prediction for the given text is: {verdict}")

The prediction for the given text is: REAL
