In [1]:
import pandas as pd

df = pd.read_csv("/content/FakeNews_Cleaned_Sample.csv")
df = df[['text', 'label']]
df.head()


Unnamed: 0,text,label
0,more tax development both store agreement lawy...,real
1,probably guess western behind likely next inve...,fake
2,them identify forward present success risk sev...,fake
3,phone which item yard Republican safe where po...,fake
4,wonder myself fact difficult course forget exa...,fake


In [3]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Added to resolve LookupError

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return " ".join(tokens)

df['clean_text'] = df['text'].apply(preprocess_text)
df[['text', 'clean_text']].head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Unnamed: 0,text,clean_text
0,more tax development both store agreement lawy...,tax development store agreement lawyer hear ou...
1,probably guess western behind likely next inve...,probably guess western behind likely next inve...
2,them identify forward present success risk sev...,identify forward present success risk several ...
3,phone which item yard Republican safe where po...,phone item yard republican safe police identif...
4,wonder myself fact difficult course forget exa...,wonder fact difficult course forget exactly pa...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)

X_tfidf = tfidf.fit_transform(df['clean_text'])
y = df['label']

X_tfidf.shape


(500, 869)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)

X_train.shape, X_test.shape


((400, 869), (100, 869))

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.55
              precision    recall  f1-score   support

        fake       0.56      0.72      0.63        53
        real       0.53      0.36      0.43        47

    accuracy                           0.55       100
   macro avg       0.55      0.54      0.53       100
weighted avg       0.55      0.55      0.54       100

