In [5]:
import pandas as pd

# Load each corpus and tag every row with its class: 1 = real news, 0 = fake news.
real_news = pd.read_csv('True.csv').assign(label=1)
fake_news = pd.read_csv('Fake.csv').assign(label=0)

# Stack both corpora into one frame; ignore_index renumbers the rows 0..n-1.
dataset = pd.concat((real_news, fake_news), ignore_index=True)

# Text Preprocessing


In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK corpora used below (stopword list and WordNet).
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)

# Shared helpers for the preprocessing step.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Stored as a set so the per-token membership test is a hash lookup.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
  """Normalize one article body before vectorization.

  Steps, in order: lowercase the text, delete every character that is not an
  ASCII letter or whitespace, drop English stopwords, and lemmatize each
  remaining token with the module-level WordNet lemmatizer.

  Args:
    text: Raw article text. Any non-string value (for example a missing
      cell, which typically arrives as a float NaN) is treated as empty.

  Returns:
    The cleaned tokens joined by single spaces; '' when nothing remains.
  """
  # Missing / non-text cells have no .lower(); treat them as empty documents.
  if not isinstance(text, str):
    return ''

  text = text.lower()

  # Keep letters and whitespace only. The upper bound must be 'Z': a range
  # written as 'A-z' also covers the ASCII symbols [ \ ] ^ _ ` that sit
  # between 'Z' and 'a', so they would slip through the filter.
  text = re.sub(r'[^a-zA-Z\s]', '', text)

  # Single pass: skip stopwords, lemmatize everything else.
  tokens = [
    lemmatizer.lemmatize(word)
    for word in text.split()
    if word not in stop_words
  ]
  return ' '.join(tokens)

# Clean every article body element-wise and store the result in a new column.
dataset['cleaned_text'] = dataset['text'].map(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Train a classifier

In [26]:
from sklearn.model_selection import train_test_split

# Model inputs are the cleaned article bodies; targets are the 0/1 labels.
X, y = dataset['cleaned_text'], dataset['label']

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Feature Extraction

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Convert the cleaned text into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only; the test split is then
# transformed with that same fitted vectorizer, so nothing is learned from X_test.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model Training

In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,classification_report

# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the fitted estimator, so it can be chained).
model = MultinomialNB().fit(X_train_vec, y_train)

# Predict labels for the held-out split.
y_pred = model.predict(X_test_vec)

# Report overall accuracy followed by per-class precision / recall / F1.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9412026726057906
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      5829
           1       0.93      0.95      0.94      5396

    accuracy                           0.94     11225
   macro avg       0.94      0.94      0.94     11225
weighted avg       0.94      0.94      0.94     11225



In [29]:
import joblib
# Persist the trained classifier and the fitted vectorizer to disk.
# Both are saved because the model was trained on this vectorizer's output.
# NOTE(review): these files are pickle-based; only load them from trusted sources.
joblib.dump(model, 'fake_news_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')


['vectorizer.pkl']