In [4]:
import pandas as pd

# Load the dataset
real_news = pd.read_csv("True.csv")
fake_news = pd.read_csv("Fake.csv")

# Add labels
real_news['label'] = 1  # 1 for real news
fake_news['label'] = 0  # 0 for fake news

# Combine datasets
dataset = pd.concat([real_news, fake_news], ignore_index=True)

# Check for missing values
print(dataset.isnull().sum())

# Check class distribution
print(dataset['label'].value_counts())

title      0
text       0
subject    0
date       0
label      0
dtype: int64
label
0    23481
1    21417
Name: count, dtype: int64


In [5]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only need to run once)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Download the punkt_tab dataset
nltk.download('punkt_tab') # This line was added to fix the LookupError

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    return " ".join(tokens)

# Apply preprocessing to the 'text' column in the dataset
dataset['text'] = dataset['text'].apply(preprocess_text)

# Display the first few rows of the preprocessed dataset
print(dataset.head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  washington reuters head conservative republica...  politicsNews   
1  washington reuters transgender people allowed ...  politicsNews   
2  washington reuters special counsel investigati...  politicsNews   
3  washington reuters trump campaign adviser geor...  politicsNews   
4  seattlewashington reuters president donald tru...  politicsNews   

                 date  label  
0  December 31, 2017       1  
1  December 29, 2017       1  
2  December 31, 2017       1  
3  December 30, 2017       1  
4  December 29, 2017       1  


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(dataset['text']).toarray()
y = dataset['label']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    print(f"Evaluation for {model_name}:")
    print(f"Accuracy: {accuracy_score(y_true, y_pred)}")
    print(f"Precision: {precision_score(y_true, y_pred)}")
    print(f"Recall: {recall_score(y_true, y_pred)}")
    print(f"F1-Score: {f1_score(y_true, y_pred)}")

evaluate_model(y_test, y_pred_rf, "Random Forest")

Evaluation for Random Forest:
Accuracy: 0.9969933184855234
Precision: 0.9960802397970948
Recall: 0.9976905311778291
F1-Score: 0.9968847352024922
