In [1]:
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Download necessary NLTK resources (uncomment if not downloaded)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Define the folder path containing the CSV files
folder_path = '/Users/aamershah/Desktop/gig/News_dataset'

# Define the file names
fake_file = 'Updated_Fake.csv'
real_file = 'Updated_Real.csv'

# Construct the full paths
fake_file_path = os.path.join(folder_path, fake_file)
real_file_path = os.path.join(folder_path, real_file)

# Load the updated fake and real news data
fake_df = pd.read_csv(fake_file_path)
real_df = pd.read_csv(real_file_path)

# Function to preprocess text
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [word.lower() for word in tokens]
    
    # Remove punctuation
    tokens = [word for word in tokens if word.isalnum()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if not word in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply preprocessing to the 'text' column in both DataFrames
fake_df['preprocessed_text'] = fake_df['text'].apply(preprocess_text)
real_df['preprocessed_text'] = real_df['text'].apply(preprocess_text)

# Display the first few rows of the updated DataFrames to verify preprocessing
print("Fake News DataFrame after preprocessing:")
print(fake_df[['text', 'preprocessed_text']].head())

print("\nReal News DataFrame after preprocessing:")
print(real_df[['text', 'preprocessed_text']].head())


Fake News DataFrame after preprocessing:
                                                text  \
0  Donald Trump just couldn t wish all Americans ...   
1  House Intelligence Committee Chairman Devin Nu...   
2  On Friday, it was revealed that former Milwauk...   
3  On Christmas day, Donald Trump announced that ...   
4  Pope Francis used his annual Christmas Day mes...   

                                   preprocessed_text  
0  donald trump wish american happy new year leav...  
1  house intelligence committee chairman devin nu...  
2  friday revealed former milwaukee sheriff david...  
3  christmas day donald trump announced would bac...  
4  pope francis used annual christmas day message...  

Real News DataFrame after preprocessing:
                                                text  \
0  WASHINGTON (Reuters) - The head of a conservat...   
1  WASHINGTON (Reuters) - Transgender people will...   
2  WASHINGTON (Reuters) - The special counsel inv...   
3  WASHINGTON (Reuters) - 

In [2]:
# Calculate the number of occurrences for each subject in both datasets
subject_counts_fake = fake_df['subject'].value_counts()
subject_counts_real = real_df['subject'].value_counts()

# Find the maximum count for subjects in both datasets to determine the resampling target
resample_size = max(subject_counts_fake.max(), subject_counts_real.max())

# Function to resample each subject to the target size
def resample_subjects(df, target_size):
    # Use 'replace=True' to allow oversampling by sampling with replacement
    return df.groupby('subject').apply(lambda x: x.sample(target_size, replace=True)).reset_index(drop=True)

# Resample both DataFrames
balanced_fake_df = resample_subjects(fake_df, resample_size)
balanced_real_df = resample_subjects(real_df, resample_size)

# Verify the balancing by counting the occurrences of each subject in the balanced datasets
print("Balanced Fake News Subjects:")
print(balanced_fake_df['subject'].value_counts())

print("\nBalanced Real News Subjects:")
print(balanced_real_df['subject'].value_counts())

Balanced Fake News Subjects:
subject
politics     16640
worldnews    16640
Name: count, dtype: int64

Balanced Real News Subjects:
subject
politics     16640
worldnews    16640
Name: count, dtype: int64


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))

# Combine the balanced fake and real datasets
combined_balanced_df = pd.concat([balanced_fake_df, balanced_real_df], ignore_index=True)

# Shuffle the combined dataframe to ensure a good mix
combined_balanced_df = combined_balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract features and target
X = combined_balanced_df['text']  # the preprocessed text
y = combined_balanced_df['label']  # the labels

# Fit and transform the text data to create TF-IDF features
X_tfidf = tfidf_vectorizer.fit_transform(X)

print("TF-IDF features shape:", X_tfidf.shape)


TF-IDF features shape: (66560, 10000)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

# Assuming X_tfidf and y are already defined and represent your feature matrix and labels respectively

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize the SVM classifier
svm_classifier = SVC(random_state=42)

# Set up parameter grid to search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Set up GridSearchCV with SVM
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best estimator found by grid search
best_svm_classifier = grid_search.best_estimator_

# Predict on the test set using the best estimator
y_pred_svm = best_svm_classifier.predict(X_test)

# Evaluate the SVM model
print("SVM Classifier Report:")
print(classification_report(y_test, y_pred_svm))

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the Naive Bayes classifier
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_nb = nb_classifier.predict(X_test)

# Evaluate the Naive Bayes model
print("Naive Bayes Classifier Report:")
print(classification_report(y_test, y_pred_nb))




SVM Classifier Report:
              precision    recall  f1-score   support

        fake       1.00      1.00      1.00      6671
        real       1.00      1.00      1.00      6641

    accuracy                           1.00     13312
   macro avg       1.00      1.00      1.00     13312
weighted avg       1.00      1.00      1.00     13312

Naive Bayes Classifier Report:
              precision    recall  f1-score   support

        fake       0.95      0.92      0.93      6671
        real       0.92      0.95      0.94      6641

    accuracy                           0.94     13312
   macro avg       0.94      0.94      0.94     13312
weighted avg       0.94      0.94      0.94     13312



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define SVM classifier
svm_classifier = SVC(random_state=42)

# Set up parameter grid to search
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}

# Set up GridSearchCV
grid_search = GridSearchCV(svm_classifier, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Best estimator found by grid search
best_svm_classifier = grid_search.best_estimator_

# Predict on the test set using the best estimator
y_pred_svm = best_svm_classifier.predict(X_test)
print("SVM Classifier Report:")
print(classification_report(y_test, y_pred_svm))
