In [None]:
# Dataset Link: https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset

import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the fake (df1) and real (df2) article files with identical parser
# options; on_bad_lines='skip' drops rows the parser cannot split.
df1, df2 = (
    pd.read_csv(csv_path, encoding='latin1', engine='python', on_bad_lines='skip')
    for csv_path in ('Fake.csv', 'True.csv')
)

In [None]:
# Preview the first five rows of the fake-news frame as loaded.
df1.head(n=5)

In [None]:
# Drop everything except the article body from the fake-news frame.
df1.drop(columns=['title', 'subject', 'date'], inplace=True)

In [None]:
# Preview the fake-news frame after the column drop.
df1.head(n=5)

In [None]:
# Tag every row of the fake-news frame with the 'fake' class label.
df1 = df1.assign(label='fake')

In [None]:
# Preview the fake-news frame with its new label column.
df1.head(n=5)

In [None]:
# Preview the first five rows of the real-news frame as loaded.
df2.head(n=5)

In [None]:
# Drop everything except the article body from the real-news frame.
# (`columns=` already selects the axis, so no separate `axis` argument is needed.)
df2.drop(columns=['title', 'subject', 'date'], inplace=True)

In [None]:
# Tag every row of the real-news frame with the 'true' class label.
df2 = df2.assign(label="true")

In [None]:
# Preview the real-news frame with its new label column.
df2.head(n=5)

In [None]:
# Stack the fake and real frames into one dataset with a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)

In [None]:
# Persist the merged dataset (text + label) without the index column.
df.to_csv(path_or_buf='news.csv', index=False)

In [None]:
# List the distinct Python types found in the 'text' column.
print(df['text'].map(type).unique())

In [None]:
# Missing article bodies become empty strings.
df['text'] = df['text'].fillna(value='')

In [None]:
# Force every entry of the 'text' column to be a str.
df = df.assign(text=df['text'].astype(str))

In [None]:
wordnet = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Guarantee that every row holds a string so the cleaning step below cannot
# hit a TypeError on NaN / non-string values.
df['text'] = df['text'].fillna('').astype(str)


def _clean_text(text):
    """Normalise one article body into a space-joined string of lemmas.

    Steps: replace every non-letter character with a space, lower-case,
    tokenise with NLTK, drop English stop words, lemmatise what remains.

    Args:
        text: the raw article text (str).

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    tokens = nltk.word_tokenize(letters_only)
    return ' '.join(wordnet.lemmatize(tok) for tok in tokens if tok not in stop_words)


# One cleaned document per row of `df`, in row order. Rows are never skipped,
# so position k in `corpus` always corresponds to row k of `df`.
corpus = []
for i in range(len(df)):
    corpus.append(_clean_text(df['text'].iloc[i]))

assert len(corpus) == len(df), "corpus must stay aligned with df rows"

In [None]:
# Token lists fed to Word2Vec: each cleaned document is split into sentences
# and every sentence is tokenised with gensim's simple_preprocess.
words = []

for idx, doc in enumerate(corpus):
    try:
        for sentence in sent_tokenize(doc):
            words.append(simple_preprocess(sentence))
    except TypeError as e:
        # Report the position of the document that actually failed.
        print(f"TypeError at index {idx}: {e}")

In [None]:
# Train Word2Vec embeddings on the tokenised sentences with default settings.
model = Word2Vec(sentences=words)

In [None]:
# Sanity check of the embeddings: the ten nearest neighbours of 'trump'.
print(model.wv.most_similar('trump', topn=10))

In [None]:
def avg_word2vec(doc, wv=None):
    """Return the mean embedding of the in-vocabulary words of a document.

    Args:
        doc: iterable of token strings.
        wv: optional keyed-vectors object supporting ``word in wv``,
            ``wv[word]`` and a ``vector_size`` attribute. Defaults to the
            notebook-level ``model.wv``.

    Returns:
        A 1-D numpy array of length ``vector_size``: the element-wise mean of
        the vectors of all tokens found in the vocabulary, or an all-zero
        vector when none of the tokens is known (or ``doc`` is empty).
    """
    keyed = model.wv if wv is None else wv
    # Membership on the keyed vectors is a hash lookup, unlike scanning the
    # `index_to_key` list once per token.
    vectors = [keyed[word] for word in doc if word in keyed]
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(keyed.vector_size)

In [None]:
# Feature vectors (X) and their labels (y); documents that tokenise to
# nothing are left out of both lists.
X, y = [], []

for i, (text, label) in enumerate(zip(corpus, df['label'])):
    try:
        tokens = simple_preprocess(text)
        if not tokens:
            continue
        X.append(avg_word2vec(tokens))
        y.append(label)
    except TypeError as e:
        print(f"TypeError at index {i}: {e}")

In [None]:
# Stack the per-document vectors into a single 2-D array.
X = np.asarray(X)

In [None]:
# Wrap the features in a DataFrame and encode the labels: fake -> 0, true -> 1.
X = pd.DataFrame(data=X)
y = pd.Series(y).map(dict(fake=0, true=1))

In [None]:
# Hold out 20% of the samples for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Serialise the fitted scaler to disk.
with open('scaler.pkl', mode='wb') as file:
    pickle.dump(obj=scaler, file=file)

In [None]:
# Hyper-parameter search space for the random forest.
# 'auto' is deliberately not listed under max_features: for classifiers it was
# only an alias of 'sqrt' and recent scikit-learn releases reject it, which
# makes every fit using it fail.
random_forest_grid = {
    'n_estimators': [100, 200, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, 40, 50],
    'criterion': ['gini', 'entropy']
}

# 5-fold cross-validated exhaustive search. The base estimator does not need
# to be fitted beforehand; a fixed random_state makes the search repeatable.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=random_forest_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid.best_params_
print("Best Parameters:", best_params)

# With the default refit=True the search has already retrained the best
# configuration on the full training split, so reuse that estimator.
rnf = grid.best_estimator_

# Make predictions on the test set
y_pred = rnf.predict(X_test)

In [None]:
# Side-by-side view of the first ten true labels and predictions;
# 'difference' is non-zero exactly where the model is wrong.
df_result = pd.DataFrame(
    {'y_real': y_test.iloc[:10], 'y_pred': y_pred[:10]}
).assign(difference=lambda frame: frame['y_real'] - frame['y_pred'])
df_result

In [None]:
# Overall accuracy (as a percentage) followed by per-class precision/recall/F1.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f"Accuracy: {accuracy_pct}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred), sep=' ')

In [None]:
# Serialise the trained classifier to disk.
with open('model.pkl', mode='wb') as file:
    pickle.dump(obj=rnf, file=file)

In [None]:
news = "bishnudev khutia dies in an accident today"

# Clean the headline with the same steps used to build the training corpus:
# strip non-letters, lower-case, tokenise, drop stop words, lemmatise.
# Skipping the non-letter step would leave digits/punctuation tokens that the
# training documents never contained.
news = re.sub('[^a-zA-Z]', ' ', news)
news = news.lower()
news = nltk.word_tokenize(news)
news = [wordnet.lemmatize(word) for word in news if word not in stop_words]
news = ' '.join(news)

In [None]:
# Turn the cleaned headline into the representation the classifier was trained
# on: tokenise, average the Word2Vec vectors, then apply the fitted scaler.
# reshape(1, -1) makes the single vector a one-row 2-D input.
# Note: if no token is in the vocabulary the feature vector is all zeros.
test_data = scaler.transform(avg_word2vec(simple_preprocess(news)).reshape(1, -1))
result = rnf.predict(test_data)
result

In [None]:
# Label 0 was assigned to fake articles and 1 to true ones.
print("Fake News" if result[0] == 0 else "True News")