# Prepare Data

In [1]:
# Recursively delete /content/sample_data; -f keeps the command quiet if the
# directory does not exist. (Presumably the hosted runtime's bundled sample
# files -- nothing later in this notebook reads from that directory.)
!rm -rf /content/sample_data

In [2]:
# Download the dataset from Google Drive by file ID. Per the recorded output,
# it is saved as /content/news-NLP.csv (about 30.7 MB).
!gdown 1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX

Downloading...
From: https://drive.google.com/uc?id=1fli_hyDy7Io0coUNdk1P-DUPWtpfBwsX
To: /content/news-NLP.csv
100% 30.7M/30.7M [00:00<00:00, 41.7MB/s]


# Import Libraries

In [22]:
!pip install scikit-learn
import pandas as pd
from gensim.models import FastText
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler



# Prepare Training Data

In [4]:
# Fetch the NLTK corpora used later in this notebook: the stop-word lists
# (read via stopwords.words) and WordNet (used by WordNetLemmatizer).
# The return value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
# Read the downloaded news CSV, then discard its first column.
df = pd.read_csv('news-NLP.csv')
first_column = df.columns[0]
df = df.drop(columns=first_column)

In [6]:
# Encode the label as an integer: 1 for "FAKE", 0 for anything else.
# The guard makes re-running this cell a no-op. Without it, a second run would
# compare the already-encoded integers against "FAKE" and silently turn every
# label into 0.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].apply(lambda x: 1 if x == "FAKE" else 0)

# Build the text to model from title + body. Missing titles or bodies are
# treated as empty strings; otherwise the concatenation yields NaN and the
# regex-based preprocessing fails on the non-string value.
df['content'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

In [7]:
# Shared resources for preprocessing: a WordNet lemmatizer and the English
# stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [8]:
def preprocess_text(text):
    """Turn raw text into a list of lowercase, lemmatized tokens.

    Every character that is not a word character (letter, digit or
    underscore) is replaced by a space. The result is lowercased and split on
    whitespace. Tokens found in the module-level ``stop_words`` set are
    dropped, and each remaining token is passed through the module-level
    ``lemmatizer``.

    Args:
        text: The string to process.

    Returns:
        list: The processed tokens, in their original order.
    """
    normalized = re.sub(r'\W', ' ', text).lower()
    tokens = []
    for token in normalized.split():
        if token in stop_words:
            continue
        tokens.append(lemmatizer.lemmatize(token))
    return tokens

In [9]:
# Tokenize and clean every document; each row becomes a list of tokens.
token_lists = df['content'].apply(preprocess_text)
df['processed_content'] = token_lists

# FastText Model

In [10]:
# Train FastText embeddings on the tokenized documents.
fasttext_params = dict(
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
    sg=0,
    epochs=10,
)
fasttext_model = FastText(sentences=df['processed_content'], **fasttext_params)

In [11]:
def document_vector(doc, model):
    """Embed a tokenized document as the mean of its known word vectors.

    Args:
        doc: Iterable of tokens.
        model: Trained embedding model exposing ``wv`` (with ``key_to_index``
            and vector lookup by a list of words) and ``vector_size``.

    Returns:
        numpy.ndarray: The element-wise mean of the vectors of all tokens
        present in ``model.wv.key_to_index``, or a zero vector of length
        ``model.vector_size`` when no token is in the vocabulary.
    """
    keyed_vectors = model.wv
    # Keep only tokens that are in the model's vocabulary.
    known_tokens = [token for token in doc if token in keyed_vectors.key_to_index]
    if known_tokens:
        return np.mean(keyed_vectors[known_tokens], axis=0)
    # Nothing to average: the document is empty or entirely out of vocabulary.
    return np.zeros(model.vector_size)

In [12]:
# Embed every tokenized document with the trained FastText model.
doc_vectors = df['processed_content'].apply(
    lambda tokens: document_vector(tokens, fasttext_model)
)
df['doc_vector'] = doc_vectors

In [13]:
# Labels as an array; features as a matrix with one document vector per row.
y = df['label'].values
X = np.vstack(df['doc_vector'].values)

In [14]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Training with Naive Bayes

In [26]:
# Fit the min-max scaler on the training features only, then apply that same
# scaling to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
# Train a multinomial Naive Bayes classifier on the scaled training features.
model = MultinomialNB().fit(X_train_scaled, y_train)
# End the cell with the fitted estimator so it is displayed as the cell output.
model

# Evaluate

In [28]:
# Predict labels for the held-out, scaled test features.
y_pred = model.predict(X=X_test_scaled)

In [29]:
# Score the test-set predictions with each metric, in the same order as the
# names on the left-hand side.
accuracy, precision, recall, f1 = (
    score_fn(y_test, y_pred)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    f"Accuracy : {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)

Accuracy : 0.8058405682715075
Precision: 0.7817109144542773
Recall: 0.8439490445859873
F1 Score: 0.8116385911179173
