In [None]:
# Basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# NLP tools
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt_tab')

# ML tools
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Load datasets
# Decode as UTF-8 rather than ISO-8859-1: reading UTF-8 bytes as Latin-1 is what
# turns curly quotes and ellipses in the titles into garbled multi-character
# sequences. encoding_errors='replace' keeps the load from failing on any stray
# invalid byte (it is substituted with U+FFFD instead of raising).
true_df = pd.read_csv('/content/True.csv', on_bad_lines='skip',
                      encoding='utf-8', encoding_errors='replace')
fake_df = pd.read_csv('/content/Fake.csv', on_bad_lines='skip',
                      encoding='utf-8', encoding_errors='replace')

# Add labels
true_df['label'] = 1  # Real
fake_df['label'] = 0  # Fake

# Combine & shuffle
df = pd.concat([true_df, fake_df], axis=0)
# Seed the shuffle so a fresh "Restart & Run All" yields the same row order
# (42 matches the seed used for train_test_split further down).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview
df.head()


Unnamed: 0,title,text,subject,date,label
0,Australia campuses warned of 'clandestine' inf...,SYDNEY (Reuters) - Australia s domestic spy ch...,worldnews,"October 25, 2017",1
1,TRUMP CABINET MEMBER MICK MULVANEY ON DC: âT...,https://www.youtube.com/watch?time_continue=2&...,politics,"Mar 26, 2017",0
2,No Wordsâ¦ [VIDEO],For the first time in the history of 100% FED ...,left-news,"Dec 12, 2015",0
3,"Trump baffles Sweden with crime comment, says ...",(This story corrects paragraph 8 to clarify S...,politicsNews,"February 19, 2017",1
4,Schaeuble says held 'friendly and constructive...,BERLIN (Reuters) - German Finance Minister Wol...,politicsNews,"March 16, 2017",1


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# English stopword list, stored as a set; clean_text tests each token for
# membership in it.
stop_words = set(stopwords.words('english'))
# Single WordNet lemmatizer instance, reused by clean_text for every token.
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one raw document into a space-joined string of lemmatized tokens.

    Pipeline: lowercase -> strip URLs -> delete every character that is not
    a-z or whitespace -> NLTK word tokenization -> drop English stopwords ->
    WordNet lemmatization. Relies on the module-level ``stop_words`` set and
    ``lemmatizer`` instance.

    Parameters
    ----------
    text : str or scalar
        Raw title or article body. Null values (None / NaN) produce "".
        Non-string scalars are converted with ``str()`` instead of raising.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces; "" when nothing survives cleaning.
    """
    if pd.isnull(text):
        return ""
    if not isinstance(text, str):
        # Guard against non-string cells (e.g. a value parsed as a number),
        # which would otherwise fail on .lower().
        text = str(text)
    # Lowercase
    text = text.lower()
    # Remove URLs. `http\S+` already matches https links, so a separate
    # `https\S+` alternative is unnecessary; no ^/$ anchors are used, so
    # re.MULTILINE is not needed either.
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove punctuation and numbers. Characters are deleted rather than
    # replaced by a space, so e.g. "u.s." collapses to "us".
    text = re.sub(r'[^a-z\s]', '', text)
    # Nothing left to tokenize: same result as tokenizing an empty string.
    if not text.strip():
        return ""
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Run clean_text over both raw columns, producing 'clean_title' and 'clean_text'
for raw_col in ('title', 'text'):
    df['clean_' + raw_col] = df[raw_col].apply(clean_text)

# Final model input: cleaned title, a single space, then cleaned body
df['final_text'] = df['clean_title'].str.cat(df['clean_text'], sep=" ")

# Show the raw columns next to the combined cleaned text for the first rows
df[['title', 'text', 'final_text']].head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,title,text,final_text
0,Australia campuses warned of 'clandestine' inf...,SYDNEY (Reuters) - Australia s domestic spy ch...,australia campus warned clandestine influence ...
1,TRUMP CABINET MEMBER MICK MULVANEY ON DC: âT...,https://www.youtube.com/watch?time_continue=2&...,trump cabinet member mick mulvaney dc place mu...
2,No Wordsâ¦ [VIDEO],For the first time in the history of 100% FED ...,word video first time history fed almost speec...
3,"Trump baffles Sweden with crime comment, says ...",(This story corrects paragraph 8 to clarify S...,trump baffle sweden crime comment say based tv...
4,Schaeuble says held 'friendly and constructive...,BERLIN (Reuters) - German Finance Minister Wol...,schaeuble say held friendly constructive talk ...


In [None]:
from sklearn.model_selection import train_test_split

# Features are the combined cleaned text; target is the 0/1 label column
X, y = df['final_text'], df['label']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms.
# The vectorizer is fitted on the training split only and then applied
# to the test split.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Fit a default logistic regression on the TF-IDF training matrix
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict labels for the held-out split
y_pred = model.predict(X_test_tfidf)

# Report each metric on (y_test, y_pred) under its own heading
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("\nClassification Report:\n", classification_report),
    ("\nConfusion Matrix:\n", confusion_matrix),
):
    print(heading, metric(y_test, y_pred))

Accuracy: 0.9902004454342984

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4715
           1       0.99      0.99      0.99      4265

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980


Confusion Matrix:
 [[4662   53]
 [  35 4230]]


In [None]:
from xgboost import XGBClassifier

# `use_label_encoder` is no longer accepted by current XGBoost releases and only
# triggers a 'Parameters: { "use_label_encoder" } are not used.' warning, so it
# is omitted. A fixed random_state keeps the run reproducible, consistent with
# the seed used for the train/test split.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_tfidf, y_train)

# Predict on the held-out TF-IDF matrix and report accuracy
xgb_pred = xgb_model.predict(X_test_tfidf)

print("XGBoost Accuracy:", accuracy_score(y_test, xgb_pred))

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.9981069042316258


In [None]:
import xgboost as xgb
import pickle

# Serialize the fitted TF-IDF vectorizer with pickle.
# Only unpickle this file from a trusted source: pickle.load can execute code.
with open('tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Write the trained `xgb_model` through XGBoost's own save_model, as JSON
xgb_model.save_model('xgb_model.json')