In [None]:
# Install kagglehub
!pip install kagglehub -q

# Set Kaggle credentials (replace with your own Kaggle username & API key)
# Uncomment below segment and add your own kaggle username and key
"""
import os
os.environ['KAGGLE_USERNAME'] = ""
os.environ['KAGGLE_KEY'] = ""
"""

# Import kagglehub and download WELFake dataset
import kagglehub
import pandas as pd
import glob

# Download the WELFake dataset and report the local directory it was placed in.
path = kagglehub.dataset_download("vcclab/welfake-dataset")
print("Dataset downloaded to:", path)

# Load CSV into pandas.
# glob returns matches in arbitrary order, so sort them to make the choice of
# csv_files[0] deterministic across runs and platforms.
csv_files = sorted(glob.glob(path + "/*.csv"))
print("CSV files found:", csv_files)

# Fail with an explicit message instead of an opaque IndexError when the
# download directory contains no CSV file.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in dataset directory: {path}")

df = pd.read_csv(csv_files[0])
print("Dataframe shape:", df.shape)
print("Columns:", df.columns)
df.head()


In [None]:
import pandas as pd
from sklearn.utils import shuffle
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
from nltk import pos_tag, word_tokenize

# NLTK data packages fetched before preprocessing: tokenizer, WordNet,
# POS tagger and stopword list (both the plain and the '_tab'/'_eng' variants).
nltk_resources = (
    'punkt',
    'punkt_tab',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
)
for resource in nltk_resources:
    # quiet=True suppresses the downloader's progress output.
    nltk.download(resource, quiet=True)

# English stopwords held in a set for fast membership tests during filtering.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Step 0: Shuffle the dataset (fixed seed so the row order is reproducible)
df = shuffle(df, random_state=42)

# 1. Remove unnecessary columns safely
#    (errors='ignore' makes this a no-op when the column is absent)
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

# 2. Handle missing data: drop rows that lack a title or a text
df = df.dropna(subset=['title', 'text'])

# 3. Combine text features into a single 'content' column
df['content'] = df['title'] + " " + df['text']

# 4. Text cleaning: lowercase and strip spaces
df['content'] = df['content'].str.lower().str.strip()

# 5. Remove punctuation
#    The translation table is identical for every row, so build it once
#    instead of once per row, and apply it through the vectorized .str accessor.
punctuation_table = str.maketrans('', '', string.punctuation)
df['content'] = df['content'].str.translate(punctuation_table)

# 6. POS-aware lemmatization with stopword removal
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate an NLTK POS tag into the WordNet POS constant used for lemmatization.

    Tags beginning with 'J', 'V', 'N' or 'R' map to adjective, verb, noun and
    adverb respectively; any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Default for every tag that matched none of the prefixes above.
    return wordnet.NOUN

def preprocess_text(text):
    """Return `text` tokenized, with stopwords removed and tokens POS-aware lemmatized.

    The complete token list is POS-tagged first; stopwords are dropped
    afterwards, and the remaining lemmas are joined with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Run the tokenize / stopword-removal / lemmatize step over every article.
df['content'] = df['content'].map(preprocess_text)

# Report the resulting frame size and preview the first rows.
print("Preprocessing done. Dataset shape:", df.shape)
print(df.head())


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

# Use seaborn's white-grid look for the plots drawn after this point.
sns.set(style="whitegrid")

# 1. Label distribution (bar plot): number of articles per label value
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=df, x='label', palette="viridis")
ax.set_title("Label Distribution (0 = real, 1 = fake)")
ax.set_xlabel("Label")
ax.set_ylabel("Number of Articles")
plt.show()

# 2. Text length distribution (histogram)
# 'text_length' is the number of whitespace-separated tokens in each cleaned article.
df['text_length'] = df['content'].map(lambda doc: len(doc.split()))

plt.figure(figsize=(8, 4))
ax = sns.histplot(df['text_length'], bins=50, kde=True, color='skyblue')
ax.set_title("Text Length Distribution")
ax.set_xlabel("Number of Words")
ax.set_ylabel("Number of Articles")
plt.show()

# 3. Word count boxplot (makes unusually long or short articles visible as outliers)
plt.figure(figsize=(8, 4))
ax = sns.boxplot(data=df, x='text_length', color='lightgreen')
ax.set_title("Boxplot of Text Length")
ax.set_xlabel("Number of Words")
plt.show()

# 4. Top 15 most frequent words (bar plot)
# Count tokens one article at a time. Joining the whole corpus into a single
# string and splitting it would hold the full text plus one list entry per
# token in memory at once; streaming into the Counter gives the same counts
# (and the same tie order) without those intermediates.
word_counts = Counter()
for doc in df['content']:
    word_counts.update(doc.split())

top_words = dict(word_counts.most_common(15))
plt.figure(figsize=(10,6))
sns.barplot(x=list(top_words.values()), y=list(top_words.keys()), palette="magma")
plt.title("Top 15 Most Frequent Words")
plt.xlabel("Frequency")
plt.ylabel("Words")
plt.show()

# 5. Article length vs label (violin plot): word-count distribution per label value
plt.figure(figsize=(8, 4))
ax = sns.violinplot(data=df, x='label', y='text_length', palette="coolwarm")
ax.set_title("Text Length Distribution by Label")
ax.set_xlabel("Label (0=real, 1=fake)")
ax.set_ylabel("Number of Words")
plt.show()

# 6. Label proportion pie chart
# value_counts() orders the labels by frequency, not by label value, so the
# slice names and colours are looked up from the index instead of being
# hard-coded in a fixed order (which mislabels the slices whenever label 0 is
# the majority class). Mapping follows this notebook's convention: 0 = real, 1 = fake.
label_names = {0: 'Real', 1: 'Fake'}
label_colors = {0: 'lightblue', 1: 'tomato'}

label_counts = df['label'].value_counts()
plt.figure(figsize=(6,6))
plt.pie(
    label_counts,
    # Unknown label values fall back to their own text / a neutral colour.
    labels=[label_names.get(label, str(label)) for label in label_counts.index],
    autopct='%1.1f%%',
    colors=[label_colors.get(label, 'lightgray') for label in label_counts.index],
    startangle=140,
)
plt.title("Proportion of Fake vs Real News")
plt.show()


# 7. Wordcloud
# Concatenate the cleaned articles of each class (label 0 -> real, label 1 -> fake).
real_text = " ".join(df.loc[df['label'] == 0, 'content'])
fake_text = " ".join(df.loc[df['label'] == 1, 'content'])

# Build one cloud per class with identical rendering options.
wordcloud_options = dict(width=800, height=400, background_color='white', max_words=200)
real_wc = WordCloud(**wordcloud_options).generate(real_text)
fake_wc = WordCloud(**wordcloud_options).generate(fake_text)

# Draw the two clouds side by side: real on the left, fake on the right.
plt.figure(figsize=(16, 8))
panels = [
    (real_wc, 'Real News Word Cloud'),
    (fake_wc, 'Fake News Word Cloud'),
]
for position, (cloud, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(panel_title, fontsize=20)

plt.show()




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. Features (X) are the cleaned article text; the target (y) is the label column
X, y = df['content'], df['label']

# 2. Hold out 20% of the rows for testing; stratify=y keeps the label
#    proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 3. Fit the TF-IDF vectorizer on the training split only, so nothing from the
#    test split influences the fitted vectorizer
print("Fitting TF-IDF Vectorizer on training data...")
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)

# 4. Encode the test split with the already-fitted vectorizer
print("Transforming test data...")
X_test_tfidf = vectorizer.transform(X_test)

print("Data is ready for model training.")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# 1. Define the model to tune.
# random_state fixes the data shuffling performed by the 'liblinear' and 'saga'
# solvers, so repeated runs of the search produce the same fitted models.
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# 2. Set up the grid of parameters to test
# This grid will create 3 (C values) * 2 (solvers) = 6 different model 'varieties' to test.
param_grid = {
    'C': [0.1, 1, 10],  # Inverse regularization strength: lower values regularize more.
    'solver': ['liblinear', 'saga']
}

# 3. Configure the grid search
# cv=5 means 5-fold cross-validation
# scoring='f1' selects the parameter combination with the best F1-score
# n_jobs=-1 runs the fits in parallel on all available cores
grid_search = GridSearchCV(estimator=log_reg_model, param_grid=param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)

# 4. Run the tuning process on the training data
print("Starting hyperparameter tuning...")
grid_search.fit(X_train_tfidf, y_train)

# 5. Extract the best model found
best_lr_model = grid_search.best_estimator_
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\nBest Hyperparameters Found: {grid_search.best_params_}")
print(f"Best F1-score from cross-validation: {grid_search.best_score_:.4f}")

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Class ids and their display names, in matching order (notebook convention:
# 0 = real, 1 = fake). Passing the ids explicitly guarantees that the report
# rows and the confusion-matrix rows/columns line up with these names instead
# of depending on whichever label values happen to occur in the data.
class_labels = [0, 1]
class_names = ['Real', 'Fake']

# Make predictions on the unseen test data
y_pred = best_lr_model.predict(X_test_tfidf)

# Print the detailed classification report
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\nFinal model eval report :")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Visualize the Confusion Matrix (rows = actual class, columns = predicted class)
print("confusion matrix :")
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()