Algorithm Used: Support Vector Machine

Support Vector Machines (SVM) are well-suited for text classification tasks like spam detection because they handle high-dimensional and sparse data (e.g., TF-IDF vectors) effectively. SVM works by finding the optimal hyperplane that maximally separates different classes (spam vs. ham) with the largest margin. This makes it robust to overfitting, especially in small to medium-sized datasets.

In [2]:
# Install necessary libraries (if not installed)
# !pip install scikit-learn pandas numpy nltk

# Import libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Load the English stopword list used by the text preprocessing step.
# The corpus is downloaded only when NLTK reports it as missing (LookupError),
# so repeated runs do not contact the network once the data is installed.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

# 1. Create a Fake News dataset
# Each entry pairs a headline with its label: 1 = Fake, 0 = Real.
labelled_headlines = [
    ("Breaking: Scientists discover a cure for COVID-19!", 1),
    ("NASA confirms water on the moon, groundbreaking discovery!", 0),
    ("Experts warn of stock market collapse due to secret government policies.", 1),
    ("Government announces new tax relief for small businesses.", 0),
    ("Shocking: Aliens found living in Area 51!", 1),
    ("New study finds link between exercise and improved mental health.", 0),
    ("Politician caught hiding millions in offshore accounts.", 1),
    ("Medical researchers develop breakthrough cancer treatment.", 0),
    ("Secret messages found in ancient pyramids predict end of the world!", 1),
    ("Tech company launches revolutionary AI that changes programming forever.", 0),
]

# Column-oriented view of the pairs above, in the same order.
data = {
    "text": [headline for headline, _ in labelled_headlines],
    "label": [label for _, label in labelled_headlines],
}

# One row per headline, with columns "text" and "label".
df = pd.DataFrame(data)

# 2. Preprocess the text (lowercase, remove special characters, stopwords)
def preprocess(text):
    """Normalize a piece of text for vectorization.

    The text is lowercased, every non-word character is replaced by a
    space, runs of whitespace are collapsed, and tokens present in the
    module-level ``stop_words`` set are dropped.

    Returns the remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    # \W matches anything that is not a letter, digit or underscore.
    spaced = re.sub(r'\W', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', spaced).strip()
    kept_tokens = filter(lambda token: token not in stop_words, collapsed.split())
    return ' '.join(kept_tokens)

df['clean_text'] = df['text'].apply(preprocess)
y = np.array(df['label'])

# 3. Train-test split on the cleaned text, before any feature fitting.
# stratify=y keeps the Fake/Real ratio the same in both partitions; with so
# few samples an unstratified split can leave the test set heavily skewed
# toward one class.
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.3, random_state=42, stratify=y
)

# 4. Convert text into numerical features using TF-IDF.
# The vectorizer is fitted on the training headlines only, so the vocabulary
# and IDF weights are not influenced by the held-out test headlines.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Features for the whole corpus in the train-fitted vocabulary; kept so the
# name X stays available to any code that refers to it.
X = vectorizer.transform(df['clean_text'])

# 5. Train an SVM classifier
model = SVC(kernel='linear')
model.fit(X_train, y_train)

# 6. Evaluate the model on the held-out headlines
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# 7. Test with new headlines
new_headlines = [
    "Scientists invent teleportation technology, changing travel forever!",
    "World Health Organization announces breakthrough in malaria vaccine.",
    "Shocking discovery: Atlantis city found under the ocean!",
    "New smartphone released with groundbreaking AI features.",
]

# Clean the unseen headlines with the same preprocessing as the training
# text, then reuse the already-fitted vectorizer and classifier.
new_headlines_clean = list(map(preprocess, new_headlines))
new_features = vectorizer.transform(new_headlines_clean)
predictions = model.predict(new_features)

# Print Predictions (label 1 is reported as fake, anything else as real)
for headline, pred in zip(new_headlines, predictions):
    if pred == 1:
        category = "Fake News"
    else:
        category = "Real News"
    print(f"'{headline}' → {category}")


Model Accuracy: 0.33
'Scientists invent teleportation technology, changing travel forever!' → Fake News
'World Health Organization announces breakthrough in malaria vaccine.' → Real News
'Shocking discovery: Atlantis city found under the ocean!' → Fake News
'New smartphone released with groundbreaking AI features.' → Real News


[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
