<a href="https://colab.research.google.com/github/adhil456/Nm-phase-2/blob/main/Untitled8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
import re
import string

# Ensure stopwords are downloaded
nltk.download('stopwords')
from nltk.corpus import stopwords

# 1. Create and save sample dataset
sample_data = {
    'text': [
        "Donald Trump sends out a tweet that shocks the world.",
        "Scientists discover a new species in the Amazon rainforest.",
        "BREAKING: Aliens have landed on Earth, says unverified source.",
        "The economy is growing steadily under new policies.",
        "Celebrity caught in scandal, but evidence is lacking."
    ],
    'label': ["FAKE", "REAL", "FAKE", "REAL", "FAKE"]
}
df = pd.DataFrame(sample_data)
df.to_csv('fake_or_real_news.csv', index=False)

print("Dataset created and saved. Shape:", df.shape)
print(df.head())

# 2. Preprocess the text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

df['text'] = df['text'].astype(str).apply(clean_text)

# 3. Split the dataset
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

# 4. TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = vectorizer.fit_transform(X_train)
tfidf_test = vectorizer.transform(X_test)

# 5. Model training
model = PassiveAggressiveClassifier(max_iter=50)
model.fit(tfidf_train, y_train)

# 6. Predictions and evaluation
y_pred = model.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"\nAccuracy: {score * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)

# 7. Test on custom news
def predict_news(news_text):
    cleaned = clean_text(news_text)
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)
    return prediction[0]

# Example
sample_news = "The government just announced a new policy to support farmers."
print("\nPrediction for custom news:", predict_news(sample_news))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset created and saved. Shape: (5, 2)
                                                text label
0  Donald Trump sends out a tweet that shocks the...  FAKE
1  Scientists discover a new species in the Amazo...  REAL
2  BREAKING: Aliens have landed on Earth, says un...  FAKE
3  The economy is growing steadily under new poli...  REAL
4  Celebrity caught in scandal, but evidence is l...  FAKE

Accuracy: 100.00%
Confusion Matrix:
[[1]]

Prediction for custom news: REAL


