# 📰 Fake News Detection on Twitter
This notebook performs NLP-based fake news detection on a dataset of tweets. We'll clean, preprocess, and vectorize the text, then train machine learning models and evaluate their performance.

In [None]:
# Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import spacy
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

# Download the NLTK resources used later in the notebook.
nltk.download('stopwords')
nltk.download('punkt')
# NLTK >= 3.8.2 loads the Punkt tokenizer model from the 'punkt_tab' package;
# without it nltk.word_tokenize raises LookupError. 'punkt' is kept above so
# older NLTK versions keep working.
nltk.download('punkt_tab')
nltk.download('wordnet')
# Load the small English spaCy pipeline (the model package must already be
# installed in the environment).
nlp = spacy.load("en_core_web_sm")

In [None]:
# Step 2: Load Dataset
# Both CSV files are read from the notebook's working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

print(train_df.head())
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
train_df.info()

In [None]:
# Step 3: Data Cleaning & EDA
# Replace missing keyword/location values with explicit placeholder labels.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on `train_df[col]`: that chained in-place form acts on
# an intermediate object, triggers a FutureWarning on pandas 2.x, and leaves
# train_df unchanged once Copy-on-Write is enabled.
train_df['keyword'] = train_df['keyword'].fillna('none')
train_df['location'] = train_df['location'].fillna('unknown')

# Bar chart of the number of rows per target class.
sns.countplot(x='target', data=train_df)
plt.title("Class Distribution")
plt.show()

In [None]:
# WordCloud for Real and Fake Tweets
# Concatenate the text of each class into a single string. Rows with
# target == 1 feed `real_tweets` and rows with target == 0 feed `fake_tweets`.
# Missing texts are dropped because str.join raises TypeError on NaN.
real_tweets = ' '.join(train_df.loc[train_df['target'] == 1, 'text'].dropna())
fake_tweets = ' '.join(train_df.loc[train_df['target'] == 0, 'text'].dropna())

# Render both clouds side by side so the two classes can be compared
# (previously only the "real" cloud was displayed and fake_tweets was unused).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, (label, corpus) in zip(axes, [("Real Tweets", real_tweets),
                                      ("Fake Tweets", fake_tweets)]):
    ax.imshow(WordCloud().generate(corpus), interpolation='bilinear')
    ax.set_title(label)
    ax.axis('off')
plt.show()

In [None]:
# Step 4: Text Preprocessing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Module-level NLTK helpers, created once and shared by the cells below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership checks when filtering tokens.
stop_words = {*stopwords.words('english')}

def preprocess(text):
    """Normalize one tweet into a space-separated string of stemmed tokens.

    Pipeline: lowercase -> strip URLs -> replace every non-letter character
    with a space -> NLTK word tokenization -> drop English stop words ->
    Porter stemming.

    Args:
        text: Raw tweet text. Any non-string value (for example the NaN that
            pandas uses for a missing cell, or None) is treated as empty.

    Returns:
        str: The surviving stems joined by single spaces; '' when the input
        is not a string or nothing survives the filtering.
    """
    # Guard against missing values: calling .lower() on a float NaN or None
    # would raise AttributeError and abort the whole Series.apply().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Remove links before stripping punctuation so URL fragments do not turn
    # into junk tokens.
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    tokens = nltk.word_tokenize(text)
    # Stop words are filtered on the unstemmed token, then the rest is stemmed.
    stems = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stems)

# Clean every raw tweet element-wise and store the result in a new column.
train_df['clean_text'] = train_df['text'].map(preprocess)

In [None]:
# Step 5: Feature Extraction
y = train_df['target']

# Split the cleaned text BEFORE fitting the vectorizer. Fitting on all rows
# and splitting afterwards lets the held-out tweets influence which terms make
# it into the vocabulary (data leakage). The row assignment is the same as
# splitting the feature matrix, since it depends only on the number of rows
# and random_state.
text_train, text_test, y_train, y_test = train_test_split(
    train_df['clean_text'], y, test_size=0.2, random_state=42
)

# Bag-of-words counts limited to the 5000 most frequent training terms.
cv = CountVectorizer(max_features=5000)
X_train = cv.fit_transform(text_train).toarray()
X_test = cv.transform(text_test).toarray()

# Full feature matrix (same row order as train_df), encoded with the
# vocabulary learned from the training split only.
X = cv.transform(train_df['clean_text']).toarray()

In [None]:
# Step 6: Model Training
# Baseline classifier: logistic regression with scikit-learn's default
# settings. fit() returns the estimator, so construction and training chain.
lr = LogisticRegression().fit(X_train, y_train)

# Score the held-out split and print the per-class metrics report.
y_pred = lr.predict(X_test)
print(classification_report(y_test, y_pred))