In [3]:
#Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Make sure the NLTK resources used later in this script are installed.
# Each probe is tried in turn; a LookupError from a probe triggers the
# download of the package listed next to it.
for _probe, _package in (
    (lambda: stopwords.words('english'), 'stopwords'),
    (lambda: nltk.data.find('sentiment/vader_lexicon.zip'), 'vader_lexicon'),
):
    try:
        _probe()
    except LookupError:
        nltk.download(_package)


# Load and Explore the Dataset
file_path = 'blogs.csv'
df = pd.read_csv(file_path)

# --- Exploratory Data Analysis ---
# Show the first rows, then the column summary, then the number of posts per label.
print("--- Dataset Head ---", df.head(), sep="\n")
print("\n--- Dataset Info ---")
df.info()  # prints its own report rather than returning it
label_counts = df['Labels'].value_counts()
print("\n--- Category Distribution ---", label_counts, sep="\n")


# Text Preprocessing
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    """
    Clean a single text entry and return it as a space-joined string of stems.

    Steps: strip ``<...>`` spans, replace every non-letter character with a
    space, lowercase, split on whitespace, drop English stopwords, and
    Porter-stem the remaining tokens.

    Args:
        text: The raw text. Any non-string value (for example a missing CSV
            cell, which pandas reads as NaN) is treated as empty.

    Returns:
        str: The processed tokens joined by single spaces; '' when nothing
        survives the filtering.
    """
    if not isinstance(text, str):
        return ''
    # Substitute a space rather than '' so that the words on either side of
    # a tag or punctuation mark are not glued together ("a<br>b",
    # "cs.cmu.edu"). This also splits contractions ("don't" -> "don", "t")
    # into the fragments the NLTK stopword list contains, instead of
    # producing "dont", which the stopword filter would let through.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = text.lower().split()
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words)

# Run every entry of the 'Data' column through preprocess_text and keep
# the result alongside the original text.
print("\n--- Preprocessing Text Data ---")
df['Cleaned_Data'] = df['Data'].map(preprocess_text)
print("Preprocessing complete.")
preview_columns = ['Data', 'Cleaned_Data']
print(df[preview_columns].head())


#  Feature Extraction with TF-IDF
y = df['Labels']

# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full
# corpus would let test-set documents influence the vocabulary and IDF
# weights, leaking test information into training and inflating the scores.
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Data'], y, test_size=0.2, random_state=42, stratify=y
)

tfidf_vectorizer = TfidfVectorizer(max_features=5000) # Using top 5000 features

# Learn the vocabulary and IDF weights from the training documents only,
# then reuse the fitted vectorizer for the held-out documents.
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-corpus matrix in the same feature space; kept so that `X` remains
# available and the shape report below still covers every document.
X = tfidf_vectorizer.transform(df['Cleaned_Data'])

print("\n--- Feature Extraction Complete ---")
print("Shape of TF-IDF matrix:", X.shape)


#  Naive Bayes Model for Text Classification
# Train the Multinomial Naive Bayes classifier on the training features
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predict labels for the held-out documents
y_pred = nb_classifier.predict(X_test)

print("\n--- Naive Bayes Model Trained and Predictions Made ---")


# Evaluate the Classifier
# Accuracy is computed directly; precision, recall and F1 all use the same
# 'weighted' averaging, so they are produced from one shared call shape.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n--- Model Evaluation ---")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-Score: {f1:.4f}",
    sep="\n",
)

# Per-class breakdown of the same predictions
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred))


# Sentiment Analysis
# Shared VADER analyzer used by get_sentiment
sid = SentimentIntensityAnalyzer()


def get_sentiment(text):
    """
    Classify a text as 'Positive', 'Negative', or 'Neutral'.

    The label is decided by the analyzer's 'compound' score: at or above
    0.05 is 'Positive', at or below -0.05 is 'Negative', and anything in
    between is 'Neutral'.
    """
    compound_score = sid.polarity_scores(text)['compound']
    if compound_score >= 0.05:
        return 'Positive'
    if compound_score <= -0.05:
        return 'Negative'
    return 'Neutral'

# Label every post by running get_sentiment on the original 'Data' text
print("\n--- Performing Sentiment Analysis ---")
df['Sentiment'] = df['Data'].map(get_sentiment)
print("Sentiment analysis complete.")
sentiment_preview = df[['Data', 'Sentiment']].head()
print(sentiment_preview)


# Examine Sentiment Distribution Across Categories
# Count posts for each (category, sentiment) pair, then pivot the sentiment
# level into columns so each category is one row; missing pairs become 0.
pair_counts = df.groupby(['Labels', 'Sentiment']).size()
sentiment_distribution = pair_counts.unstack(fill_value=0)

print("\n--- Sentiment Distribution Across Categories ---", sentiment_distribution, sep="\n")

# Normalize each category's row by its total to see proportions
sentiment_proportions = sentiment_distribution.apply(
    lambda counts: counts / counts.sum(), axis=1
)
print("\n--- Sentiment Proportions Across Categories ---", sentiment_proportions, sep="\n")


--- Dataset Head ---
                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism

--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB

--- Category Distribution ---
Labels
alt.atheism                 100
comp.graphics               100
comp.os.ms-windows.misc     100
comp.sys.ibm.pc.hardware    100
comp.sys.mac.hardware       100
comp.windows.x              100
misc.forsale                100
rec.auto

**•	Discuss the performance of the model and any challenges encountered during the classification process**.

Performance of the Naive Bayes Model
The Multinomial Naive Bayes classifier with TF-IDF features is an efficient baseline for text classification. It categorizes blog posts by learning how likely each word is within each category; the weighted F1-score is used as the key summary metric because it balances precision and recall across categories.

Metrics Interpretation:

Accuracy: The proportion of all posts classified correctly; it can be misleading when the classes are imbalanced.

Precision: Of the posts predicted to belong to a category, the fraction that truly belong to it (e.g., how reliable a predicted "comp.graphics" label is).

Recall: Ability to find all true instances per category.

Classification Report: Detailed per-category breakdown of precision, recall, and F1-score to spot strengths/weaknesses.

Challenges in Classification
Naive Assumption: Ignores word dependencies, missing context like sarcasm.

Topical Overlap: Closely related categories (e.g., "comp.sys.ibm.pc.hardware" and "comp.sys.mac.hardware") share much of their vocabulary, which makes them hard to tell apart.

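Class Imbalance: An imbalanced dataset biases the model toward dominant categories. The categories listed in the distribution output above each contain 100 posts, so this is a limited concern here, and stratified splitting keeps the class proportions the same in the training and test sets.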

Preprocessing Losses:

Aggressive stemming may merge unrelated words.

Removing punctuation/numbers discards key details (e.g., "Windows 10" → "window" after lowercasing and stemming).

Limiting vocabulary to 5,000 features excludes rare but indicative terms.

**•	Reflect on the sentiment analysis results and their implications regarding the content of the blog posts.**

Categories like politics, religion, and atheism likely feature a high volume of positive and negative content, reflecting the passionate and argumentative nature of the discussions. In contrast, technical and scientific categories such as cryptography, electronics, and hardware are predominantly neutral, as their content is more informational and objective.

This implies that sentiment analysis effectively reveals the nature of the discourse within each category, distinguishing between subjective, opinion-driven debates and objective, fact-based information sharing.