In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import download
from textblob import TextBlob
import string

# Download NLTK stopwords
download('punkt')
download('stopwords')

# Load dataset
data = pd.read_csv(r'C:\Users\User\Downloads\blogs.csv')

# Data Preprocessing
# Convert text to lowercase and clean it
data['Data'] = data['Data'].str.lower().str.replace('[^a-z\\s]', '')  # Clean text

# Tokenization and removal of stopwords
stop_words = set(stopwords.words('english'))
data['Data'] = data['Data'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))

# Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Data'])
y = data['Labels']

# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes Classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions on the test set
y_pred = model.predict(X_test)

# Evaluation using classification report and confusion matrix
print("Naive Bayes Classifier Evaluation:")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Sentiment Analysis using TextBlob
# Sentiment polarity: Positive (> 0), Neutral (0), Negative (< 0)
data['Sentiment'] = data['Data'].apply(lambda x: TextBlob(x).sentiment.polarity)
data['Sentiment_Label'] = data['Sentiment'].apply(lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral'))

# Display sentiment distribution across categories
print("\nSentiment Distribution Across Categories:")
sentiment_distribution = data.groupby('Labels')['Sentiment_Label'].value_counts().unstack()
print(sentiment_distribution)

# Optional: Save results to a new CSV file for further analysis
data.to_csv('blogs_with_sentiments.csv', index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Naive Bayes Classifier Evaluation:
                          precision    recall  f1-score   support

             alt.atheism       0.60      0.83      0.70        18
           comp.graphics       0.72      0.72      0.72        18
 comp.os.ms-windows.misc       0.75      0.95      0.84        22
comp.sys.ibm.pc.hardware       0.72      0.84      0.78        25
   comp.sys.mac.hardware       0.88      0.67      0.76        21
          comp.windows.x       1.00      0.24      0.39        25
            misc.forsale       0.82      0.78      0.80        18
               rec.autos       0.81      0.94      0.87        18
         rec.motorcycles       0.82      0.88      0.85        16
      rec.sport.baseball       0.83      0.83      0.83        18
        rec.sport.hockey       0.62      1.00      0.77        15
               sci.crypt       0.66      1.00      0.79        19
         sci.electronics       0.75      0.56      0.64        16
                 sci.med       0.88     