In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Data Exploration and Preprocessing

In [2]:
# Load the dataset
data = pd.read_csv("blogs.csv")

In [3]:
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [5]:
# Data Exploration
print("Dataset shape:", data.shape)
print("Categories:", data['Labels'].unique())

Dataset shape: (2000, 2)
Categories: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


In [6]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

data['Data'] = data['Data'].apply(preprocess_text)

In [7]:
# Feature Extraction using TF-IDF
X = data['Data']
y = data['Labels']

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Naive Bayes Model for Text Classification

In [9]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [10]:
# Naive Bayes Model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

MultinomialNB()

In [11]:
# Predictions
y_pred = nb_classifier.predict(X_test)

In [12]:
# Evaluation
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.6875


In [13]:
print("Precision:", precision_score(y_test, y_pred, average='weighted'))

Precision: 0.7391975292827498


In [14]:
print("Recall:", recall_score(y_test, y_pred, average='weighted'))

Recall: 0.6875


In [15]:
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

F1 Score: 0.6841358271848533


In [16]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.45      0.72      0.55        18
           comp.graphics       0.68      0.72      0.70        18
 comp.os.ms-windows.misc       0.80      0.73      0.76        22
comp.sys.ibm.pc.hardware       0.74      0.56      0.64        25
   comp.sys.mac.hardware       0.59      0.62      0.60        21
          comp.windows.x       0.82      0.72      0.77        25
            misc.forsale       0.83      0.56      0.67        18
               rec.autos       0.85      0.94      0.89        18
         rec.motorcycles       0.57      0.81      0.67        16
      rec.sport.baseball       0.76      0.89      0.82        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.78      0.74      0.76        19
         sci.electronics       0.41      0.56      0.47        16
                 sci.med       0.62      0.76      

# Sentiment Analysis

In [17]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Sony\AppData\Roaming\nltk_data...


True

In [18]:
# Sentiment Analysis
nltk.download('vader_lexicon')  # Download VADER lexicon
sid = SentimentIntensityAnalyzer()
sentiments = []
for post in data['Data']:
    sentiment_score = sid.polarity_scores(post)
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    sentiments.append(sentiment)

data['Sentiment'] = sentiments

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Sony\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
# Evaluate sentiment distribution across different categories
sentiment_category_distribution = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("\nSentiment Distribution Across Categories:")
print(sentiment_category_distribution)


Sentiment Distribution Across Categories:
Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                     42        1        57
comp.graphics                   14        2        84
comp.os.ms-windows.misc         25        1        74
comp.sys.ibm.pc.hardware        20        3        77
comp.sys.mac.hardware           27        3        70
comp.windows.x                  19        3        78
misc.forsale                     8       11        81
rec.autos                       26        2        72
rec.motorcycles                 32        3        65
rec.sport.baseball              26        2        72
rec.sport.hockey                33        1        66
sci.crypt                       27        2        71
sci.electronics                 19        4        77
sci.med                         34        1        65
sci.space                       34        4        62
soc.religion.christian          26     