In [1]:
import pandas as pd 
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv("C:\\Users\\HP\\Downloads\\blogs.csv")

In [3]:
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


# 1. Data Exploration and Preprocessing

In [4]:
# Explore categories distribution
category_distribution = df['Labels'].value_counts()
category_distribution

rec.sport.hockey            100
talk.politics.misc          100
rec.autos                   100
sci.med                     100
sci.space                   100
comp.windows.x              100
talk.politics.mideast       100
misc.forsale                100
soc.religion.christian      100
comp.sys.mac.hardware       100
sci.electronics             100
comp.os.ms-windows.misc     100
talk.religion.misc          100
rec.sport.baseball          100
comp.sys.ibm.pc.hardware    100
comp.graphics               100
sci.crypt                   100
talk.politics.guns          100
alt.atheism                 100
rec.motorcycles             100
Name: Labels, dtype: int64

In [5]:
# Explore text length distribution
df['Text_length'] = df['Data'].apply(lambda x: len(x.split()))
df['Text_length'].describe()


count     2000.000000
mean       314.166000
std        539.210333
min         31.000000
25%        127.000000
50%        194.000000
75%        319.000000
max      10384.000000
Name: Text_length, dtype: float64

In [6]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text


In [8]:
# Apply preprocessing
df['Cleaned_Text'] = df['Data'].apply(preprocess_text)

In [9]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
x = tfidf.fit_transform(df['Cleaned_Text']).toarray()
y = df['Labels']


# 2. Naive Bayes Model for Text Classification

In [10]:
# Split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


In [11]:
x_train

array([[0.        , 0.        , 0.        , ..., 0.33774788, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [12]:
y_train

968          rec.sport.baseball
240     comp.os.ms-windows.misc
819             rec.motorcycles
692                misc.forsale
420       comp.sys.mac.hardware
                 ...           
1130                  sci.crypt
1294            sci.electronics
860             rec.motorcycles
1459                  sci.space
1126                  sci.crypt
Name: Labels, Length: 1600, dtype: object

In [13]:
x_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
y_test

1860          talk.politics.misc
353     comp.sys.ibm.pc.hardware
1333                     sci.med
905           rec.sport.baseball
1289             sci.electronics
                  ...           
965           rec.sport.baseball
1284             sci.electronics
1739       talk.politics.mideast
261      comp.os.ms-windows.misc
535               comp.windows.x
Name: Labels, Length: 400, dtype: object

In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [16]:
# Initialize Naive Bayes classifier
nb_classifier = MultinomialNB()

In [17]:
# Train the model
nb_classifier.fit(x_train, y_train)

In [18]:
# Make predictions
y_pred = nb_classifier.predict(x_test)

In [19]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall:", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8225
Precision: 0.8276176687627891
Recall: 0.8225
F1 Score: 0.8171026916205228

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.50      0.83      0.62        18
           comp.graphics       0.79      0.83      0.81        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.76      0.76      0.76        25
   comp.sys.mac.hardware       0.83      0.90      0.86        21
          comp.windows.x       0.91      0.84      0.87        25
            misc.forsale       0.82      0.78      0.80        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.77      0.94      0.85        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.95      0.95      0.95        19
         sci.electronics 

# 3. Sentiment Analysis

In [20]:
pip install textblob

Note: you may need to restart the kernel to use updated packages.


ERROR: Exception:
Traceback (most recent call last):
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 105, in _run_wrapper
    status = _inner_run()
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_internal\cli\base_command.py", line 96, in _inner_run
    return self.run(options, args)
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_internal\cli\req_command.py", line 67, in wrapper
    return func(self, options, args)
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_internal\commands\install.py", line 483, in run
    installed_versions[distribution.canonical_name] = distribution.version
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_internal\metadata\pkg_resources.py", line 192, in version
    return parse_version(self._dist.version)
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_vendor\packaging\version.py", line 56, in parse
    return Version(version)
  File "C:\Users\HP\anaconda3\lib\site-packages\pip\_vendor\packagin

In [21]:
from textblob import TextBlob

In [22]:
# Sentiment analysis function
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [23]:
# Apply sentiment analysis
df['Sentiment'] = df['Cleaned_Text'].apply(get_sentiment)

In [24]:
# Distribution of sentiments
sentiment_distribution = df['Sentiment'].value_counts()
print(sentiment_distribution)

Positive    1452
Negative     545
Neutral        3
Name: Sentiment, dtype: int64


In [25]:
# Group by category and sentiment
category_sentiment = df.groupby(['Labels', 'Sentiment']).size().unstack().fillna(0)
print(category_sentiment)

Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                   35.0      0.0      65.0
comp.graphics                 27.0      0.0      73.0
comp.os.ms-windows.misc       24.0      0.0      76.0
comp.sys.ibm.pc.hardware      19.0      0.0      81.0
comp.sys.mac.hardware         26.0      0.0      74.0
comp.windows.x                20.0      2.0      78.0
misc.forsale                  21.0      0.0      79.0
rec.autos                     24.0      0.0      76.0
rec.motorcycles               28.0      0.0      72.0
rec.sport.baseball            35.0      0.0      65.0
rec.sport.hockey              40.0      0.0      60.0
sci.crypt                     19.0      0.0      81.0
sci.electronics               25.0      0.0      75.0
sci.med                       34.0      0.0      66.0
sci.space                     28.0      0.0      72.0
soc.religion.christian        25.0      0.0      75.0
talk.politics.guns          