In [4]:
import pandas as pd

# Load the dataset
file_path = '/content/blogs.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset
print(data.head())

# Check for any missing values
print(data.isnull().sum())

# Check the distribution of categories
print(data['Labels'].value_counts())


                                                Data       Labels
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism
Data      0
Labels    0
dtype: int64
Labels
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.

In [13]:
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords


# Download the stopwords corpus
#nltk.download('stopwords')

# Function to clean the text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    return text

# Apply text cleaning
data['Data'] = data['Data'].apply(clean_text)

# Tokenization and stopwords removal
#stop_words = set(stopwords.words('english'))  # Now you can use stopwords
#data['Data'] = data['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['Data']).toarray()
y = data['Labels']

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Make predictions
y_pred = nb.predict(X_test)

# Evaluate the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred, average="weighted")}')
print(f'Recall: {recall_score(y_test, y_pred, average="weighted")}')
print(f'F1-Score: {f1_score(y_test, y_pred, average="weighted")}')
print(classification_report(y_test, y_pred))


Accuracy: 0.795
Precision: 0.8261387605353611
Recall: 0.795
F1-Score: 0.7893284079184193
                          precision    recall  f1-score   support

             alt.atheism       0.50      0.89      0.64        18
           comp.graphics       0.88      0.78      0.82        18
 comp.os.ms-windows.misc       0.95      0.91      0.93        22
comp.sys.ibm.pc.hardware       0.80      0.64      0.71        25
   comp.sys.mac.hardware       0.81      0.62      0.70        21
          comp.windows.x       1.00      0.80      0.89        25
            misc.forsale       1.00      0.67      0.80        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.80      0.89      0.84        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.83      1.00      0.90        19
         sci.electronics       0.43      0.75      0

In [15]:
from textblob import TextBlob

# Function to determine sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'Positive'
    elif analysis.sentiment.polarity < 0:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment analysis
data['Sentiment'] = data['Data'].apply(get_sentiment)

# Check the distribution of sentiments
print(data['Sentiment'].value_counts())

# Examine the distribution of sentiments across categories
sentiment_by_category = pd.crosstab(data['Labels'], data['Sentiment'])
print(sentiment_by_category)


Sentiment
Positive    1510
Negative     489
Neutral        1
Name: count, dtype: int64
Sentiment                 Negative  Neutral  Positive
Labels                                               
alt.atheism                     31        0        69
comp.graphics                   25        0        75
comp.os.ms-windows.misc         22        0        78
comp.sys.ibm.pc.hardware        20        0        80
comp.sys.mac.hardware           23        0        77
comp.windows.x                  20        1        79
misc.forsale                    18        0        82
rec.autos                       22        0        78
rec.motorcycles                 27        0        73
rec.sport.baseball              34        0        66
rec.sport.hockey                37        0        63
sci.crypt                       21        0        79
sci.electronics                 20        0        80
sci.med                         30        0        70
sci.space                       27        0      