In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import re
import warnings 
warnings.filterwarnings('ignore')

In [80]:
import spacy
import wordcloud
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize,word_tokenize

# Data Exploration and Preprocessing

In [201]:
# Load the posts into a DataFrame; the rendered frame shows a text column "Data" and a target column "Labels".
# NOTE(review): this is an absolute path on one specific Windows machine, so the notebook cannot be
# re-run elsewhere without editing it — consider a path relative to the notebook / a configurable data dir.
data = pd.read_csv(r"C:\Users\reddy\Downloads\data assingnment files\Assingnment files\Completed\NLP and Naive Bayes\blogs.csv")
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


## Preprocess the data by cleaning the text 

In [84]:
# Load spaCy's 'en_core_web_sm' pipeline once; the text-cleaning helpers call `nlp(...)` on every document.
nlp= spacy.load('en_core_web_sm')

In [85]:
def clean_words(text):
    """Turn one document into a list of cleaned lemmas.

    Intended to be passed as the ``analyzer`` callable of ``CountVectorizer``,
    so it receives a raw document and returns the tokens to be counted.

    Parameters
    ----------
    text : str or any
        The raw document. Non-string values are coerced with ``str()`` (the
        same convention ``clean_words1`` uses) instead of raising inside
        ``re.findall``.

    Returns
    -------
    list of str
        Lemmas of the tokens that are not stop words, punctuation, brackets,
        digits, currency symbols or whitespace.
    """
    # Raw string: '\w' inside a normal literal is an invalid escape sequence
    # (DeprecationWarning on older Pythons, SyntaxWarning from 3.12).
    # Keeping only \w+ runs strips punctuation/symbols before spaCy sees the text.
    words_only = ' '.join(re.findall(r'\w+', str(text)))
    doc = nlp(words_only)
    clean_text = [
        token.lemma_
        for token in doc
        if not (
            token.is_stop
            or token.is_punct
            or token.is_bracket
            or token.is_digit
            or token.is_currency
            or token.is_space
        )
    ]
    return clean_text

In [86]:
# Count vectoriser whose analysis step is delegated entirely to clean_words (one call per document).
count= CountVectorizer(analyzer=clean_words)

In [87]:
# Fit the vectoriser on every post in the "Data" column and build the count matrix.
x= count.fit_transform(data['Data'])

In [88]:
# TF-IDF transformer (default settings) to re-weight the raw counts.
tfidf= TfidfTransformer()

In [89]:
# TF-IDF weighted feature matrix for all posts.
# Note: despite its name, `y` holds features here; the labels are data['Labels'].
y= tfidf.fit_transform(x)

## Naive Bayes model

In [91]:
# Multinomial Naive Bayes classifier with default hyper-parameters.
multi=MultinomialNB()

In [92]:
# Fit on the TF-IDF features of the whole dataset (no hold-out split at this point).
multi.fit(y,data['Labels'])

In [174]:
# Predict on the same rows the model was just fitted on.
y_pred= multi.predict(y)

In [176]:
# In-sample (training) accuracy: the model has already seen these rows, so this is an
# optimistic figure and not an estimate of performance on unseen posts.
accuracy_score(data['Labels'],y_pred)

0.9835

# Naive Bayes Model for Text Classification

In [179]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Split the data into training and test sets

In [182]:
# Hold out 20% of the posts for testing; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the two splits — confirm this is intended.
X_train,X_test,Y_train,Y_test = train_test_split(data['Data'],data['Labels'],train_size=0.8,random_state=50)

In [184]:
# Chain vectorising, TF-IDF weighting and the classifier so that every step is
# fitted on the training split only when pipeline.fit(...) is called.
pipeline = Pipeline(
    [
        ('count', CountVectorizer(analyzer=clean_words)),
        ('tfidf', TfidfTransformer()),
        ('multi', MultinomialNB()),
    ]
)

In [186]:
# Fit all pipeline steps on the training posts and their labels.
pipeline.fit(X_train,Y_train)

In [187]:
# Predict labels for the held-out posts.
# Note: this rebinds `y_pred`, replacing the earlier predictions made on the full dataset.
y_pred= pipeline.predict(X_test)

In [188]:
# Accuracy on the held-out test split.
accuracy_score(Y_test,y_pred)

0.8325

# Sentiment Analysis

In [124]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [156]:
def clean_words1(text):
    """Return a cleaned, space-joined string of lemmas for one document.

    The input is coerced to ``str``, reduced to its ``\\w+`` runs, passed
    through the spaCy pipeline, and every token that is a stop word or
    punctuation is dropped. The lemmas of the remaining tokens are joined
    with single spaces.
    """
    words_only = ' '.join(re.findall(r'\w+', str(text)))  # non-strings are stringified first
    kept_lemmas = []
    for tok in nlp(words_only):
        # Skip stop words and punctuation; keep everything else.
        if tok.is_stop or tok.is_punct:
            continue
        kept_lemmas.append(tok.lemma_)
    return ' '.join(kept_lemmas)

In [158]:
# Add a 'Cleaned_Data' column to `data` holding the cleaned, space-joined lemmas of each post.
# This runs the spaCy pipeline once per row.
data['Cleaned_Data'] = data['Data'].apply(clean_words1)

In [159]:
# NLTK's VADER sentiment analyser; one instance is reused for every post.
sia = SentimentIntensityAnalyzer()

In [162]:
# Score every cleaned post; each cell of 'Sentiment' holds the dict returned by polarity_scores.
# NOTE(review): 'Cleaned_Data' has stop words and punctuation removed; VADER is presumably
# sensitive to negations and punctuation, so scoring the raw text may be more faithful — confirm.
data['Sentiment'] = data['Cleaned_Data'].apply(sia.polarity_scores)

In [164]:
# Display the per-post polarity-score dictionaries.
data['Sentiment'] 

0       {'neg': 0.176, 'neu': 0.653, 'pos': 0.171, 'co...
1       {'neg': 0.029, 'neu': 0.868, 'pos': 0.102, 'co...
2       {'neg': 0.208, 'neu': 0.733, 'pos': 0.06, 'com...
3       {'neg': 0.28, 'neu': 0.566, 'pos': 0.154, 'com...
4       {'neg': 0.027, 'neu': 0.869, 'pos': 0.104, 'co...
                              ...                        
1995    {'neg': 0.069, 'neu': 0.786, 'pos': 0.145, 'co...
1996    {'neg': 0.019, 'neu': 0.852, 'pos': 0.129, 'co...
1997    {'neg': 0.0, 'neu': 0.895, 'pos': 0.105, 'comp...
1998    {'neg': 0.074, 'neu': 0.74, 'pos': 0.186, 'com...
1999    {'neg': 0.148, 'neu': 0.738, 'pos': 0.114, 'co...
Name: Sentiment, Length: 2000, dtype: object

In [166]:
## The output above is the sentiment analysis result: a dictionary of VADER polarity scores for each post.

# Evaluation

In [170]:
from sklearn.metrics import f1_score,precision_score,recall_score

In [196]:
# Weighted-average precision, recall and F1 (computed in that order) for the
# current `y_pred` against `Y_test`; every metric gets identical arguments.
precision, recall, f1 = (
    metric(Y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')

Precision: 0.8530061826727061
Recall: 0.8325
F1-Score: 0.8335094453047459
