<a href="https://colab.research.google.com/github/anshupandey/Working_with_Large_Language_models/blob/main/WWL_C2_text_classification_with_Probabilistic_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Classification with TF-IDF

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location of the labelled sentiment dataset (a CSV file hosted on GitHub).
url = "https://raw.githubusercontent.com/anshupandey/Working_with_Large_Language_models/main/sentimentdata.csv"

# Read the CSV straight from the URL, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative


In [23]:
# Display the loaded DataFrame as this cell's output.
df

Unnamed: 0,document,label
0,Pizza is great and I love pizza.,Positive
1,I hate burger and its bad to eat burger.,Negative
2,I hate dirty tables.,Negative
3,Burger is amazing and I love it more than anyt...,Positive
4,My boss is a monster and I hate him,Negative
5,The food was delivered late and I hate late de...,Negative
6,My wife love pizza and burger more than me,Positive
7,the table was bad and dirty and i hate this,Negative
8,Food was delicious and I love it,Positive
9,It great to have good food at good time,Positive


In [3]:
# (number of rows, number of columns) of the loaded data
df.shape

(20, 2)

In [4]:
# Split the frame into model inputs (raw document text) and targets (sentiment labels).
x, y = df['document'], df['label']

### Vectorization
Use TF-IDF to turn each document into a numeric feature vector.

In [24]:
# scikit-learn ships a predefined collection of English stop words
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Copy it into a list, leaving out "not" so that negation is kept as a
# token instead of being filtered away as a stop word.
sw = [word for word in ENGLISH_STOP_WORDS if word != 'not']

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer, one option per line.
vec = TfidfVectorizer(
    lowercase=True,       # fold text to lower case before tokenizing
    stop_words=sw,        # custom stop-word list (the default English list minus "not")
    ngram_range=(1, 2),   # use single words and two-word sequences as features
    min_df=2,             # ignore terms that occur in fewer than 2 documents
)

# Learn the vocabulary and IDF weights from the documents.
vec.fit(x)

In [46]:
# Number of features (unigrams and bigrams) kept by the fitted vectorizer
print(len(vec.get_feature_names_out()))

21


In [48]:
# The learned vocabulary: one term per column of the TF-IDF matrix
print(vec.get_feature_names_out())

['bad' 'burger' 'burger bad' 'dirty' 'food' 'good' 'great' 'great good'
 'hate' 'icecream' 'icecream not' 'juice' 'ketchup' 'ketchup pizza' 'love'
 'love pizza' 'not' 'not bad' 'not good' 'pizza' 'pizza great']


In [49]:
# Transform every document into its TF-IDF row vector using the
# vocabulary learned by the fitted vectorizer
xvec = vec.transform(x)
# shape is (number of documents, number of features)
print(xvec.shape)

(20, 21)


In [50]:
# Print the feature names again; their positions are the column indices
# that appear in the per-document output of the following cell
print(vec.get_feature_names_out())

['bad' 'burger' 'burger bad' 'dirty' 'food' 'good' 'great' 'great good'
 'hate' 'icecream' 'icecream not' 'juice' 'ketchup' 'ketchup pizza' 'love'
 'love pizza' 'not' 'not bad' 'not good' 'pizza' 'pizza great']


In [51]:
# Print each document followed by the non-zero TF-IDF entries of its row.
# enumerate() pairs documents with matrix rows by position; indexing the
# Series as x[i] is a label lookup and only works while the index happens
# to be the default 0..n-1.
for i, doc in enumerate(x):
  print(doc)
  print(xvec[i, :])

Pizza is great and I love pizza.
  (0, 20)	0.41746017610816727
  (0, 19)	0.6384708140943208
  (0, 15)	0.41746017610816727
  (0, 14)	0.3192354070471604
  (0, 6)	0.3766932135898451
I hate burger and its bad to eat burger.
  (0, 8)	0.3291049195285131
  (0, 2)	0.4619783881845005
  (0, 1)	0.7637411646427362
  (0, 0)	0.30816449215108926
I hate dirty tables.
  (0, 8)	0.5802108442611924
  (0, 3)	0.8144663137304786
Burger is amazing and I love it more than anything
  (0, 14)	0.679092304597332
  (0, 1)	0.7340528876291438
My boss is a monster and I hate him
  (0, 8)	1.0
The food was delivered late and I hate late delivery
  (0, 8)	0.6196463213577276
  (0, 4)	0.7848811607038583
My wife love pizza and burger more than me
  (0, 19)	0.4527500952260407
  (0, 15)	0.5920556752595143
  (0, 14)	0.4527500952260407
  (0, 1)	0.4893922556994775
the table was bad and dirty and i hate this
  (0, 8)	0.5098271029287214
  (0, 3)	0.7156657019931854
  (0, 0)	0.4773876078302704
Food was delicious and I love it
  (0, 

### Sentiment Analysis with ML

In [52]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with its default settings.
model = MultinomialNB()

# Fit the classifier on the TF-IDF matrix (features) and the sentiment labels (targets).
model.fit(X=xvec, y=y)

In [53]:
# A new sentence; "pasta" is not among the printed feature names
ip = "I love pasta"

# Vectorize with the already-fitted vectorizer (transform expects a list of
# documents) and convert the result to a dense array so it prints in full
ipvec = vec.transform([ip]).toarray()
print(ipvec)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [54]:
# Predict the sentiment label for the vector built in the previous cell
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [55]:
# Same sentence with "hate" in place of "love"
ip = "I hate pasta"

# Vectorize the sentence and predict its label
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [56]:
# Class probabilities for the most recently vectorized input (ipvec);
# the columns follow the order of model.classes_
model.predict_proba(ipvec)

array([[0.81811178, 0.18188822]])

In [57]:
# Of the words in this sentence only "good" appears in the printed feature list
ip = "Noodles are good"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [58]:
# Of the words in this sentence only "bad" appears in the printed feature list
ip = "Noodles are bad"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [59]:
# Negation check: "not", "good" and the bigram "not good" are all in the feature list
ip = "Noodles are not good"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [60]:
# Negation check: "not", "bad" and the bigram "not bad" are all in the feature list
ip = "Noodles are not bad"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Positive'], dtype='<U8')

In [61]:
# Longer sentence with unrelated words; "not" still directly precedes "good"
ip = "today is tuesday and pasta is not good"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Negative'], dtype='<U8')

In [62]:
# Same words, but "not" is now followed by "tuesday" rather than "good",
# so the sentence contains no "not good" bigram
ip = "today is not tuesday and pasta is good"
ipvec = vec.transform([ip]).toarray()
model.predict(ipvec)

array(['Positive'], dtype='<U8')