<a href="https://colab.research.google.com/github/amanullahshah32/NLP-with-SpaCy/blob/main/NLP_Basic/Bag_of_Words_(BOW).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
spam_csv = 'https://raw.githubusercontent.com/codebasics/nlp-tutorials/refs/heads/main/9_bag_of_words/spam.csv'
df = pd.read_csv(spam_csv)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [4]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0 )

In [5]:
df.shape

(5572, 3)

## Train Test Split

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size = 0.2)

In [7]:
X_train.shape

(4457,)

In [8]:
X_train

Unnamed: 0,Message
573,Can you open the door?
1748,I think u have the wrong number.
4158,This single single answers are we fighting? Pl...
3090,What Today-sunday..sunday is holiday..so no wo...
4767,Whens your radio show?
...,...
4437,"House-Maid is the murderer, coz the man was mu..."
4699,Don no da:)whats you plan?
1754,Jus came back fr lunch wif my sis only. U leh?
2681,Solve d Case : A Man Was Found Murdered On &l...


In [9]:
X_test.shape

(1115,)

In [10]:
X_train[:4]

Unnamed: 0,Message
573,Can you open the door?
1748,I think u have the wrong number.
4158,This single single answers are we fighting? Pl...
3090,What Today-sunday..sunday is holiday..so no wo...


In [11]:
y_train[:5]

Unnamed: 0,spam
573,0
1748,0
4158,0
3090,0
4767,0


In [12]:
y_train.value_counts()

Unnamed: 0_level_0,count
spam,Unnamed: 1_level_1
0,3864
1,593


## Create bag of words representation using CountVector

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
v2 = CountVectorizer()

X_train_count_vec = v.fit_transform(X_train.values)
X_train_count_vec

<4457x7655 sparse matrix of type '<class 'numpy.int64'>'
	with 59341 stored elements in Compressed Sparse Row format>

In [41]:
X_train.values

array(['Can you open the door?', 'I think u have the wrong number.',
       'This single single answers are we fighting? Plus i said am broke and you didnt reply',
       ..., 'Jus came back fr lunch wif my sis only. U leh?',
       "Solve d Case : A Man Was Found Murdered On  &lt;DECIMAL&gt; . &lt;#&gt;  AfterNoon. 1,His wife called Police. 2,Police questioned everyone. 3,Wife: Sir,I was sleeping, when the murder took place. 4.Cook: I was cooking. 5.Gardener: I was picking vegetables. 6.House-Maid: I went 2 d post office. 7.Children: We went 2 play. 8.Neighbour: We went 2 a marriage. Police arrested d murderer Immediately. Who's It? Reply With Reason, If U r Brilliant.",
       "What's up bruv, hope you had a great break. Do have a rewarding semester."],
      dtype=object)

### Example of Bag of Words

In [29]:
X_train_2 = ["I love NLP", "NLP is great"]
trying = v2.fit_transform(X_train_2)
print(trying)

  (0, 2)	1
  (0, 3)	1
  (1, 3)	1
  (1, 1)	1
  (1, 0)	1


In [30]:
X_train_2 = ["I love NLP", "NLP is great"]

# Fit and transform the data
trying = v2.fit_transform(X_train_2)

# Convert sparse matrix to dense matrix and print
print(trying.toarray())


[[0 0 1 1]
 [1 1 0 1]]


In [31]:
import pandas as pd

# Create DataFrame with the vocabulary as column names
df = pd.DataFrame(trying.toarray(), columns=v.get_feature_names_out())
print(df)


   great  is  love  nlp
0      0   0     1    1
1      1   1     0    1


In [32]:
X_train_count_vec.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [33]:
X_train_count_vec.shape

(4457, 7655)

In [45]:
len(v.get_feature_names_out())

7655

In [46]:
v.get_feature_names_out()

array(['00', '000', '000pes', ..., 'zyada', 'ú1', '〨ud'], dtype=object)

In [52]:
v.vocabulary

In [50]:
max(v.vocabulary_), min(v.vocabulary_)

('〨ud', '00')

In [54]:
X_train_np = X_train_count_vec.toarray()
X_train_np[1]

array([0, 0, 0, ..., 0, 0, 0])

In [55]:
np.where(X_train_np[0]!=0)

(array([1602, 2373, 4903, 6721, 7618]),)

In [59]:
X_train[:4]

Unnamed: 0,Message
573,Can you open the door?
1748,I think u have the wrong number.
4158,This single single answers are we fighting? Pl...
3090,What Today-sunday..sunday is holiday..so no wo...


In [60]:
X_train_np[0][1771]

0

## **Train the naive bayes model**

In [62]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_count_vec, y_train)

In [63]:
X_test_count_vec = v.transform(X_test)

## **Evaluate Performance**

In [65]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_count_vec)

In [66]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       961
           1       0.97      0.94      0.95       154

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115

