# Naive Bayes for Text Classification:

## Bayes Using SciKit Learn:

In [133]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import MultinomialNB
from tqdm import tqdm

## Bayes to check mail spam:

Given an e-mail message, determine wheather such message is Spam or Ham:

In [114]:
data = pd.read_csv("https://raw.githubusercontent.com/amankharwal/SMS-Spam-Detection/master/spam.csv", encoding= 'latin-1')
data = data[["class", "message"]]
data

Unnamed: 0,class,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Using Bernoulli Naive Bayes:

In [104]:
x = np.array(data["message"])
y = np.array(data["class"])

cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = BernoulliNB(binarize=0.0)
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.97847533632287

### Using Gaussian Naive Bayes:

In [58]:
model = ComplementNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.9641255605381166

### Using Multinomial Naive Bayes:

In [59]:
model = MultinomialNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.97847533632287

## Bayes for IMDB:

Given a Review of a Film from IMDB determine if such review is positive:

In [60]:
df = pd.read_csv('IMDB_Dataset.csv')
#https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews


In [61]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Using Bernoulli Naive Bayes:

In [69]:
x = np.array(df["review"])
y = np.array(df["sentiment"])

In [70]:

cv = CountVectorizer()
x = cv.fit_transform(x)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = BernoulliNB(binarize=0.0)
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.8532

### Using Gaussian Naive Bayes:

In [66]:
model = ComplementNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.8487

### Using Multinomial Naive Bayes:

In [67]:
model = MultinomialNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.8487

# Bayes for web identification:

In [172]:
df = pd.read_csv('url_classification.csv', names = ["Link", "Type"])
# https://www.kaggle.com/datasets/shaurov/website-classification-using-url
df = df.dropna()

Given a web page, determine the category of web page it is, out of the following categories:

In [205]:
df.Type.unique()

array(['Adult', 'Arts', 'Business', 'Computers', 'Games', 'Health',
       'Home', 'Kids', 'News', 'Recreation', 'Reference', 'Science',
       'Shopping', 'Society', 'Sports'], dtype=object)

In [206]:
df

Unnamed: 0,Link,Type
1,http://www.liquidgeneration.com/,Adult
2,http://www.onlineanime.org/,Adult
3,http://www.ceres.dti.ne.jp/~nekoi/senno/senfir...,Adult
4,http://www.galeon.com/kmh/,Adult
5,http://www.fanworkrecs.com/,Adult
...,...,...
1562974,http://www.maxpreps.com/,Sports
1562975,http://www.myscore.com/,Sports
1562976,http://sportsillustrated.cnn.com/highschool,Sports
1562977,http://rss.cnn.com/rss/si_highschool?format=xml,Sports


In [207]:
x = np.array(df["Link"])
y = np.array(df["Type"])

**Using my own defined features** which only selects the amount of times a character form the english alphabet appears:

In [183]:

def convert_to_vec(dataframe):
    data = [[]]
    for i in tqdm(range(len(x))):
        string_feature = np.zeros(26)
        for j in range(len(x[i])):
            if x[i][j].isalpha and (ord(x[i][j].lower())-ord('a') >= 0) and (ord(x[i][j].lower())-ord('a') < 26):
                string_feature[ord(x[i][j].lower())-ord('a')] += 1
        data.append(string_feature)
    return data
            
        
    

In [184]:
x = convert_to_vec(x)
x.pop(0)

100%|██████████| 1562975/1562975 [01:29<00:00, 17543.45it/s]


In [201]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = BernoulliNB(binarize=0.0)
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.25447943825077174

In [202]:
model = ComplementNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.2558134327164542

In [203]:
model = MultinomialNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.26612069930741056

### Using CountVectorizer:
Which selects any character, indidistinctive of the type of character it is:

In [208]:
cv = CountVectorizer(token_pattern=r'.')
x = cv.fit_transform(x)

In [209]:
x

<1562975x71 sparse matrix of type '<class 'numpy.int64'>'
	with 28293983 stored elements in Compressed Sparse Row format>

### Using Bernoulli Naive Bayes:

In [213]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

model = BernoulliNB(binarize=0.0)
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.25787680545114283

### Using Gaussian Naive Bayes:

In [214]:
model = ComplementNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.2378604904109151

### Using Multinomial Naive Bayes:

In [215]:
model = MultinomialNB()
model.fit(xtrain, ytrain)
model.score(xtest, ytest)

0.25957868807882406