## Text Representation Using OneHotEncoding

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('spam.csv')

In [None]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [None]:
df_labels = pd.get_dummies(df['Category'],drop_first = 'true')

In [None]:
df = pd.concat([df,df_labels],axis=1)


Unnamed: 0,Message,spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
5567,This is the 2nd time we have tried 2 contact u...,1
5568,Will ü b going to esplanade fr home?,0
5569,"Pity, * was in mood for that. So...any other s...",0
5570,The guy did some bitching but I acted like i'd...,0


In [None]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x = df['Message']
y = df['spam']

In [None]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vector = CountVectorizer()
cv = vector.fit_transform(x_train.values)

In [None]:
# it will give vocabulary words
vector.get_feature_names_out()[1000:1050]

array(['anthony', 'anti', 'antibiotic', 'any', 'anybody', 'anyhow',
       'anymore', 'anyone', 'anyones', 'anyplaces', 'anythiing',
       'anythin', 'anything', 'anythingtomorrow', 'anytime', 'anyway',
       'anyways', 'anywhere', 'aom', 'apart', 'apartment', 'apes',
       'apeshit', 'aphex', 'apnt', 'apo', 'apologetic', 'apologise',
       'apologize', 'apology', 'app', 'apparently', 'appeal', 'appear',
       'applausestore', 'applebees', 'apples', 'application', 'apply',
       'applyed', 'applying', 'appointment', 'appointments', 'appreciate',
       'appreciated', 'approaches', 'approaching', 'appropriate',
       'approve', 'approved'], dtype=object)

In [None]:
vector.vocabulary_

In [None]:
model = MultinomialNB()
model.fit(cv,y_train)

In [None]:
from sklearn.metrics import classification_report

In [None]:
#if data set is imbalanced then see f1 score rather than accuracy

In [None]:
x_test_cv = vector.transform(x_test)
y_pred = model.predict(x_test_cv)
model.score(x_test_cv,y_test)

0.9874439461883409

In [None]:
report = classification_report(y_pred,y_test)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       988
           1       0.93      0.96      0.95       127

    accuracy                           0.99      1115
   macro avg       0.96      0.98      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [None]:
emails = ["upto 20% discount it's free for you"]

In [None]:
model.predict(vector.transform(emails))

array([1], dtype=uint8)

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [None]:
clf.fit(x_train,y_train)

In [None]:
y_pred= clf.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       984
           1       0.96      0.93      0.95       131

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



## Bag of words exercise

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from  sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [46]:
df = pd.read_csv('imdb.csv')

In [47]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [49]:
df.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [50]:
df.shape

(50000, 2)

In [53]:
# positive = len(df[df['sentiment_new']==1])
# negative = len(df['sentiment_new'])-positive
# positive,negative

In [54]:
x = df['review']
df['sentiment_new'] = df['sentiment'].apply(lambda x: 1 if x=='positive' else 0)
y = df['sentiment_new']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
model = RandomForestClassifier(n_estimators=50,criterion='entropy')
pipe = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('rf',model)
])

In [55]:
df.head(3)

Unnamed: 0,review,sentiment,sentiment_new
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1


In [57]:
pipe.fit(x_train,y_train)
y_pred = pipe.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      5007
           1       0.84      0.84      0.84      4993

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [60]:
nb_pipe = Pipeline([
    ('v',CountVectorizer()),
    ('nb',MultinomialNB())
])
nb_pipe.fit(x_train,y_train)
y_pred = nb_pipe.predict(x_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      5007
           1       0.88      0.82      0.85      4993

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [74]:
nb_pipe.predict(['the movie was bad '])
# print('Negative' if nb_pipe.predict(['I like the movie'])==0 else 'Positive')

array([0])