In [2]:
# features -> represent text as a vector 
# vectors -> we can assign several features for each word -> used for classification
# each feature can have a binary/decimal value associated to it -> 1,0,...
# example -> there are 2 features -> location , person -> example -> dhoni -> person : 1 , location : 0
# this can be represented in a form of a vector -> [1,0] ; associated with ['person','location']

In [3]:
# called text representation
# representing text as vector -> vector space model
# approaches to convert into vector 
# 1. one hot encoding
# 2. bag of words
# 3. Tf-Idf
# 4. Word embedding
# 5. Label Encoding

# ***Label and One Hot Encoding***

In [1]:
# NLP -> raw Text => Vector => ML

# label encoding -> similar to ml categorical vars
# one hot encoding -> """"""""""""""""""""""""""""

# in NLP , people dont use -> disadv :
# doesnt capture meaning of word 
# very much memory required to store many columns if number of unique words high 
# no fixed length representation 


# ***Bag Of Words***

In [None]:
# col of unique words -> each row -> each doc -> each [row][col] -> number of occurences of col's label in doc of row

In [None]:
# first build vocabulary -> containing unique words -> representing cols
# limitations -> if words very much : much memory

In [4]:
import pandas as pd
import numpy as np

In [8]:
df = pd.read_csv("spam.csv")

In [9]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [11]:
# encoding spam , ham
df['spam']=df['Category'].apply(lambda x : 1 if x=='spam' else 0)

In [12]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [14]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(df.Message,df.spam,test_size=0.2)

In [15]:
X_train.head()

1279    Can i meet ü at 5.. As 4 where depends on wher...
52      K fyi x has a ride early tomorrow morning but ...
825                             Have a good evening! Ttyl
3037                     ;-) ok. I feel like john lennon.
1729    As per your request 'Maangalyam (Alaipayuthe)'...
Name: Message, dtype: object

In [16]:
X_train.shape

(4457,)

In [17]:
X_test.shape

(1115,)

In [18]:
type(X_train)

pandas.core.series.Series

In [19]:
type(y_train)

pandas.core.series.Series

In [21]:
# building bag of words model ->
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train.values)
X_train_cv

<4457x7784 sparse matrix of type '<class 'numpy.int64'>'
	with 59810 stored elements in Compressed Sparse Row format>

In [22]:
X_train_cv.toarray()[:2][0]

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
X_train_cv.shape

(4457, 7784)

In [24]:
cv.vocabulary_

{'can': 1647,
 'meet': 4461,
 'at': 1124,
 'as': 1093,
 'where': 7523,
 'depends': 2272,
 'on': 4973,
 'wan': 7411,
 'in': 3674,
 'lor': 4249,
 'fyi': 3085,
 'has': 3372,
 'ride': 5824,
 'early': 2526,
 'tomorrow': 6989,
 'morning': 4625,
 'but': 1589,
 'he': 3396,
 'crashing': 2080,
 'our': 5046,
 'place': 5270,
 'tonight': 6998,
 'have': 3387,
 'good': 3211,
 'evening': 2686,
 'ttyl': 7096,
 'ok': 4958,
 'feel': 2828,
 'like': 4152,
 'john': 3870,
 'lennon': 4117,
 'per': 5189,
 'your': 7754,
 'request': 5764,
 'maangalyam': 4327,
 'alaipayuthe': 902,
 'been': 1298,
 'set': 6076,
 'callertune': 1632,
 'for': 2970,
 'all': 919,
 'callers': 1631,
 'press': 5438,
 'to': 6966,
 'copy': 2031,
 'friends': 3037,
 'famous': 2792,
 'quote': 5580,
 'when': 7519,
 'you': 7750,
 'develop': 2299,
 'the': 6844,
 'ability': 757,
 'listen': 4186,
 'anything': 1009,
 'unconditionally': 7169,
 'without': 7591,
 'losing': 4253,
 'temper': 6791,
 'or': 5014,
 'self': 6040,
 'confidence': 1978,
 'it': 37

In [25]:
X_train_np = X_train_cv.toarray()
X_train_np[:4]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [28]:
np.where(X_train_np[0]!=0)

(array([1093, 1124, 1647, 2272, 3674, 4249, 4461, 4973, 7411, 7523]),)

In [30]:
 # building model ->
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_cv,y_train)

In [31]:
cv.transform(X_test)

<1115x7784 sparse matrix of type '<class 'numpy.int64'>'
	with 13290 stored elements in Compressed Sparse Row format>

In [32]:
X_test_cv = cv.transform(X_test)

In [34]:
from sklearn.metrics import classification_report
y_pred = model.predict(X_test_cv)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       981
           1       0.96      0.95      0.95       134

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [35]:
# hue hue , greamt accuramcy 

In [36]:
# easier way , using sklearn pipeline
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])
# pipeline is simply a api

In [37]:
clf.fit(X_train,y_train)

In [39]:
y_pred2 = clf.predict(X_test)
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       981
           1       0.96      0.95      0.95       134

    accuracy                           0.99      1115
   macro avg       0.98      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



Smol Exercise

In [72]:
df2 = pd.read_csv("IMDB_Dataset.csv",engine="python",
error_bad_lines=False,)
df2.head()



  df2 = pd.read_csv("IMDB_Dataset.csv",engine="python",
Skipping line 19011: unexpected end of data


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [75]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df2.review, df2.sentiment, test_size=0.2)

In [76]:
from sklearn.ensemble import RandomForestClassifier
clf2 = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('rf',RandomForestClassifier(n_estimators=50,criterion='entropy'))
])

In [77]:
clf2.fit(X_train,y_train)

In [79]:
y_pred_1 = clf2.predict(X_test)
print(classification_report(y_test,y_pred_1))

              precision    recall  f1-score   support

    negative       0.84      0.84      0.84      1968
    positive       0.83      0.83      0.83      1834

    accuracy                           0.84      3802
   macro avg       0.84      0.84      0.84      3802
weighted avg       0.84      0.84      0.84      3802



In [80]:
from sklearn.neighbors import KNeighborsClassifier
clf3 = Pipeline([
    ('vectoriz',CountVectorizer()),
    ('knn',KNeighborsClassifier(n_neighbors=10,metric='euclidean'))
])

In [81]:
clf3.fit(X_train,y_train)

In [82]:
y_pred_2 = clf3.predict(X_test)
print(classification_report(y_test,y_pred_2))

              precision    recall  f1-score   support

    negative       0.66      0.65      0.66      1968
    positive       0.63      0.64      0.64      1834

    accuracy                           0.65      3802
   macro avg       0.65      0.65      0.65      3802
weighted avg       0.65      0.65      0.65      3802



In [83]:
from sklearn.naive_bayes import MultinomialNB
clf4 = Pipeline([
    ('vec',CountVectorizer()),
    ('nb',MultinomialNB())
])

In [84]:
clf4.fit(X_train,y_train)

In [85]:
y_pred_3 = clf4.predict(X_test)

In [86]:
print(classification_report(y_test,y_pred_3))

              precision    recall  f1-score   support

    negative       0.84      0.87      0.86      1968
    positive       0.86      0.82      0.84      1834

    accuracy                           0.85      3802
   macro avg       0.85      0.85      0.85      3802
weighted avg       0.85      0.85      0.85      3802



In this process, we convert text into a very high dimensional numeric vector using the technique of Bag of words.

Model like K-Nearest Neighbours(KNN) doesn't work well with high dimensional data because with large number of dimensions, it becomes difficult for the algorithm to calculate distance in each dimension. In higher dimensional space, the cost to calculate distance becomes expensive and hence impacts the performance of model.