# Concept OF StopWords

Stop Words are used to remove the most common occuring elements 

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
import pandas as pd

In [3]:
vect = CountVectorizer(stop_words ='english')


In [4]:
message =["Hello Brother How are you ?","Let's plan a trip together to goa","Invite others too!"]

In [5]:
new_message = vect.fit_transform(message)

In [6]:
new_message

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [7]:
vect.get_feature_names()

['brother', 'goa', 'hello', 'invite', 'let', 'plan', 'trip']

In [8]:
vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

# Let's Work on the Restaurant Reviews 

In [9]:
dataset = pd.read_csv('datasets/Restaurant_Reviews.tsv',sep='\t')

In [10]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [11]:
x = dataset['Review'].values

In [12]:
cv = CountVectorizer()

In [13]:
y = dataset['Liked'].values

In [14]:
# let's perform the data splitting
import re

In [15]:
def clean(x):
    s = re.sub('[^A-Za-z]',' ',x)
    s = re.sub('\s+',' ',s)
    s = s.strip()
    return s.lower()


In [16]:
dataset['Review'] = dataset['Review'].apply(clean)

In [17]:
dataset['Review']

0                                   wow loved this place
1                                      crust is not good
2               not tasty and the texture was just nasty
3      stopped by during the late may bank holiday of...
4      the selection on the menu was great and so wer...
                             ...                        
995    i think food should have flavor and texture an...
996                              appetite instantly gone
997    overall i was not impressed and would not go back
998    the whole experience was underwhelming and i t...
999    then as if i hadn t wasted enough of my life t...
Name: Review, Length: 1000, dtype: object

In [18]:
# Now performing the data splitting

In [19]:
x_data = dataset['Review'].values
y_data = dataset['Liked'].values

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
xtrain,xtest,ytrain,ytest = train_test_split(x_data,y_data,test_size=0.2,random_state = 16)

In [22]:
cv_model = CountVectorizer(stop_words ='english')

In [23]:
cv_train = cv_model.fit_transform(xtrain)
cv_test = cv_model.transform(xtest)

In [24]:
# now we will pass the data in ML algo
from sklearn.linear_model import LogisticRegression

In [25]:
log = LogisticRegression()

In [26]:
log.fit(cv_train,ytrain)

LogisticRegression()

In [27]:
pred = log.predict(cv_test)

In [28]:
from sklearn.metrics import accuracy_score

In [29]:
accuracy_score(ytest,pred)

0.765

In [30]:
test = ['food is amazing']
cleaned_data =[]
for i in test:
    t = clean(i)
    cleaned_data.append(t)

In [31]:
cleaned_data

['food is amazing']

In [32]:
t1 = cv_model.transform(cleaned_data)

In [33]:
log.predict(t1)

array([1], dtype=int64)

In [34]:
# drawbacks of stopwords is that it has a word NOT in it

In [46]:
# creating a sample data
data = ["Hello Brother How are you ?","Let's plan a trip together to goa","Invite others too!","My cool friend name is Deepali","We will plan a trip together soon!","I am not interested in watching movies"]

In [38]:
# we can use nltk library to install stopwords
import nltk

In [39]:
from nltk.corpus import stopwords

In [40]:
stopwords = set(stopwords.words('english'))

In [41]:
# removing the word "not" from stopwords list
if 'not' in stopwords:
    stopwords.remove('not')

In [42]:
# confirming wether the word not is present or not
if 'not' in stopwords:
    print("Yes it's there")
else:
    print("Not there")

Not there


In [43]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
cv1 = CountVectorizer(stop_words = stopwords)

In [45]:
cv2 = CountVectorizer()

In [47]:
x_new1 = cv1.fit_transform(data).toarray()

In [48]:
x_new2 = cv2.fit_transform(data).toarray()

In [49]:
x_new1.shape

(6, 18)

In [50]:
x_new2.shape

(6, 29)

In [51]:
import pandas as pd


In [52]:
dataset1 = pd.DataFrame(x_new1,columns = cv1.get_feature_names())
dataset2 = pd.DataFrame(x_new2,columns = cv2.get_feature_names())

In [53]:
dataset1.head()

Unnamed: 0,brother,cool,deepali,friend,goa,hello,interested,invite,let,movies,name,not,others,plan,soon,together,trip,watching
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,0
2,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
3,0,1,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0


In [54]:
dataset2.head()

Unnamed: 0,am,are,brother,cool,deepali,friend,goa,hello,how,in,...,plan,soon,to,together,too,trip,watching,we,will,you
0,0,1,1,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0,0,0,...,1,0,1,1,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,1,0,1,1,0


In [57]:
# loading the email spam filtering data
email_data = pd.read_csv("datasets/sms.tsv",sep='\t',header=None,names=['target','message'])

In [58]:
email_data.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [60]:
dataset.shape

(1000, 2)

In [61]:
# dataset cleaning
def clean(x):
    s = re.sub('[^A-Za-z]',' ',x)
    s = re.sub('\s+',' ',s)
    s = s.strip()
    return s.lower()
    

In [63]:
email_data['target'].value_counts()

ham     4825
spam     747
Name: target, dtype: int64

In [67]:
#applying clean function
email_data['message'] = email_data['message'].apply(clean)

In [69]:
email_data.head()

Unnamed: 0,target,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...


In [70]:
email_data['target'] = email_data['target'].map({"ham":0,"spam":1})

In [71]:
email_data['target'].value_counts()

0    4825
1     747
Name: target, dtype: int64

In [73]:
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

In [74]:
# removing "not"
if 'not' in stopwords:
    stopwords.remove('not')

In [75]:
# checking
if 'not' in stopwords:
    print("Its there")
else:
    print("Not There")

Not There


In [76]:
cv = CountVectorizer(stop_words=stopwords)

In [78]:
# data splitting
from sklearn.model_selection import train_test_split

In [79]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=15)

In [80]:
train_data = cv.fit_transform(xtrain)

In [81]:
test_data = cv.transform(xtest)

In [82]:
#importing the logististic regression
from sklearn.linear_model import LogisticRegression

In [84]:
log = LogisticRegression()

In [85]:
log.fit(train_data,ytrain)

LogisticRegression()

In [87]:
pred= log.predict(test_data)

In [88]:
# accuracy score
from sklearn.metrics import accuracy_score

In [89]:
accuracy_score(ytest,pred)

0.76

In [93]:
# creating the testing data
test = ["This is not nice man , kindly change it @ 123"]

In [94]:
cleaned_data =[]
for i in test:
    cleaned_data.append(clean(i))

In [95]:
cleaned_data

['this is not nice man kindly change it']

In [96]:
test_sample = cv.transform(cleaned_data)

In [98]:
log.predict(test_sample)

array([0], dtype=int64)

### Prediction is correct ;)