# EXTRACTING THE FEATURES FROM THE TEXT DATA

In [1]:
# For extracting information from a categorical variable --> one-hot encoding

# For extracting information from text data ---> CountVectorizer

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

In [3]:
vect = CountVectorizer()

In [4]:
# Three short toy sentences used as the corpus for the vectorizer demo.
message = [
    'Hi Arsh How Are You',
    'Did u go to the gym ?',
    'Yeah! i went @ fitnessbox',
]

In [5]:
# Learn the vocabulary from `message` and count each word per sentence;
# .toarray() turns the result into a plain NumPy array so it can be shown directly.
x_new = vect.fit_transform(message).toarray()

In [6]:
x_new

array([[1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0]], dtype=int64)

In [7]:
# Getting the feature names (the learned vocabulary, one entry per column of x_new).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when the installed version provides it and fall back to
# the old accessor only on older releases.
if hasattr(vect, "get_feature_names_out"):
    # .tolist() gives a plain list of Python strings, so the printed form matches
    # what get_feature_names() used to return.
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print(feature_names)

['are', 'arsh', 'did', 'fitnessbox', 'go', 'gym', 'hi', 'how', 'the', 'to', 'went', 'yeah', 'you']


In [8]:
#these are the unique words that are extracted

In [9]:
# Label each count column with the word it represents.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# older releases that do not have get_feature_names_out().
if hasattr(vect, "get_feature_names_out"):
    column_names = vect.get_feature_names_out()
else:
    column_names = vect.get_feature_names()
df = pd.DataFrame(x_new, columns=column_names)

In [10]:
df.head()

Unnamed: 0,are,arsh,did,fitnessbox,go,gym,hi,how,the,to,went,yeah,you
0,1,1,0,0,0,0,1,1,0,0,0,0,1
1,0,0,1,0,1,1,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,0,0,0,1,1,0


# STOP WORD FILTERING METHOD

In [11]:
# A strategy to remove the words that are common to most of the documents

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [13]:
import pandas as pd

In [14]:
# stop_words="english" makes the vectorizer drop scikit-learn's built-in English stop words
vect = CountVectorizer(stop_words = "english")

In [15]:
# Same toy corpus as before, reused to show the effect of stop-word filtering.
message = [
    'Hi Arsh How Are You',
    'Did u go to the gym ?',
    'Yeah! i went @ fitnessbox',
]

In [16]:
data = vect.fit_transform(message)

In [17]:
# Vocabulary that remains after stop-word removal.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when available and keep the old accessor as a fallback for older releases.
(
    vect.get_feature_names_out().tolist()
    if hasattr(vect, "get_feature_names_out")
    else vect.get_feature_names()
)

['arsh', 'did', 'fitnessbox', 'gym', 'hi', 'went', 'yeah']

In [18]:
# for listing the stop words 
vect.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

# WORKING ON RESTAURANT REVIEWS DATASET

In [19]:
# The file is tab-separated, hence sep="\t"; the path is relative to the working directory.
dataset = pd.read_csv("datasets/Restaurant_Reviews.tsv",sep="\t")

In [20]:
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [21]:
# converting the text to the numeric datatype

In [23]:
x =dataset.Review.values

In [24]:
print(x)

['Wow... Loved this place.' 'Crust is not good.'
 'Not tasty and the texture was just nasty.'
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.'
 'The selection on the menu was great and so were the prices.'
 'Now I am getting angry and I want my damn pho.'
 "Honeslty it didn't taste THAT fresh.)"
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.'
 'The fries were great too.' 'A great touch.' 'Service was very prompt.'
 'Would not go back.'
 'The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.'
 'I tried the Cape Cod ravoli, chicken, with cranberry...mmmm!'
 'I was disgusted because I was pretty sure that was human hair.'
 'I was shocked because no signs indicate cash only.'
 'Highly recommended.' 'Waitress was a little slow in service.'
 'This place is not worth your time, let alone Vegas.'
 'did not like at all.' 'The Burrittos Blah!'

In [25]:
cv = CountVectorizer()

In [26]:
y = dataset.Liked.values

In [27]:
# Now We Need to Clean The Data

In [28]:
import re

In [29]:
# re is the standard-library module for working with regular expressions

In [41]:
def clean(x):
    """Normalize text to lowercase words separated by single spaces.

    Every character that is not an ASCII letter (digits, punctuation, symbols,
    whitespace) is treated as a separator. Runs of separators collapse to one
    space, and leading/trailing separators are removed.

    Parameters
    ----------
    x : str
        Raw text to clean.

    Returns
    -------
    str
        Lowercased text containing only the letters a-z and single spaces
        (an empty string if ``x`` contains no letters).
    """
    # A raw string keeps regex syntax away from Python's string-escape parsing
    # (the original '\s+' literal was an invalid escape sequence). Matching whole
    # runs of non-letters at once replaces the separate whitespace-collapsing pass.
    return re.sub(r'[^A-Za-z]+', ' ', x).strip().lower()

In [42]:
#let's see one example
o1=clean("Hi Arsh1234 how are u! @srm")


In [43]:
print(o1)

hi arsh how are u srm


In [46]:
# Overwrites the Review column in place with the cleaned text;
# re-read the file if the raw reviews are needed again.
dataset['Review'] = dataset.Review.apply(clean)

In [47]:
dataset.columns

Index(['Review', 'Liked'], dtype='object')

In [48]:
dataset.Review.values

array(['wow loved this place', 'crust is not good',
       'not tasty and the texture was just nasty',
       'stopped by during the late may bank holiday off rick steve recommendation and loved it',
       'the selection on the menu was great and so were the prices',
       'now i am getting angry and i want my damn pho',
       'honeslty it didn t taste that fresh',
       'the potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer',
       'the fries were great too', 'a great touch',
       'service was very prompt', 'would not go back',
       'the cashier had no care what so ever on what i had to say it still ended up being wayyy overpriced',
       'i tried the cape cod ravoli chicken with cranberry mmmm',
       'i was disgusted because i was pretty sure that was human hair',
       'i was shocked because no signs indicate cash only',
       'highly recommended', 'waitress was a little slow in service',
       'this place is no

In [49]:
from sklearn.model_selection import train_test_split

In [50]:
# Independent variable: the cleaned review text. Dependent variable: the Liked label.
x_data = dataset["Review"].values
y_data = dataset["Liked"].values

In [51]:
x_data.shape

(1000,)

In [52]:
y_data.shape

(1000,)

In [53]:
# Hold out 20% of the reviews for evaluation. A fixed random_state makes the split,
# and therefore the learned vocabulary size and the accuracy, reproducible on re-run.
xtrain,xtest,ytrain,ytest = train_test_split(x_data,y_data,test_size=.2,random_state=42)

In [54]:
xtrain.shape

(800,)

In [55]:
cv = CountVectorizer()

In [56]:
# Learn the vocabulary from the training reviews only and build their count matrix.
cv_train = cv.fit_transform(xtrain)

In [57]:
# transform (not fit_transform): reuse the vocabulary already fitted on xtrain
# so the test matrix has the same columns as cv_train.
cv_test = cv.transform(xtest)

In [58]:
cv_train.shape

(800, 1765)

In [59]:
# Vocabulary learned from the training reviews.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# when available and keep the old accessor as a fallback for older releases.
(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

['about',
 'above',
 'absolute',
 'absolutely',
 'absolutley',
 'accident',
 'accomodate',
 'accordingly',
 'accountant',
 'ache',
 'acknowledged',
 'across',
 'actual',
 'actually',
 'added',
 'affordable',
 'after',
 'afternoon',
 'again',
 'ago',
 'ahead',
 'airline',
 'albondigas',
 'all',
 'allergy',
 'almonds',
 'almost',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'ambiance',
 'ambience',
 'amount',
 'ample',
 'an',
 'and',
 'angry',
 'annoying',
 'another',
 'anticipated',
 'any',
 'anymore',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'anyways',
 'apologize',
 'app',
 'apparently',
 'appetite',
 'appetizer',
 'appetizers',
 'approval',
 'are',
 'area',
 'aren',
 'arepas',
 'aria',
 'around',
 'array',
 'arrived',
 'arrives',
 'arriving',
 'article',
 'as',
 'ask',
 'asked',
 'asking',
 'assure',
 'at',
 'ate',
 'atmosphere',
 'atrocious',
 'attached',
 'attack',
 'attentive',
 'attitudes',
 'auju',
 'authentic',
 'average',
 'avocado',
 'avoid',
 'avoided',
 'away

In [60]:
# Now passing the data into the ML algorithm

# Since we have two classes, we will be using logistic regression

from sklearn.linear_model import LogisticRegression
log = LogisticRegression()

In [61]:
log.fit(cv_train,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
pred = log.predict(cv_test)

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
score = accuracy_score(ytest,pred)

In [65]:
score

0.815

In [66]:
#testing a sample
test = ['food was not good@123']


In [67]:
# Run every sample through clean() so it is preprocessed like the training reviews.
cleaned_data = [clean(sample) for sample in test]

In [68]:
cleaned_data

['food was not good']

In [69]:
t1 = cv.transform(cleaned_data)

In [70]:
t1.shape

(1, 1765)

In [71]:
op = log.predict(t1)

In [72]:
print("The Output is : ",op)

The Output is :  [0]
