In [1]:
# Yelp reviews classification with advanced ML

### Topics covered in this notebook

- **Tokenization**: breaking text into tokens (words, sentences, n-grams)
- **Stopword removal**: a/an/the
- **Stemming and lemmatization**: root word
- **TF-IDF**: word importance
- **Spelling correction**: "New York City"
- **Language detection**: "translate this page"
- **Machine learning**

## 1: Reading in the Yelp Reviews

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
import nltk
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline

In [5]:
yelp = pd.read_csv('yelp.csv')
yelp.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [6]:
yelp.shape

(10000, 10)

In [7]:
# create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
#yelp_best_worst = yelp

In [8]:
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars

In [9]:
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
X_train.shape

(3064,)

In [11]:
X_test.shape

(1022,)

## 2: Tokenization

- **What:** Separate text into units such as sentences or words
- **Why:** Gives structure to previously unstructured text
- **Notes:** Relatively easy with English language text, not easy with some languages

In [12]:
X_train

6841    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
1728    My husband and I absolutely LOVE this restaura...
3853    We went today after lunch. I got my usual of l...
671     Totally dissapointed.  I had purchased a coupo...
4920    Costco Travel - My husband and I recently retu...
                              ...                        
9396    Pros: \n-No breed restrictions on dogs\n-Washe...
2661    Sorry Banana Leaf... I'm usually not picky at ...
9756    Alright this is the deal of deals, 2.75 for st...
554     Hands down a great lil joint! Gotta get the gu...
2575    Absolutely disgusting.  I had enchiladas and a...
Name: text, Length: 3064, dtype: object

In [13]:
# use CountVectorizer to create document-term matrices from X_train and X_test
vect = CountVectorizer(stop_words='english',min_df=10)

In [14]:
#Tokenisation and Vectorisation
vect.fit(X_train)

In [25]:
print(vect.get_feature_names_out()[0:])

['00' '10' '100' ... 'yummy' 'zero' 'zucchini']


In [None]:
len(vect.get_feature_names())

In [16]:
X_train_dtm = vect.transform(X_train)

In [17]:
X_train_dtm

<3064x2336 sparse matrix of type '<class 'numpy.int64'>'
	with 112810 stored elements in Compressed Sparse Row format>

In [18]:
type(X_train_dtm)

scipy.sparse._csr.csr_matrix

In [19]:
X_train_dtm.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
demo = pd.DataFrame(X_train_dtm.toarray())
demo.columns = vect.get_feature_names_out()
demo

Unnamed: 0,00,10,100,11,12,13,14,15,150,16,...,yes,yesterday,yogurt,york,young,yuck,yum,yummy,zero,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
X_test_dtm = vect.transform(X_test)

In [22]:
X_test_dtm

<1022x2336 sparse matrix of type '<class 'numpy.int64'>'
	with 37109 stored elements in Compressed Sparse Row format>

In [26]:
demotest = pd.DataFrame(X_test_dtm.toarray())
demotest.columns = vect.get_feature_names_out()
demotest

Unnamed: 0,00,10,100,11,12,13,14,15,150,16,...,yes,yesterday,yogurt,york,young,yuck,yum,yummy,zero,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1017,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1018,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1019,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1020,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,3,0,0,0,0,0


- **ngram_range:** tuple (min_n, max_n)
- The lower and upper boundary of the range of n-values for different n-grams to be extracted. All values of n such that min_n <= n <= max_n will be used.

In [28]:
# include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',lowercase=True)
vect.fit(X_train)
print(len(vect.get_feature_names_out()))

150034


In [30]:
# last 50 features
print(vect.get_feature_names_out()[0:10000])

['00' '00 00' '00 15' ... 'bar 15pm' 'bar addition' 'bar aforementioned']


In [31]:
X_train

6841    FILLY-B's!!!!!  only 8 reviews?? NINE now!!!\n...
1728    My husband and I absolutely LOVE this restaura...
3853    We went today after lunch. I got my usual of l...
671     Totally dissapointed.  I had purchased a coupo...
4920    Costco Travel - My husband and I recently retu...
                              ...                        
9396    Pros: \n-No breed restrictions on dogs\n-Washe...
2661    Sorry Banana Leaf... I'm usually not picky at ...
9756    Alright this is the deal of deals, 2.75 for st...
554     Hands down a great lil joint! Gotta get the gu...
2575    Absolutely disgusting.  I had enchiladas and a...
Name: text, Length: 3064, dtype: object

In [32]:
from sklearn.metrics import confusion_matrix,classification_report
vect = CountVectorizer(stop_words='english',lowercase=True,min_df=10)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print(X_train_dtm.shape)
print(X_test_dtm.shape)
from sklearn.linear_model import LogisticRegression
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)
print("Number of Features")
print(X_train_dtm.shape[1])
print("Training Accuracy")
print(nb.score(X_train_dtm,y_train))
print("Testing Accuracy")
print(nb.score(X_test_dtm,y_test))
print("Confusion Matrix")
print(confusion_matrix(y_test,y_pred_class))
print("Classifcation Report")
print(classification_report(y_test,y_pred_class))

(3064, 2336)
(1022, 2336)
Number of Features
2336
Training Accuracy
0.9513707571801566
Testing Accuracy
0.913894324853229
Confusion Matrix
[[151  33]
 [ 55 783]]
Classifcation Report
              precision    recall  f1-score   support

           1       0.73      0.82      0.77       184
           5       0.96      0.93      0.95       838

    accuracy                           0.91      1022
   macro avg       0.85      0.88      0.86      1022
weighted avg       0.92      0.91      0.92      1022



In [33]:
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
pipe = Pipeline((
("vect",CountVectorizer(stop_words='english',lowercase=True)),
("nb",MultinomialNB()) ,
))
pipe.fit(X_train,y_train)
y_pred_class = pipe.predict(X_test)

print("Training Accuracy")
print(pipe.score(X_train,y_train))
print("Testing Accuracy")
print(pipe.score(X_test,y_test))
print("Confusion Matrix")
print(confusion_matrix(y_test,y_pred_class))
print("Classifcation Report")
print(classification_report(y_test,y_pred_class))

Training Accuracy
0.9758485639686684
Testing Accuracy
0.9158512720156555
Confusion Matrix
[[124  60]
 [ 26 812]]
Classifcation Report
              precision    recall  f1-score   support

           1       0.83      0.67      0.74       184
           5       0.93      0.97      0.95       838

    accuracy                           0.92      1022
   macro avg       0.88      0.82      0.85      1022
weighted avg       0.91      0.92      0.91      1022



### 7. Building a Deep Learning Model

In [34]:
yelp = pd.read_csv('yelp.csv')
yelp.head()
yelp_best_worst = yelp
yelp_best_worst['stars']=yelp_best_worst['stars'].apply(lambda x: x-1)
yelp.describe()

Unnamed: 0,stars,cool,useful,funny
count,10000.0,10000.0,10000.0,10000.0
mean,2.7775,0.8768,1.4093,0.7013
std,1.214636,2.067861,2.336647,1.907942
min,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0
50%,3.0,0.0,1.0,0.0
75%,4.0,1.0,2.0,1.0
max,4.0,77.0,76.0,57.0


In [35]:
# define X and y
X = yelp_best_worst.text
y = yelp_best_worst.stars

In [36]:
# split the new DataFrame into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [37]:
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',lowercase=True,min_df=10)
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print("Number of Features")
print(X_train_dtm.shape[1])

Number of Features
6772


In [38]:
import tensorflow as tf

We will use CountVectorizer features in this case. This can be replaced by TF-IDF features

In [39]:
#Start building a Keras Sequential Model
tf.keras.backend.clear_session()
model = tf.keras.Sequential()

In [40]:
X_train[3]

"Rosie, Dakota, and I LOVE Chaparral Dog Park!!! It's very convenient and surrounded by a lot of paths, a desert xeriscape, baseball fields, ballparks, and a lake with ducks.\n\nThe Scottsdale Park and Rec Dept. does a wonderful job of keeping the park clean and shaded.  You can find trash cans and poopy-pick up mitts located all over the park and paths.\n\nThe fenced in area is huge to let the dogs run, play, and sniff!"

In [41]:
y_train[3]

4

In [42]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes=5)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=5)

In [43]:
y_train[3]

array([0., 0., 0., 0., 1.], dtype=float32)

In [44]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Reshape((6772,),input_shape=(6772,)))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(tf.keras.layers.Dense(5, activation='softmax'))

In [45]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 6772)              0         
                                                                 
 batch_normalization (Batch  (None, 6772)              27088     
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 100)               677300    
                                                                 
 dense_1 (Dense)             (None, 10)                1010      
                                                                 
 dense_2 (Dense)             (None, 5)                 55        
                                                                 
Total params: 705453 (2.69 MB)
Trainable params: 691909 (2.64 MB)
Non-trainable params: 13544 (52.91 KB)
_______________

In [46]:
adam_op = tf.keras.optimizers.Adam(lr=0.001)
#model.compile(optimizer=adam_op, loss='binary_crossentropy', metrics=['acc'])
model.compile(optimizer=adam_op, loss='categorical_crossentropy', metrics=[tf.keras.metrics.Recall()])



In [47]:
#X_train_ct_array = X_train_ct.toarray()
#X_test_ct_array = X_test_ct.toarray()

In [48]:
model.fit(X_train_dtm.toarray(), y_train,
           validation_data=(X_test_dtm.toarray(), y_test),
           epochs=50, batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x2495dbe3310>

In [49]:
# define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
    vect.fit(X_train)
    X_train_dtm = vect.transform(X_train)
    print ('Features: ', X_train_dtm.shape[1])
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print("Training Accuracy")
    print(nb.score(X_train_dtm,y_train))
    print("Testing Accuracy")
    print(nb.score(X_test_dtm,y_test))
    print("Confusion Matrix")
    print(confusion_matrix(y_test,y_pred_class))
    print("Classifcation Report")
    print(classification_report(y_test,y_pred_class))

In [55]:
# include 1-grams and 2-grams
# vect = CountVectorizer(stop_words='english')
vect = CountVectorizer(ngram_range=(1, 2),stop_words='english',min_df=3)
# tokenize_test(vect)

## 3: Stopword Removal

- **What:** Remove common words that will likely appear in any text
- **Why:** They don't tell you much about your text

In [56]:
# show vectorizer options
vect

- **stop_words:** string {'english'}, list, or None (default)
- If 'english', a built-in stop word list for English is used.
- If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
- If None, no stop words will be used.

In [57]:
vect = CountVectorizer(stop_words='english')

In [58]:
# set of stop words
print(vect.get_stop_words())

frozenset({'seem', 'himself', 'be', 'who', 'alone', 'my', 'although', 'into', 'any', 'she', 'almost', 'system', 'thru', 'above', 'might', 'together', 'hence', 'found', 'you', 'against', 'something', 'are', 'beforehand', 'mostly', 'becoming', 'whereas', 'we', 'give', 'through', 'whose', 'serious', 'up', 'can', 'herein', 'if', 'anything', 'during', 'until', 'several', 'twelve', 'first', 're', 'latterly', 'always', 'whole', 'both', 'co', 'own', 'here', 'whether', 'beyond', 'bill', 'ltd', 'because', 'would', 'meanwhile', 'detail', 'those', 'to', 'at', 'fill', 'they', 'inc', 'about', 'becomes', 'again', 'yourself', 'yourselves', 'formerly', 'nothing', 'keep', 'had', 'sometime', 'one', 'much', 'few', 'then', 'our', 'put', 'find', 'ever', 'toward', 'con', 'between', 'anyone', 'even', 'as', 'around', 'by', 'nor', 'often', 'after', 'take', 'along', 'seems', 'become', 'former', 'mine', 'thence', 'ours', 'ourselves', 'whenever', 'ten', 'thereupon', 'thereafter', 'either', 'neither', 'was', 'etc',

In [59]:
len(vect.get_stop_words())

318

In [61]:
# remove English stop words
vect = CountVectorizer(stop_words='english')
# tokenize_test(vect)

In [62]:
# without stopwords, dtm size
vect = CountVectorizer()
vect.fit(X_train)
vect.transform(X_train)

<7500x25797 sparse matrix of type '<class 'numpy.int64'>'
	with 622700 stored elements in Compressed Sparse Row format>

In [None]:
# with stopwords, dtm size
vect = CountVectorizer(stop_words='english')
vect.fit(X_train)
vect.transform(X_train)

In [None]:
my_additional_stop_words = ['place','zumba']

In [None]:
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

In [None]:
len(my_stop_words)

In [None]:
# with stopwords, dtm size
vect = CountVectorizer(stop_words=my_stop_words)
vect.fit(X_train)
vect.transform(X_train)

In [None]:
# remove updated stop words
vect = CountVectorizer(stop_words=my_stop_words)
tokenize_test(vect)

## 4: Other CountVectorizer Options

- **max_features:** int or None, default=None
- If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

In [None]:
# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

In [None]:
# all 100 features
print(vect.get_feature_names())

In [None]:
# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)

- **min_df:** float in range [0.0, 1.0] or int, default=1
- When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts.

In [None]:
# include 1-grams and 2-grams, and only include terms that appear at least 2 times
vect = CountVectorizer(ngram_range=(1, 2), min_df=2)
tokenize_test(vect)

##  5: Introduction to TextBlob

TextBlob: "Simplified Text Processing"

In [None]:
# print the first review
print (yelp_best_worst.text[0])

In [None]:
yelp_best_worst.stars[0]

In [None]:
# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[0])

In [None]:
nltk.download('punkt')

In [None]:
# list the words
review.words

In [None]:
# list the sentences
review.sentences

In [None]:
# some string methods are available
review.lower()

##  6: Stemming and Lemmatization

**Stemming:**

- **What:** Reduce a word to its base/stem/root form
- **Why:** Often makes sense to treat related words the same way
- **Notes:**
    - Uses a "simple" and fast rule-based approach
    - Stemmed words are usually not shown to users (used for analysis/indexing)
    - Some search engines treat words with the same stem as synonyms

In [None]:
review.words

In [None]:
# initialize stemmer
stemmer = SnowballStemmer('english')

In [None]:
# stem each word
print ([stemmer.stem(word) for word in review.words])

**Lemmatization**

- **What:** Derive the canonical form ('lemma') of a word
- **Why:** Can be better than stemming
- **Notes:** Uses a dictionary-based approach (slower than stemming)

In [None]:
nltk.download('wordnet')

In [None]:
# assume every word is a noun
print ([word.lemmatize() for word in review.words])

In [None]:
# assume every word is a verb
print ([word.lemmatize(pos='v') for word in review.words])

In [None]:
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    text = text.lower()
    words = TextBlob(text).words
    return [word.lemmatize() for word in words]

In [None]:
split_into_lemmas

In [None]:
# use split_into_lemmas as the feature extraction function (WARNING: SLOW!)
vect = CountVectorizer(analyzer=split_into_lemmas,stop_words='english',ngram_range=(1, 1),min_df=5)
#vect = CountVectorizer(stop_words='english',ngram_range=(1, 1),min_df=1)

In [None]:
tokenize_test(vect)

## 7: Term Frequency-Inverse Document Frequency (TF-IDF)

- **What:** Computes "relative frequency" that a word appears in a document compared to its frequency across all documents
- **Why:** More useful than "term frequency" for identifying "important" words in each document (high frequency in that document, low frequency in other documents)
- **Notes:** Used for search engine scoring, text summarization, document clustering

In [None]:
# example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']

In [None]:
# Term Frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
tf

In [None]:
# Document Frequency
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df.reshape(1, 6), columns=vect.get_feature_names())

In [None]:
# TfidfVectorizer
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())

## Classify Yelp Review with TFIDF?

1. Separate data into train and test

2. Create DTM using TFIDF

3. Build classification model with Naive Bayes

4. Calculate classification score

5. Compare it with CountVectorizer

6. Lemmatization Vs. Stemming when using in TFIDF

## 9: Sentiment Analysis

In [None]:
review = TextBlob("Sayan is teaching amazingly well !!!")

In [None]:
print (review)

In [None]:
# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity

In [None]:
yelp_best_worst.head(20)

In [None]:
# save it as a TextBlob object
review = TextBlob(yelp_best_worst.text[12])

In [None]:
print (review)

In [None]:
# polarity ranges from -1 (most negative) to 1 (most positive)
review.sentiment.polarity

In [None]:
# understanding the apply method
yelp['length'] = yelp.text.apply(len)
yelp.head()

In [None]:
# define a function that accepts text and returns the polarity
def detect_sentiment(text):
    return TextBlob(text).sentiment.polarity

In [None]:
# create a new DataFrame column for sentiment (WARNING: SLOW!)
yelp['sentiment'] = yelp.text.apply(detect_sentiment)

In [None]:
yelp[yelp['stars']==1]

In [None]:
# box plot of sentiment grouped by stars
yelp.boxplot(column='sentiment', by='stars')

In [None]:
onestar = yelp[yelp['stars']==1]
onestar.describe()

In [None]:
onestar[onestar['sentiment']==1].values

In [None]:
# reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()

In [None]:
# reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()

In [None]:
# widen the column display
pd.set_option('max_colwidth', 500)

In [None]:
# negative sentiment in a 5-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head()

In [None]:
# positive sentiment in a 1-star review
yelp[(yelp.stars == 1) & (yelp.sentiment > 0.5)].head(5)

In [None]:
# reset the column display width
pd.reset_option('max_colwidth')

## 10: Adding Features to a Document-Term Matrix

In [None]:
# create a DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]

In [None]:
# define X and y
feature_cols = ['text', 'sentiment', 'cool', 'useful', 'funny']
X = yelp_best_worst[feature_cols]
y = yelp_best_worst.stars

In [None]:
# split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# use CountVectorizer with text column only
vect = CountVectorizer()

In [None]:
X_train_dtm = vect.fit_transform(X_train.text)

In [None]:
X_test_dtm = vect.transform(X_test.text)

In [None]:
print (X_train_dtm.shape)
print (X_test_dtm.shape)

(3064, 16825)
(1022, 16825)


In [None]:
# shape of other four feature columns
X_train.drop('text', axis=1).shape

(3064, 4)

In [None]:
# cast other feature columns to float and convert to a sparse matrix
extra = sp.sparse.csr_matrix(X_train.drop('text', axis=1).astype(float))
extra.shape

(3064, 4)

In [None]:
# combine sparse matrices
X_train_dtm_extra = sp.sparse.hstack((X_train_dtm, extra))
X_train_dtm_extra.shape

(3064, 16829)

In [None]:
# repeat for testing set
extra = sp.sparse.csr_matrix(X_test.drop('text', axis=1).astype(float))
X_test_dtm_extra = sp.sparse.hstack((X_test_dtm, extra))
X_test_dtm_extra.shape

(1022, 16829)

In [None]:
# use logistic regression with text column only
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
print (metrics.accuracy_score(y_test, y_pred_class))

0.9246575342465754


In [None]:
# use logistic regression with all features
logreg = LogisticRegression(C=1e9)
logreg.fit(X_train_dtm_extra, y_train)
y_pred_class = logreg.predict(X_test_dtm_extra)
print (metrics.accuracy_score(y_test, y_pred_class))

0.9207436399217221


## 11: Fun TextBlob Features

In [None]:
# spelling correction
TextBlob('indai is good counrteyz').correct()

TextBlob("india is good counrteyz")

In [None]:
# spellcheck
Word('parot').spellcheck()

[('part', 0.9929478138222849), ('parrot', 0.007052186177715092)]

In [None]:
# definitions
Word('bank').define('v')

['tip laterally',
 'enclose with a bank',
 'do business with a bank or keep an account at a bank',
 'act as the banker in a game or in gambling',
 'be in the banking business',
 'put into a bank account',
 'cover with ashes so to control the rate of burning',
 'have confidence or faith in']

In [None]:
# language identification
TextBlob('விஜய் தனது புதிய யூடியூப் சேனலை திறக்கிறார்').detect_language()

'ta'

## Conclusion

- NLP is a gigantic field
- Understanding the basics broadens the types of data you can work with
- Simple techniques go a long way
- Use scikit-learn for NLP whenever possible