In [1]:
import pandas as pd
import numpy as np

In [2]:
yelp = pd.read_csv('data/yelp.csv')

In [3]:
yelp_best_worst = yelp[(yelp.stars == 5) | (yelp.stars == 1)]

In [4]:
X = yelp_best_worst.text
y = yelp_best_worst.stars

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
X_train_mat = vect.fit_transform(X_train)
X_test_mat = vect.fit_transform(X_test)

In [7]:
# to convert to lowercase.

# Option 1
lower_text = []
for i in yelp.text:
    lower_text.append(i.lower())
yelp['text'] = lower_text
    
    
# Option 2
yelp['text'] = yelp["text"].str.lower()

In [8]:
vect.get_feature_names()

['00',
 '000',
 '00pm',
 '02',
 '04',
 '05',
 '06',
 '10',
 '100',
 '1000',
 '100s',
 '101',
 '1030',
 '105',
 '108',
 '109',
 '10am',
 '10pm',
 '10th',
 '10yo',
 '11',
 '110',
 '115',
 '116',
 '11am',
 '12',
 '120',
 '13',
 '1300',
 '13331',
 '13th',
 '14',
 '15',
 '150',
 '157',
 '16',
 '16th',
 '17',
 '175',
 '17th',
 '18',
 '1800',
 '1895',
 '19',
 '1968',
 '1978',
 '1980s',
 '1990',
 '1997',
 '19th',
 '1pm',
 '1st',
 '20',
 '200',
 '2003',
 '2006',
 '2007',
 '2009',
 '2010',
 '2011',
 '2012',
 '20min',
 '20minutes',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '24',
 '24th',
 '25',
 '250',
 '26',
 '27',
 '27th',
 '28',
 '29',
 '2nd',
 '2pm',
 '30',
 '300',
 '30am',
 '30ish',
 '30pm',
 '30s',
 '30th',
 '31',
 '316',
 '32',
 '34',
 '35',
 '350',
 '35th',
 '36',
 '38',
 '39',
 '39th',
 '3pm',
 '3rd',
 '3x',
 '40',
 '400',
 '4000',
 '400s',
 '40ish',
 '40th',
 '41',
 '42',
 '44',
 '44th',
 '45',
 '4655',
 '46th',
 '48th',
 '49',
 '4cxbhzxxtmexf9krjmfviq',
 '4ish',
 '4pm',
 '4x6',
 '50',
 '

In [9]:
X_test_mat.shape

(1022, 9620)

In [10]:
vect2 = CountVectorizer(lowercase=False)
X_train_mat2 = vect2.fit_transform(X_train)
X_test_mat2 = vect2.fit_transform(X_test)


In [11]:
vect2.get_feature_names()

['00',
 '000',
 '00pm',
 '02',
 '04',
 '05',
 '06',
 '10',
 '100',
 '1000',
 '100s',
 '101',
 '1030',
 '105',
 '108',
 '109',
 '10am',
 '10pm',
 '10th',
 '10yo',
 '11',
 '110',
 '115',
 '116',
 '11am',
 '12',
 '120',
 '13',
 '1300',
 '13331',
 '13th',
 '14',
 '15',
 '150',
 '157',
 '16',
 '16th',
 '17',
 '175',
 '17th',
 '18',
 '1800',
 '1895',
 '19',
 '1968',
 '1978',
 '1980s',
 '1990',
 '1997',
 '19th',
 '1PM',
 '1pm',
 '1st',
 '20',
 '200',
 '2003',
 '2006',
 '2007',
 '2009',
 '2010',
 '2011',
 '2012',
 '20min',
 '20minutes',
 '20th',
 '21',
 '21st',
 '22',
 '23',
 '24',
 '24th',
 '25',
 '250',
 '26',
 '27',
 '27th',
 '28',
 '29',
 '2nd',
 '2pm',
 '30',
 '300',
 '30am',
 '30ish',
 '30pm',
 '30s',
 '30th',
 '31',
 '316',
 '32',
 '34',
 '35',
 '350',
 '35th',
 '36',
 '38',
 '39',
 '39th',
 '3pm',
 '3rd',
 '3x',
 '40',
 '400',
 '4000',
 '400s',
 '40ish',
 '40th',
 '41',
 '42',
 '44',
 '44th',
 '45',
 '4655',
 '46th',
 '48th',
 '49',
 '4cXbhzxxtmExF9kRjmFViQ',
 '4ish',
 '4pm',
 '4x6',
 

In [12]:
vect = CountVectorizer()
X_train_mat = vect.fit_transform(X_train)
X_test_mat = vect.transform(X_test)

In [13]:
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
model.fit(X_train_mat, y_train)
y_pred = model.predict(X_test_mat)

In [14]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_test,y_pred))

0.8395303326810176


In [15]:
y_test.value_counts(normalize=True)

5    0.819961
1    0.180039
Name: stars, dtype: float64

In [16]:
y_train.value_counts()

5    2499
1     565
Name: stars, dtype: int64

In [17]:
X_test_mat.shape

(1022, 16825)

In [18]:
len(vect.vocabulary_)

16825

In [19]:
X_test_mat.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
# function that accepts a vectorizer and calculates the accuracy
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


def myFunction(vectorizer):
    X_train_mat = vectorizer.fit_transform(X_train)
    X_test_mat = vectorizer.transform(X_test)
    model = BernoulliNB()
    model.fit(X_train_mat, y_train)
    y_pred = model.predict(X_test_mat)
    print ('Accuracy:', accuracy_score(y_test, y_pred))

In [None]:
vect_1 = CountVectorizer(min_df=1, max_features=6000)

In [28]:
myFunction(vect_1)

Accuracy: 0.8493150684931506


In [29]:
# "my cat is awesome."

'my cat is awesome.'

In [None]:
# Unigram= "my" "cat" "is" "awesome"

# Bigram= "my cat" "cat is" "is awesome"

# Trigram = 'my cat is' "cat is awesome"

In [32]:
vect_2 = CountVectorizer(ngram_range=(1,2))

myFunction(vect_2)

Accuracy: 0.8228962818003914


In [36]:
vect_3 = CountVectorizer(stop_words='english', max_features=6000)

myFunction(vect_3)

Accuracy: 0.8639921722113503


In [39]:
cv = CountVectorizer(stop_words = 'english', ngram_range=(1,2))

X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)
cv_df = pd.DataFrame(X_test_cv.toarray(), columns = cv.get_feature_names())
cv_df.head()

Unnamed: 0,00,00 00,00 15,00 24,00 25,00 30,00 50,00 actually,00 amazing,00 arriving,...,zwiebel,zwiebel kräuter,zzed,zzed pants,éclairs,éclairs napoleons,école,école lenôtre,ém,ém huge
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
cv_df['stars'] = y_test.reset_index(inplace=True, drop=True)
cv_df[cv_df.stars == 5].sum().sort_values(ascending=False)[:50]

ém huge               0.0
food different        0.0
food downtown         0.0
food downhill         0.0
food donate           0.0
food don              0.0
food dollars          0.0
food doesn            0.0
food does             0.0
food dive             0.0
food disturbing       0.0
food display          0.0
food dishes           0.0
food disgusting       0.0
food discounted       0.0
food disappointing    0.0
food disadvantage     0.0
food draft            0.0
food drink            0.0
food drinks           0.0
food eggs             0.0
food evening          0.0
food establishment    0.0
food ended            0.0
food endeavors        0.0
food el               0.0
food egyptian         0.0
food eats             0.0
food drive            0.0
food eating           0.0
food eatery           0.0
food eaten            0.0
food eat              0.0
food dry              0.0
food dropped          0.0
food diner            0.0
food did              0.0
food court            0.0
food diarrhe

In [50]:
cv_df[cv_df.stars == 1].sum().sort_values(ascending=False)[:50]

ém huge               0.0
food different        0.0
food downtown         0.0
food downhill         0.0
food donate           0.0
food don              0.0
food dollars          0.0
food doesn            0.0
food does             0.0
food dive             0.0
food disturbing       0.0
food display          0.0
food dishes           0.0
food disgusting       0.0
food discounted       0.0
food disappointing    0.0
food disadvantage     0.0
food draft            0.0
food drink            0.0
food drinks           0.0
food eggs             0.0
food evening          0.0
food establishment    0.0
food ended            0.0
food endeavors        0.0
food el               0.0
food egyptian         0.0
food eats             0.0
food drive            0.0
food eating           0.0
food eatery           0.0
food eaten            0.0
food eat              0.0
food dry              0.0
food dropped          0.0
food diner            0.0
food did              0.0
food court            0.0
food diarrhe

In [53]:
yelp_best_worst.text[0]

'My wife took me here on my birthday for breakfast and it was excellent.  The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure.  Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning.  It looked like the place fills up pretty quickly so the earlier you get here the better.\n\nDo yourself a favor and get their Bloody Mary.  It was phenomenal and simply the best I\'ve ever had.  I\'m pretty sure they only use ingredients from their garden and blend them fresh when you order it.  It was amazing.\n\nWhile EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious.  It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete.  It was the best "toast" I\'ve ever had.\n\nAnyway, I can\'t wait to go back!'

In [59]:
from textblob import TextBlob, Word
reviews = TextBlob(yelp_best_worst.text[0])

In [61]:
reviews.sentences

[Sentence("My wife took me here on my birthday for breakfast and it was excellent."),
 Sentence("The weather was perfect which made sitting outside overlooking their grounds an absolute pleasure."),
 Sentence("Our waitress was excellent and our food arrived quickly on the semi-busy Saturday morning."),
 Sentence("It looked like the place fills up pretty quickly so the earlier you get here the better."),
 Sentence("Do yourself a favor and get their Bloody Mary."),
 Sentence("It was phenomenal and simply the best I've ever had."),
 Sentence("I'm pretty sure they only use ingredients from their garden and blend them fresh when you order it."),
 Sentence("It was amazing."),
 Sentence("While EVERYTHING on the menu looks excellent, I had the white truffle scrambled eggs vegetable skillet and it was tasty and delicious."),
 Sentence("It came with 2 pieces of their griddled bread with was amazing and it absolutely made the meal complete."),
 Sentence("It was the best "toast" I've ever had."),


In [64]:
reviews.upper()

TextBlob("MY WIFE TOOK ME HERE ON MY BIRTHDAY FOR BREAKFAST AND IT WAS EXCELLENT.  THE WEATHER WAS PERFECT WHICH MADE SITTING OUTSIDE OVERLOOKING THEIR GROUNDS AN ABSOLUTE PLEASURE.  OUR WAITRESS WAS EXCELLENT AND OUR FOOD ARRIVED QUICKLY ON THE SEMI-BUSY SATURDAY MORNING.  IT LOOKED LIKE THE PLACE FILLS UP PRETTY QUICKLY SO THE EARLIER YOU GET HERE THE BETTER.

DO YOURSELF A FAVOR AND GET THEIR BLOODY MARY.  IT WAS PHENOMENAL AND SIMPLY THE BEST I'VE EVER HAD.  I'M PRETTY SURE THEY ONLY USE INGREDIENTS FROM THEIR GARDEN AND BLEND THEM FRESH WHEN YOU ORDER IT.  IT WAS AMAZING.

WHILE EVERYTHING ON THE MENU LOOKS EXCELLENT, I HAD THE WHITE TRUFFLE SCRAMBLED EGGS VEGETABLE SKILLET AND IT WAS TASTY AND DELICIOUS.  IT CAME WITH 2 PIECES OF THEIR GRIDDLED BREAD WITH WAS AMAZING AND IT ABSOLUTELY MADE THE MEAL COMPLETE.  IT WAS THE BEST "TOAST" I'VE EVER HAD.

ANYWAY, I CAN'T WAIT TO GO BACK!")

In [67]:
words = reviews.words
words

WordList(['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excellent', 'The', 'weather', 'was', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'grounds', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'was', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', 'fills', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'was', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredients', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'was', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'looks', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', 'eggs', 

In [69]:
#Stemming and Lemmatization
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('english')

print([stemmer.stem(word) for word in words])

['my', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'was', 'excel', 'the', 'weather', 'was', 'perfect', 'which', 'made', 'sit', 'outsid', 'overlook', 'their', 'ground', 'an', 'absolut', 'pleasur', 'our', 'waitress', 'was', 'excel', 'and', 'our', 'food', 'arriv', 'quick', 'on', 'the', 'semi-busi', 'saturday', 'morn', 'it', 'look', 'like', 'the', 'place', 'fill', 'up', 'pretti', 'quick', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'bloodi', 'mari', 'it', 'was', 'phenomen', 'and', 'simpli', 'the', 'best', 'i', 've', 'ever', 'had', 'i', "'m", 'pretti', 'sure', 'they', 'onli', 'use', 'ingredi', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'it', 'was', 'amaz', 'while', 'everyth', 'on', 'the', 'menu', 'look', 'excel', 'i', 'had', 'the', 'white', 'truffl', 'scrambl', 'egg', 'veget', 'skillet', 'and', 'it', 'was', 'tasti', 'and', 'delic

In [75]:
print([word.lemmatize(pos='') for word in words])

['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'wa', 'excellent', 'The', 'weather', 'wa', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'ground', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'wa', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', 'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'wa', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredient', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'wa', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'look', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', 'egg', 'vegetable', 'skill

In [73]:
print([word.lemmatize() for word in words])

['My', 'wife', 'took', 'me', 'here', 'on', 'my', 'birthday', 'for', 'breakfast', 'and', 'it', 'wa', 'excellent', 'The', 'weather', 'wa', 'perfect', 'which', 'made', 'sitting', 'outside', 'overlooking', 'their', 'ground', 'an', 'absolute', 'pleasure', 'Our', 'waitress', 'wa', 'excellent', 'and', 'our', 'food', 'arrived', 'quickly', 'on', 'the', 'semi-busy', 'Saturday', 'morning', 'It', 'looked', 'like', 'the', 'place', 'fill', 'up', 'pretty', 'quickly', 'so', 'the', 'earlier', 'you', 'get', 'here', 'the', 'better', 'Do', 'yourself', 'a', 'favor', 'and', 'get', 'their', 'Bloody', 'Mary', 'It', 'wa', 'phenomenal', 'and', 'simply', 'the', 'best', 'I', "'ve", 'ever', 'had', 'I', "'m", 'pretty', 'sure', 'they', 'only', 'use', 'ingredient', 'from', 'their', 'garden', 'and', 'blend', 'them', 'fresh', 'when', 'you', 'order', 'it', 'It', 'wa', 'amazing', 'While', 'EVERYTHING', 'on', 'the', 'menu', 'look', 'excellent', 'I', 'had', 'the', 'white', 'truffle', 'scrambled', 'egg', 'vegetable', 'skill

In [100]:
def split_to_lemma(review):
    review = str(review).lower()
    words = TextBlob(review).words
    return[word.lemmatize() for word in words]

In [105]:
vect_4 = CountVectorizer(analyzer=split_to_lemma, decode_error='replace', 
                         stop_words='english', max_features=6000, ngram_range = (1,2))

In [106]:
myFunction(vect_4)

Accuracy: 0.8483365949119374


In [107]:
train_set = ['call you tonight', 'call me a cab', 'please call me...PLEASE']

In [108]:
vect_5 = CountVectorizer()
tf = pd.DataFrame(vect_5.fit_transform(train_set).toarray(), columns=vect_5.get_feature_names())

In [109]:
tf

Unnamed: 0,cab,call,me,please,tonight,you
0,0,1,0,0,1,1
1,1,1,1,0,0,0
2,0,1,1,2,0,0
