In [218]:
### Preprocessing the Yelp Comments on Berlin Restaurants

In [219]:
#### Get required packages

In [220]:
import pandas as pd
import os
import re
import nltk
import string
import numpy as np
#from nltk import word_tokenize
#nltk.download()

#### Get dataset 

In [221]:
yelp = pd.read_csv('german_merged.csv')
yelp.columns.values[0]="ID"
yelp = yelp.rename(columns={'Overall Rating':'Overall_Rating',
                                      "Total Reviews":'Total_Reviews',
                                      "Restaurant Name":"Restaurant_Name",
                                     "Price Range":"Price_Range"})
yelp.columns
yelp['ID'] = yelp['ID'].astype(int)
yelp['Comment'] = yelp['Comment'].astype(str)
yelp['Overall_Rating'] = yelp['Overall_Rating'].astype(float)
yelp['Date'] = yelp['Date'].astype('datetime64[ns]',"dd-MM-yyyy")
yelp.dtypes
print(yelp.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9427 entries, 0 to 9426
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ID               9427 non-null   int32         
 1   Restaurant_Name  9427 non-null   object        
 2   Overall_Rating   9427 non-null   float64       
 3   Total_Reviews    9427 non-null   int64         
 4   Specialty        9427 non-null   object        
 5   Region           9427 non-null   object        
 6   Price_Range      9427 non-null   object        
 7   Author           9427 non-null   object        
 8   Comment          9427 non-null   object        
 9   Rating           9427 non-null   int64         
 10  Date             9427 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2), object(6)
memory usage: 773.4+ KB
None


  yelp['Date'] = yelp['Date'].astype('datetime64[ns]',"dd-MM-yyyy")


In [222]:
yelp.head()
# len(yelp)
yelp = yelp.drop_duplicates(subset=['Comment'], inplace=False)
# print(len(yelp)) #15 duplicates found

#### Basic Cleaning: Remove punctuation and set everything to lower case

In [237]:
#replace unknown
#remove pattern "x Fotos" in Author - replace with Unknown

yelp['Author'] = yelp.Author.where((yelp.Author == 'Unknown') | ('Foto' in yelp.Author), None)
yelp['Price_Range'] = yelp.Price_Range.where(yelp.Price_Range == 'Unknown', None)
print(yelp[yelp.ID==1114])

def clean_text(text):
    #Remove unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    #Filter to allow only alphabets
    #text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    #Fix &
    text = re.sub(r'&amp;', '&', text)
    
    #Remove punctuations etc.
    #text = re.sub(r'[?!.;:",#@-]', '', text)
    text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', text)
    
    #Remove Prices and numbers
    text = re.sub(r'[0-9]+€|[0-9]+', '', text)

    #Convert to lowercase to maintain consistency
    text = text.lower()
    return text


yelp['Comment'] = yelp.Comment.apply(lambda x: clean_text(x))


#remove comments in english
#

        ID Restaurant_Name  Overall_Rating  Total_Reviews Specialty Region  \
1114  1114    Cocolo Ramen             4.2             42  Ramen /   Mitte   

     Price_Range Author                                            Comment  \
1114        None   None  war heut abend zum ersten mal dort und bin nur...   

      Rating       Date                                  Comment_tokenized  
1114       5 2012-02-03  [war, heut, abend, zum, ersten, mal, dort, und...  


In [224]:
yelp.Comment.values[68]

'das als kleine bodega tapasrestaurant mit kleiner aussenterrasse fr raucher ist eine toplocation fr span. kche. sowohl die datteln im speckmantel wie die gambas al ajillo und die kartoffeln mit pikanter sauce sind spitze. zu zweit bezahlten wir , euro samt / l rotwein,  x datteln (je  stck),  hhnchen in honigsalsa(scharf,hmmmm), garnelen in knoblauchsauce, kroketten mit hhnchen sowie kartoffeln (ca  stck frhlingskartoffeln) wir finden das ist nicht das billigste, aber seinen preis auf jeden fall wert. wie heisst es in ebay: weiter so und gerne wieder'

#### Tokenization

In [225]:
# using re
# Explicitly tell Python that lower and upper case symbols are to be treated as the same symbol
def tokenize(text):
    tokens = re.split('\W+',text)
    return tokens
yelp['Comment_tokenized'] = yelp['Comment'].apply (lambda x: tokenize(x.lower()))

print(yelp.Comment_tokenized[0])

['ich', 'habe', 'mir', 'bewertungen', 'zu', 'restaurants', 'auf', 'menulist', 'angesehen', 'und', 'dieses', 'restaurant', 'hatte', 'gute', 'kritiken', 'also', 'habe', 'ich', 'es', 'ausprobiert', 'und', 'es', 'nicht', 'bereut', 'menulist', 'menu', 'restaurant']


#### Lemmatizing

In [226]:
wn = nltk.WordNetLemmatizer()
#dir(wn)

In [227]:
"""def lemmatizing(tokenized_text):
    text =  [wn.lemmatize(word) for word in tokenized_text]
    return text

yelp['Comment_lemmatized'] = yelp['Comment_tokenized'].apply(lambda x: lemmatizing(x))
"""

"def lemmatizing(tokenized_text):\n    text =  [wn.lemmatize(word) for word in tokenized_text]\n    return text\n\nyelp['Comment_lemmatized'] = yelp['Comment_tokenized'].apply(lambda x: lemmatizing(x))\n"

### Remove Punctuation, Tokenize, remove stopwords and lemmatize all in one

In [228]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([wn.lemmatize(word) for word in tokens if word not in stopwords])
    return text

In [229]:
yelp['Comment_clean'] = yelp['Comment'].apply(lambda x: clean_text(x))

NameError: name 'stopwords' is not defined

In [None]:
yelp[["ID","Comment_clean"]].to_csv("Yelp_Cleaned_Comments.csv",header=True)

### Vecotrization- Process of encoding text as integers to create feature vectors, basically transform char to numeric

#### 1. Count Vectorization

In [None]:
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(yelp['Comment'])

# Peak into the words
print(X_counts.shape)
print(count_vect.get_feature_names()[189:204])

In [None]:
X_counts

In [None]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df

In [None]:
X_counts_df.columns = count_vect.get_feature_names()
X_counts_df

#### N - Grams

In [None]:
def clean_text_ngram(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = " ".join([wn.lemmatize(word) for word in tokens if word not in stopwords])
    return text

In [None]:
yelp['Comment_clean' ] = yelp['Comment'].apply(lambda x : clean_text_ngram(x))
yelp.Comment_clean.head()

In [None]:
ngram_vect = CountVectorizer(ngram_range=(2,2))
X_counts = ngram_vect.fit_transform(yelp['Comment_clean'])
print(X_counts.shape)
print(ngram_vect.get_feature_names())

In [None]:
X_counts_df = pd.DataFrame(X_counts.toarray())
X_counts_df.columns = ngram_vect.get_feature_names()
X_counts_df

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(yelp['Comment'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names()[1:15])

In [None]:
X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names_out()
X_tfidf_df[['service','food','experience','place']]
len(X_tfidf_df[X_tfidf_df['experience']==0])

### Feature Engineering 

In [None]:
"""
Create new Features to best estimate the actual star rating given 
e.g. sentiment polarity & subjectivity with TextBlob,
length of text field, 
percentage of characters that are punctuation in the text. 
percentage of characters that are capitalized
"""
from textblob import TextBlob

In [None]:
#dir(TextBlob)
help(TextBlob.sentiment_assessments)

In [None]:
cmo = yelp['Comment_clean']

def senti_score(text):
    sentiment = []
    sentiment_temp = []
    result = []

    for elem in text:
        blob = TextBlob(elem)
        for sentence in blob.sentences:
            sentiment_temp.append(sentence.sentiment.polarity)
        sentiment = Average(sentiment_temp)
        result.append(sentiment)

yelp['Sentiment'] = result
yelp.head()

In [None]:
# hypothesis: longer comments tend to be correlated with more positive overall rating


### EDA - Word frequencies

#### Removing Stop Words from Word Frequency Counts

In [None]:
#pip install wordcloud

# Import list of stopwards
from wordcloud import STOPWORDS
print(STOPWORDS)

#### Generate frequency of every word from all comments

In [None]:
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the comments and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()

    #Print top 5 and bottom 5 words
    word_freq[:10]
    
    return word_freq
gen_freq(yelp.Comment.str)

#### --> Problem: stopwords have highest frequencies 

In [None]:
# Drop Stopwords from frequency table
word_freq = gen_freq(yelp.Comment.str)

word_freq = word_freq.drop(labels=STOPWORDS, errors='ignore')
word_freq.head()

In [None]:
word_freq[156:164]

In [None]:
# Import libraries for Plotting
import matplotlib.pyplot as plt
from wordcloud import WordCloud

#Generate word cloud
wc = WordCloud(width=400, height=330, max_words=100, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(12, 8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

### Basic ML application

##### Feature Engineering

In [None]:
# Generate even greater list with stopwords 
stop_words_manual =['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

# Merge Manual list and wordcloud's stopwords list
stop_words = list(STOPWORDS)+stop_words_manual

# but remove duplicates
stop_words = [*set(stop_words)]

print(stop_words)

In [None]:
# Regenrate Frequency of all words
freq_clean = gen_freq(yelp.Comment.str).drop(labels=stop_words, errors='ignore')

# how many different words do we still have 
# code to count number of entries in freq_clean

# Get 100 rarest words only
rare_100 =freq_clean[-100:]
rare_100.head()

#### Create new variables per restaurant comment from characteristics of Comment - Feature Engineering

In [None]:
#Check whether a negation term is present in the text
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0
    
#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

In [None]:
#Number of words in a comment
yelp['word_count'] = yelp.Comment.str.split().apply(lambda x: len(x))
#Negation present or not
yelp['any_neg'] = yelp.Comment.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
yelp['is_question'] = yelp.Comment.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
yelp['any_rare'] = yelp.Comment.str.split().apply(lambda x: any_rare(x, rare_100))

In [None]:
print(yelp.head())

#### Basic ML : Split dataset into predicting variables and predicted variable as well as into training dataset and test dataset

In [None]:
from sklearn.model_selection import train_test_split

X = yelp[['word_count', 'any_neg', 'any_rare', 'Price_Range', 'is_question','Overall_Rating']]
y = yelp[['Rating']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=27)

##### 4. Train an ML model for Text Classification

##### Now that the dataset is ready, it is time to train a Machine Learning model on the same. You will be using a Naive Bayes classifier from sklearn which is a prominent python library used for machine learning.

In [None]:
from sklearn.naive_bayes import GaussianNB

#Initialize GaussianNB classifier
model = GaussianNB()
#Fit the model on the train dataset
model = model.fit(X_train, y_train)
#Make predictions on the test datasetsyd
pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, pred)*100, "%")