### Preprocessing the Yelp Comments on Berlin Restaurants

#### Get required packages

In [152]:
import pandas as pd
import re
import nltk
import string
import nltk.tokenize
#Run in Terminal
#pip install spacy
#python -m spacy download de_core_news_md
import spacy as spacy



#### Get dataset

In [153]:
yelp = pd.read_csv('../1_scraping/intermediary_outputs/german_merged.csv')
yelp.columns.values[0]="ID"
yelp = yelp.rename(columns={'Overall Rating':'Overall_Rating',
                                      "Total Reviews":'Total_Reviews',
                                      "Restaurant Name":"Restaurant_Name",
                                     "Price Range":"Price_Range"})
yelp.columns
yelp['ID'] = yelp['ID'].astype(int)
yelp['Comment'] = yelp['Comment'].astype(str)
yelp['Overall_Rating'] = yelp['Overall_Rating'].astype(float)
yelp['Date'] = yelp['Date'].astype('datetime64[ns]',"dd-MM-yyyy")
yelp.dtypes
print(yelp.info())



  yelp['Date'] = yelp['Date'].astype('datetime64[ns]',"dd-MM-yyyy")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9721 entries, 0 to 9720
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ID               9721 non-null   int32         
 1   Restaurant_Name  9721 non-null   object        
 2   Overall_Rating   9721 non-null   float64       
 3   Total_Reviews    9721 non-null   int64         
 4   Specialty        9721 non-null   object        
 5   Region           9721 non-null   object        
 6   Price_Range      9721 non-null   object        
 7   Author           9721 non-null   object        
 8   Comment          9721 non-null   object        
 9   Rating           9721 non-null   int64         
 10  Date             9721 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int32(1), int64(2), object(6)
memory usage: 797.6+ KB
None


#### Remove Duplicates

In [154]:
print(len(yelp))
# Remove duplicates
yelp = yelp.drop_duplicates(subset=['Comment'], inplace=False)
print(len(yelp)) #15 duplicates found

9721
9706


#### Basic Cleaning

In [155]:
#remove pattern "Unknown" / "x Fotos" in Author - replace with None
yelp['Author'] = yelp.Author.where((yelp.Author == 'Unknown') | ('Foto' in yelp.Author), None)
yelp['Price_Range'] = yelp.Price_Range.where(yelp.Price_Range == 'Unknown', None)
#print(yelp[yelp.ID==1114])

def clean_text(text):
    #fix enter instead of punctuation or space but keep contiguous capitalization
    text = re.sub('(?<=.)(?=[A-Z][a-z])', r" ", text)
    #remove random urls w/o http
    text = re.sub('[\w]+\.[\w]+\/+[\w]+','',text)
    #remove URL with http
    text = re.sub(r'http\S+', '', text)
    #remove URL with www
    text = re.sub(r'www\S+', '', text)
    # add space after . to avoid word concatenation when user left no space after .
    text = re.sub(r'(?<=[.,)!])(?=[^\s])', r' ', text)
    # Remove Emoji chars
    emoticons = r'[\W]+(?::|;|=)(?:-)?(?:\)+|\(|D|P)'
    text = re.sub(emoticons, '' , text)
    #remove hashtags but leave the content of a hashtag in text
    text = re.sub(r"([#]+)", "", text)
    #remove @name shoutouts +  weird shoutouts with space between @ and name
    pattern_shoutout_one = r"((\w+|[^a-z])[@](\s+\w+|\w+|.*))"
    text = re.sub(pattern_shoutout_one, "",text)
    #Remove weird unicode characters such as U+2026
    text = re.sub(r'[^\x00-\x7FäöüÄÖÜß]+', '', text)
    # # remove hashtags and normal shoutouts with @
    # pattern_hashtags_shoutouts = r"([@#]\w+)"
    # text = re.sub(pattern_hashtags_shoutouts,"" ,text)

    #remove digits
    text = re.sub(r'[0-9]+', '', text)
    #remove 1 char words
    text = re.sub(r'\b\w{1}\b', ' ', text)
    #remove restaurant name?

    #language detection?

    #Filter to allow only alphabets
    #text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    
    #Fix &
    #text = re.sub(r'&amp;', '&', text)
    

   # text = re.sub(r'[?!.;:",#@-]', ' ', text)
    #text = re.sub(r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '', text)
    
    #Remove Prices and numbers
    #

    #Remove punctuations etc.
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    #Convert to lowercase to maintain consistency
    text = text.lower()
    return text

df = yelp[['ID', 'Comment']]
pd.set_option('max_colwidth', 800)


yelp['Comment'] = yelp.Comment.apply(lambda x: clean_text(x))
print("URL case")
print(yelp.Comment[0])
print(yelp.Comment[3562])
print("Emoji case")
print(yelp.Comment[28])
print(yelp.Comment[6])
print(yelp.Comment[226])
print(yelp.Comment[164])
print("Shoutouts")
print("1")
print(yelp.Comment[278])#remove comments in english
print("2")
print(yelp.Comment[2672])
print("3")
print(yelp.Comment[6434])
print("4")
print(yelp.Comment[2690])
print("Hashtags")
print(yelp.Comment[1134])#remove comments in english
print(yelp.Comment[4605])
print("Commentar 112 gelandetTolles")
print(yelp.Comment[112])
print(yelp.Comment[2680])

URL case
ich habe mir  bewertungen zu  restaurants auf  menulist angesehen und dieses  restaurant hatte gute  kritiken also habe ich es ausprobiert und es nicht bereut 
neue  homepage
Emoji case
der wohl verrückteste  kellner den ich je hatte aber eine einzigartige  kneipe die sich  hinter der  stadtklause versteckt  im  kellergewölbe kann man auch noch sitzen günstiges deutsches  essen und  bier in einem eigentlich so touristisch überfluteten  potsdamer  platz  ecke gerade bei den sonst so hohen  bierpreisen in der  umgebung des  sony  center liefert die  stadtklause eine  entspannung für den  geldbeutel  preis leistung stimmt
super leckeres  essen cooles  atmosphäre  gutes  bier leider keine ec  karten oder  kreditkarten aktzeptiert das ist ein bissle schade
mit  abstand das beste  schnitzel weit und breit  super nettes  ambiente und tolles  personal beim nächsten berlin besuch wieder  daumen hoch
beste  ramen die ich jemals gegessen habe ja es ist voll aber man wartet mi ich lange a

In [156]:
yelp.Comment.values[68]

'das als kleine  bodega  tapasrestaurant mit kleiner  aussenterrasse für  raucher ist eine  toplocation für span küche  sowohl die  datteln im  speckmantel wie die  gambas al ajillo und die  kartoffeln mit pikanter  sauce sind spitze  zu zweit bezahlten wir   euro samt     rotwein     datteln je   stück   hähnchen in  honigsalsascharf hmmmm   garnelen in  knoblauchsauce  kroketten mit hühnchen sowie  kartoffeln ca   stück  frühlingskartoffeln wir finden das ist nicht das billigste aber seinen  preis auf jeden  fall wert wie heisst es in  ebay weiter so und gerne wieder'

#### Tokenization

In [157]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize(text):
    tokens = word_tokenize(text,language='german')

    return tokens

yelp['Comment'] = yelp['Comment'].apply(lambda x: tokenize(x))
print(yelp.Comment[7890])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['wir', 'sind', 'alle', 'monate', 'in', 'der', 'schnitzelei', 'die', 'sehr', 'großen', 'schnitzel', 'sind', 'sehr', 'lecker', 'und', 'die', 'karte', 'ist', 'kreativ', 'zusammengestellt', 'insbesondere', 'das', 'kostenlose', 'begrüßungsbier', 'und', 'die', 'deutschen', 'tapas', 'heben', 'die', 'schnitzelei', 'nochmal', 'von', 'anderen', 'lokalen', 'ab', 'zudem', 'sind', 'alle', 'bedienungen', 'sehr', 'freundlich', 'und', 'das', 'ambiente', 'ist', 'sehr', 'angenehm']


#### Remove Stopwords

In [158]:
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('german')
#print(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [159]:
# Filters out Stopwords specific for German language
def rm_stopwords(text):
    text = " ".join([word for word in text if word not in stop_words])
    return text

yelp['Comment'] = yelp['Comment'].apply(lambda x: rm_stopwords(x))
print(yelp.Comment[112])


zufall marral gelandet tolles essen toller service guten preis kommen defintiv


In [160]:
# Weitere Idee für data cleaning: filter out the name of the restaurant. If not done tokens with restaurant name that have
# more comments than others will have higher frequency in bag of words model
print(yelp.Comment[6789])


schönes restaurant herzen berlins wobei preise trotzdem human otito seit guten jahr stammlokal gutes sushi fisch stets frisch guter qualität angemessenen preisen weiterhin bestellen speisen take away mitnehmen hinsicht empfehlung asiatisches essen


#### Lemmatisierung mit SpaCy

In [161]:
# https://blog.codecentric.de/natural-language-processing-basics
# https://textmining.wp.hs-hannover.de/Preprocessing.html
# https://nickyreinert.de/blog/2020/12/09/einfuehrung-in-stemming-und-lemmatisierung-deutscher-texte-mit-python/
# https://de.steadforce.com/blog/natural-language-processing-tools
# https://spacy.io/usage/linguistic-features#lemmatization

In [162]:
#pip install spacy
#python -m spacy download de_core_news_md

In [None]:

spc =  spacy.load(r'de_core_news_md')


def lemmatize_spc(tokenized_comment):

    tok_cmt_as_spacy_object = spc(tokenized_comment)
    text = [token.lemma_ for token in tok_cmt_as_spacy_object]
    return [word.lower() for word in text]
yelp['Comment'] = yelp['Comment'].apply(lambda x: lemmatize_spc(x))





In [None]:
print(yelp.Comment[50:59])
print(yelp.Comment[78])
print(yelp.Comment[65:70])
print(yelp.Comment[112])