In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import string
import nltk
#stemming
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag


## Data import - processed reviews

In [2]:
dataset = pd.read_csv('./vienna_data/processed_reviews.csv')

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 466660 entries, 0 to 466659
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   listing_id   466660 non-null  int64  
 1   reviewer_id  466660 non-null  int64  
 2   comments     466660 non-null  object 
 3   Compound     466660 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 14.2+ MB


## Subset english reviews for word clouds

Count the total number of words per review and the english words per review into 2 separate columns

In [43]:
words = set(nltk.corpus.words.words())

length = len(dataset)
english_words = []
total_words = []

for i in range (0, length):
    count = 0
    totalcount = 0
    review = dataset.comments.values[i]
    #tokenize and lower review
    tokenized_review = nltk.wordpunct_tokenize(review)
    tokenized_review = [w.lower() for w in tokenized_review]
    
    #remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped_review = [w.translate(table) for w in tokenized_review]
    stripped = [w for w in stripped_review if w.isalpha()]
    
    #POS-tag and lemmatize
    lemmaWords = []
    wordnet_lemmatizer = WordNetLemmatizer()
    for word,tag in pos_tag(stripped):
        wntag = tag[0].lower()
        wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
        lemma = wordnet_lemmatizer.lemmatize(word, wntag) if wntag else word
        lemmaWords.append(lemma)
    
    for w in lemmaWords:
        totalcount +=1
        if w.lower()in words:
            count +=1
    english_words.append(count)
    total_words.append(totalcount)
    
dataset['english_words'] = np.array(english_words)
dataset['total_words'] = np.array(total_words)


In [45]:
with pd.option_context('display.max_colwidth', None): 
    print(dataset.tail(20))

        listing_id  reviewer_id  \
466640    42575315    336280355   
466641    42575315    217364463   
466642    42583867    175483145   
466643    42631068    256961243   
466644    42640682      1853439   
466645    42642091    299008765   
466646    42665072     14455623   
466647    42665072      6036216   
466648    42665072    196311212   
466649    42666433    126455592   
466650    42667250    141082281   
466651    42667250     30923871   
466652    42678136    287767126   
466653    42689613    324408348   
466654    42700746    177308173   
466655    42705436      4005354   
466656    42707880    143319370   
466657    42741768     60936666   
466658    42782957     12348831   
466659    42822480    196565405   

                                                                                                                                                                                                                                                                        

In [46]:
dataset['comments'][466648]

'This accommodation is not so far from Vienna hbf station  Alexander is very kind and contacted us frequently  If you choose this accommodation  you can be got a rare experience   この宿はｳｨｰﾝ中央駅からそんなに遠く離れていません｡Alexanderさんは頻繁に連絡をくれるため､とても親切で安心できます｡この宿では少し珍しい体験をできるかもしれません '

In [69]:
dataset.head()

Unnamed: 0,listing_id,reviewer_id,comments,Compound,english_words,total_words
0,15883,30537860,If you need a clean comfortable place to stay...,94.8,78,84
1,15883,37529754,It's so nice to be in the house It's a peace ...,96.04,23,24
2,15883,3147341,A beautiful place uniquely decorated showing ...,92.98,60,63
3,15883,29518067,Eine sehr schöne Unterkunft in einem privaten ...,0.0,6,34
4,15883,36016357,It was a very pleasant stay Excellent locatio...,92.08,13,13


In [48]:
dataset[dataset['english_words']/dataset['total_words'] < 0.7 ]

Unnamed: 0,listing_id,reviewer_id,comments,Compound,english_words,total_words
3,15883,29518067,Eine sehr schöne Unterkunft in einem privaten ...,0.00,6,34
7,15883,11338741,Der Aufenthalt bei Eva war zu meiner vollsten ...,-97.61,17,70
24,38768,2422768,Sehr schöne Unterkunft ruhig und doch zentral...,-59.94,3,18
26,38768,2526106,la estancia en este apartamento por llamarlo d...,-83.16,80,186
27,38768,3177185,Das Apartement ist perfekt für eine Städtereis...,-59.94,26,87
...,...,...,...,...,...,...
466650,42667250,141082281,Super nette Gastgeberin kleine feine Wohnung ...,73.51,3,12
466651,42667250,30923871,Leticia ist sehr nett und hilfsbereit Die Woh...,-49.12,6,25
466652,42678136,287767126,Nette Kommunikation unproblematischer Check-i...,-24.22,10,30
466653,42689613,324408348,Die Wohnung ist neu renoviert und sehr schön e...,-83.16,8,46


In [49]:
dataset.tail(20)

Unnamed: 0,listing_id,reviewer_id,comments,Compound,english_words,total_words
466640,42575315,336280355,Amazing apartment and very convenient location...,58.59,12,12
466641,42575315,217364463,Great space and very clean apartment Easy acc...,96.42,38,38
466642,42583867,175483145,Amazing apartment ten minutes walking to hist...,87.18,13,13
466643,42631068,256961243,The appartment is asoluteley lovely super cle...,99.02,57,63
466644,42640682,1853439,Euch erwartet ein top ausgestattetes Apartment...,20.23,4,13
466645,42642091,299008765,La estancia fue increíble Nos dejaron cosas p...,0.0,4,15
466646,42665072,14455623,The apartment is clean and simple You have to...,-29.6,62,63
466647,42665072,6036216,Very cute and great apartment price vs value ...,97.05,84,90
466648,42665072,196311212,This accommodation is not so far from Vienna h...,64.59,27,34
466649,42666433,126455592,Me and my friend really liked it We loved it ❤️,88.05,10,10


In [59]:
non_english = pd.DataFrame(dataset[dataset['english_words']/dataset['total_words'] < 0.7 ])

In [60]:
english = pd.DataFrame(dataset[dataset['english_words']/dataset['total_words'] >= 0.7])

In [61]:
huh = dataset[ (dataset['english_words']/dataset['total_words'] >= 0.7 ) & (dataset['english_words']/dataset['total_words'] <= 0.8 ) ]

In [62]:
len(huh)

3988

In [74]:
huh.head()

Unnamed: 0,listing_id,reviewer_id,comments,Compound,english_words,total_words
165,38768,65209029,Hannes became very trustworthly and prompt co-...,69.69,29,37
210,38768,10590684,Great place for an unposh stay in Vienna Hann...,84.81,10,13
287,38768,179179808,Hannes offers great appartments well-located ...,87.77,19,24
369,40625,66422696,Everything is perfectt Thank you so much Ms ...,36.12,7,9
431,40625,122666236,Ingela is amazingly kind,57.09,3,4


In [73]:
english.reset_index(drop = True, inplace=True)

In [75]:
with pd.option_context('display.max_colwidth', 200): 
    print(english.tail(10))

        listing_id  reviewer_id  \
309100    42631068    256961243   
309101    42665072     14455623   
309102    42665072      6036216   
309103    42665072    196311212   
309104    42666433    126455592   
309105    42700746    177308173   
309106    42705436      4005354   
309107    42707880    143319370   
309108    42741768     60936666   
309109    42782957     12348831   

                                                                                                                                                                                                       comments  \
309100  The appartment is asoluteley lovely  super clean and well equipped  Everything is brand new  The location is perfect - Metro station accross the street  Lidl shop around the corner  parked my car ...   
309101  The apartment is clean and simple  You have to pick the key up from the packet station  two stations far from the apartment  But it was no problem because Alexander described the exact

*As it can be seen, in the english dataset there is at least one comment in another language that the algorithm couldn't filter. One method could be filtering even further the english dataset and removing all words not recognized in english.*

In [77]:
english.comments[309103]

'This accommodation is not so far from Vienna hbf station  Alexander is very kind and contacted us frequently  If you choose this accommodation  you can be got a rare experience   この宿はｳｨｰﾝ中央駅からそんなに遠く離れていません｡Alexanderさんは頻繁に連絡をくれるため､とても親切で安心できます｡この宿では少し珍しい体験をできるかもしれません '

In [78]:
english.iloc[309103, [4,5]]

english_words    27
total_words      34
Name: 309103, dtype: object

In [65]:
print(len(dataset))
print(len(english))
print(len(non_english))

466660
309110
156685


In [79]:
english.to_csv('./vienna_data/englsih_reviews.csv', index = False)