In [1]:
import pandas as pd
import re, os
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.wordnet import WordNetLemmatizer

In [15]:
data = '/non_hotel_data'
root = os.getcwd()
csv_directory = [file for file in os.listdir(root+data) if file.endswith(".csv")]

In [3]:
def review_to_words(review_column):
    num_reviews = review_column.size
    
    review_words_list = []
    
    for i in range(num_reviews):
        raw_review = str.encode(review_column[i]).decode('unicode-escape')
        # 1. Remove non-letters        
        letters_only = re.sub("[^a-zA-Z]", " ", raw_review) 
        # 2. Convert to lower case, split into individual words
        words = letters_only.lower().split()                             
        # 3. Cleaning stop word
        stops = set(stopwords.words("english"))                  
        meaningful_words = [w for w in words if not w in stops]   
        # 4. changing all word to present tense
        meaningful_words_list = [WordNetLemmatizer().lemmatize(word,'v') for word in meaningful_words]
        
        review_words_list += meaningful_words_list
        
    return(review_words_list)

def update_word_count_pd(wordcount_pd, review_words_list, csv_name):
    
    wordcounter = Counter(review_words_list) #word counting
    
    for key, freq in wordcounter.items():
    
        if key not in list(wordcount_pd['word']):
            wordcount_pd.loc[-1] = [key, freq, [csv_name]] #csv name
            wordcount_pd.index = wordcount_pd.index + 1
            wordcount_pd = wordcount_pd.sort_index()
        else:
            index_value = wordcount_pd[wordcount_pd['word']== key].index.values.astype(int)[0]

            org_freq = wordcount_pd.loc[wordcount_pd['word'] == key, 'freq']
            new_freq = org_freq + freq

            org_source = wordcount_pd.loc[wordcount_pd['word'] == key, 'source']
            new_source = org_source + [csv_name]
            #print(source)
            # update row
            wordcount_pd.at[index_value,'freq'] = int(new_freq) #Update freq
            wordcount_pd.at[index_value,'source'] = list(new_source)[0] #Update source
    
    return wordcount_pd

In [16]:
wordcount_pd = pd.DataFrame(columns = ['word','freq','source']) # create empty dataframe

In [17]:
for csv in csv_directory:
    print (csv)
    csv_path = root+data+'/' + csv
    hotel_review = pd.read_csv(csv_path, delimiter="\t")
    
    review_words_list = review_to_words(hotel_review['review content'])
    wordcount_pd = update_word_count_pd(wordcount_pd, review_words_list, csv)

Tiki_Moon_Villas_all_data.csv
Kalani_Hawaii_Private_Lodging_all_data.csv
Manoa_Valley_Inn_all_data.csv
Hawaiian_Monarch_Hotel_all_data.csv
Waikiki_Beachside_Hostel_all_data.csv
Stay_Condominiums_Waikiki_all_data.csv
Marina_Tower_Waikiki_all_data.csv
Wyndham_at_Waikiki_Beach_Walk_all_data.csv


In [18]:
ascending_wordcount_pd = wordcount_pd.sort_values('freq', ascending=False)

In [19]:
ascending_wordcount_pd.to_csv('non_hotel_summary.csv',index=False)