# Plotting Lexical Dispersion - working with JSON reviews

In [1]:
import os
import pandas as pd
import json

## Data Munging

In [2]:
# Directory holding the downloaded review files; every entry in it is opened as JSON below.
path = './data/690_webhose-2017-03_20170904112233'
# NOTE(review): os.listdir returns entries in arbitrary order, so positional lookups
# such as good_reviews[25] may differ between machines — confirm whether that matters.
good_review_folder = os.listdir(path)

In [3]:
# Load one review per file: only the first line of each file is used as the record.
good_reviews = []
for file in good_review_folder:
    # os.path.join avoids hand-built separators.
    with open(os.path.join(path, file), 'r') as json_file:
        # Read and parse just the first line instead of decoding every line
        # and then throwing all but the first away.
        first_line = json_file.readline()
    good_reviews.append(json.loads(first_line))
print(len(good_reviews))

9233


In [4]:
# Show one parsed review record to inspect its (nested) structure.
good_reviews[25]

{'organizations': [],
 'uuid': 'b4ce8b8623dcf0778478b600b0470ebad0afc348',
 'thread': {'social': {'gplus': {'shares': 0},
   'pinterest': {'shares': 0},
   'vk': {'shares': 0},
   'linkedin': {'shares': 0},
   'facebook': {'likes': 0, 'shares': 0, 'comments': 0},
   'stumbledupon': {'shares': 0}},
  'site_full': 'www.tripadvisor.com',
  'main_image': 'https://media-cdn.tripadvisor.com/media/photo-s/0e/c5/16/00/double-room.jpg',
  'site_section': 'https://www.tripadvisor.com/Hotel_Review-g186338-d2458180-Reviews-Point_A_Hotel_London_Liverpool_Street-London_England.html',
  'section_title': 'Point A Hotel, London Liverpool Street - UPDATED 2017 Reviews &amp; Price Comparison (England) - TripAdvisor',
  'url': 'https://www.tripadvisor.com/ShowUserReviews-g186338-d2458180-r470788461-Point_A_Hotel_London_Liverpool_Street-London_England.html',
  'country': 'US',
  'domain_rank': 189,
  'title': 'best stay ever',
  'performance_score': 0,
  'site': 'tripadvisor.com',
  'participants_count': 2

In [5]:
# Top-level field names of the first review; this is a dict_keys view, not a list.
keys = good_reviews[0].keys()
keys

dict_keys(['organizations', 'uuid', 'thread', 'author', 'url', 'ord_in_thread', 'title', 'locations', 'entities', 'highlightText', 'language', 'persons', 'text', 'external_links', 'published', 'crawled', 'highlightTitle'])

In [6]:
# Extend `keys` with any top-level field that appears in a later review but not in the first.
# `keys` arrives as a dict_keys view, which has no .append(); copy it into a list first so
# that a review with an unseen field is recorded instead of raising AttributeError.
keys = list(keys)
for UON in good_reviews:
    for key in UON:
        if key not in keys:
            keys.append(key)
            print('added ', key, ' to namespace')

In [7]:
# Subset of review fields to look at when previewing the data.
columns = ['uuid', 'title', 'published', 'text']

In [8]:
# Build one row per review record; 'columns' is from_dict's default orientation.
review_df = pd.DataFrame.from_dict(good_reviews)

In [13]:
# Preview the first three reviews, restricted to the fields of interest.
columns = ['uuid', 'title', 'published', 'text']
review_df.loc[:, columns].head(3)

Unnamed: 0,uuid,title,published,text
0,81637c51531cbadbf4d669be0849afe1dd4180c8,Fantastic stay,2017-03-14T02:00:00.000+02:00,This hotel was quite a find. Centrally located...
1,0043ecb630b38d8ecb79093a008152447eb158e6,Great Hotel,2017-03-11T02:00:00.000+02:00,"Great staff, great rooms!! Will be back for su..."
2,26c0934f3559367dcc94402cd8d7516a1b559684,Heavenly Hilton -,2017-03-18T02:00:00.000+02:00,"First, let me confess: Hilton usually is not m..."


## Feature Engineering - True or False?

In [14]:
# One regex alternation ("a|b|c") per furnishing group; each is passed to str.contains below.
bed = "|".join(['pillow', 'bed', 'sheets', 'blankets', 'covers', 'comforter'])
desk = "|".join(['chair', 'desk', 'stationary', 'outlet', 'plug', 'plugs'])
room = "|".join(['carpet', 'wallpaper', 'paint', 'fridge', 'light', 'lights', 'curtain'])
bathroom = "|".join(['towels', 'bath', 'tub', 'shower', 'mirror', 'toilet', 'soap'])

In [15]:
elements = [bed, desk, room, bathroom]

# Add one boolean column per furnishing group: True when the review text matches the pattern.
# The column is temporarily named after the first three characters of the pattern
# ('pil', 'cha', 'car', 'tow'); the following cell renames these to readable names.
# NOTE(review): str.contains does a case-sensitive regex *search*, so 'Bed' is not matched
# while 'bath' also matches inside 'bathroom' — confirm this is the intended definition.
for element in elements:
    # na=False keeps the column strictly boolean even if a review has no text.
    review_df[element[0:3]] = review_df['text'].str.contains(element, regex=True, na=False)

In [16]:
# Replace the three-letter temporary column names with the furnishing group they stand for.
review_df.rename(
    columns={
        'pil': 'bed',
        'cha': 'desk',
        'car': 'room',
        'tow': 'bathroom',
    },
    inplace=True,
)

In [17]:
# Check that the four boolean feature columns were added at the end of the frame.
review_df.head(3)

Unnamed: 0,author,crawled,entities,external_links,highlightText,highlightTitle,language,locations,ord_in_thread,organizations,...,published,text,thread,title,url,uuid,bed,desk,room,bathroom
0,Kim E,2017-03-27T03:07:16.539+03:00,"{'persons': [], 'locations': [], 'organization...",[],,,english,[],0,[],...,2017-03-14T02:00:00.000+02:00,This hotel was quite a find. Centrally located...,"{'social': {'gplus': {'shares': 0}, 'pinterest...",Fantastic stay,https://www.tripadvisor.com/ShowUserReviews-g6...,81637c51531cbadbf4d669be0849afe1dd4180c8,True,False,False,False
1,Danelle S,2017-03-26T21:04:56.491+03:00,"{'persons': [], 'locations': [], 'organization...",[],,,english,[],0,[],...,2017-03-11T02:00:00.000+02:00,"Great staff, great rooms!! Will be back for su...","{'social': {'gplus': {'shares': 0}, 'pinterest...",Great Hotel,https://www.tripadvisor.com/ShowUserReviews-g3...,0043ecb630b38d8ecb79093a008152447eb158e6,True,False,False,False
2,Talkabout,2017-03-28T11:15:47.404+03:00,"{'persons': [], 'locations': [], 'organization...",[],,,english,[],0,[],...,2017-03-18T02:00:00.000+02:00,"First, let me confess: Hilton usually is not m...","{'social': {'gplus': {'shares': 0}, 'pinterest...",Heavenly Hilton -,https://www.tripadvisor.com/ShowUserReviews-g1...,26c0934f3559367dcc94402cd8d7516a1b559684,False,False,False,False


In [21]:
# Print the True/False distribution of each furnishing-group flag, separated by blank lines.
for flag_column in ('bed', 'desk', 'room', 'bathroom'):
    print(review_df[flag_column].value_counts(), '\n')

False    7052
True     2181
Name: bed, dtype: int64 

False    8192
True     1041
Name: desk, dtype: int64 

False    8073
True     1160
Name: room, dtype: int64 

False    6829
True     2404
Name: bathroom, dtype: int64 



## Feature Engineering - List of Element Mentions

In [23]:
# Every furnishing keyword looked up word-by-word in the review text.
# Rows follow the bed / desk / room / bathroom groups used for the boolean features.
# NOTE(review): 'stationary' may be meant as 'stationery' (writing paper) — confirm.
room_elements = ['pillow', 'bed', 'sheets', 'blankets', 'covers', 'comforter',
                'chair', 'desk', 'stationary', 'outlet', 'plug', 'plugs',
                'carpet', 'wallpaper', 'paint', 'fridge', 'light', 'lights', 'curtain',
                'towels', 'bath', 'tub', 'shower', 'mirror', 'toilet', 'soap']

# Characters stripped from both ends of a token before it is compared to room_elements.
# The backslash is written as '\\' because '\/' is an invalid escape sequence (it triggers
# a warning today and is slated to become an error); the resulting string is unchanged.
punctuation = ',?!.\\/#@"><[]'

In [24]:
def room_list(x, elements=None, strip_chars=None):
    """Return every room-element word mentioned in a review, in order of appearance.

    Parameters
    ----------
    x : str
        Review text. Any non-string value (e.g. NaN for a missing review) is
        treated as containing no mentions.
    elements : collection of str, optional
        Lower-case words to look for. Defaults to the module-level ``room_elements``.
    strip_chars : str, optional
        Characters removed from both ends of each token before the lookup.
        Defaults to the module-level ``punctuation``.

    Returns
    -------
    str
        ``str()`` of the list of matched words (duplicates kept), e.g.
        ``"['bed', 'shower']"``; ``"[]"`` when nothing matches. A string is
        returned so the result is hashable for ``value_counts``.
    """
    if not isinstance(x, str):
        return str([])
    if elements is None:
        elements = room_elements
    if strip_chars is None:
        strip_chars = punctuation

    out_data = []
    # split() with no argument breaks on any whitespace, so words separated by
    # newlines or tabs are found too (split(" ") would glue them together).
    for token in x.split():
        word = token.lower().strip(strip_chars)
        if word in elements:
            out_data.append(word)
    return str(out_data)

In [25]:
# Store the list of mentioned elements per review, then tabulate how often each list occurs.
review_df['room_list'] = review_df['text'].apply(room_list)
review_df['room_list'].value_counts()

[]                                                                       6264
['bed']                                                                   634
['desk']                                                                  518
['shower']                                                                217
['bed', 'shower']                                                          96
['light']                                                                  83
['fridge']                                                                 66
['towels']                                                                 65
['bath']                                                                   57
['bed', 'bed']                                                             52
['shower', 'bed']                                                          39
['desk', 'desk']                                                           38
['bed', 'desk']                                                 

## Feature Engineering - Final Element Mentions

In [26]:
import string
def room_item(x, elements=None, strip_chars=None):
    """Return the first room-element word mentioned in a review.

    Parameters
    ----------
    x : str
        Review text. Any non-string value (e.g. NaN for a missing review) is
        treated as containing no mention.
    elements : collection of str, optional
        Lower-case words to look for. Defaults to the module-level ``room_elements``.
    strip_chars : str, optional
        Characters removed from both ends of each token before the lookup.
        Defaults to the module-level ``punctuation``.

    Returns
    -------
    str or None
        The first matching word (lower-cased, punctuation stripped), or ``None``
        when the text mentions none of the elements.
    """
    if not isinstance(x, str):
        return None
    if elements is None:
        elements = room_elements
    if strip_chars is None:
        strip_chars = punctuation

    # split() with no argument breaks on any whitespace, so a word that follows a
    # newline or tab is still seen as its own token.
    for token in x.split():
        word = token.lower().strip(strip_chars)
        if word in elements:
            return word
    return None

In [27]:
# Record the first mentioned element per review and count how often each one comes first.
review_df['room_item'] = review_df['text'].apply(room_item)
review_df['room_item'].value_counts()

bed           1115
desk           697
shower         407
light          131
bath           118
fridge         107
towels          87
toilet          60
lights          49
tub             35
sheets          27
chair           20
pillow          18
plugs           14
carpet          13
plug            12
mirror          12
paint            9
soap             8
wallpaper        6
curtain          6
outlet           6
covers           6
blankets         5
stationary       1
Name: room_item, dtype: int64

In [21]:
# Label reviews that mention no room element with the string 'none'.
# Assign the result back to the column: calling fillna(..., inplace=True) on a column
# pulled out of the frame is a chained in-place update, which newer pandas warns about
# and which does not modify review_df once Copy-on-Write is active.
review_df['room_item'] = review_df['room_item'].fillna('none')

In [24]:
# Export the review fields plus all engineered features to a comma-separated file.
review_df[[
    'uuid', 'published', 'url', 'language', 'text', 'title',
    'bed', 'desk', 'room', 'bathroom',
    'room_list', 'room_item',
]].to_csv('good_review_lexical_dispersion.csv', sep=',')

In [25]:
# Final column inventory: original review fields followed by the engineered features.
review_df.columns

Index(['author', 'crawled', 'entities', 'external_links', 'highlightText',
       'highlightTitle', 'language', 'locations', 'ord_in_thread',
       'organizations', 'persons', 'published', 'text', 'thread', 'title',
       'url', 'uuid', 'bed', 'desk', 'room', 'bathroom', 'room_list',
       'room_item'],
      dtype='object')