# Import Libraries

In [1]:
import pandas as pd

In [2]:
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict

# Read Dataset

In [3]:
df = pd.read_json("../data/yelp/yelp.jsonl", lines=True)

In [4]:
df

Unnamed: 0,ty_id,review_date,score,author,language,title,content,extractions,sentences
0,NW366Ota9w52nYofW99u1g,,-1,FwqVjnpwpcW6fHQxAYigYg,en,,Good cheap place to get away from the heat a t...,"[{'opinion': 'Good cheap', 'aspect': 'place', ...","[[Good, cheap, place, to, get, away, from, the..."
1,NW366Ota9w52nYofW99u1g,,-1,ZNQXrXdCfRX5m08ddcYgtQ,en,,"If you are looking for something to do, take a...","[{'opinion': 'really old', 'aspect': 'theater'...","[[If, you, are, looking, for, something, to, d..."
2,NW366Ota9w52nYofW99u1g,,-1,gm6W0Ys2_b5d9bUcfL8aYw,en,,"Wow, this is a clean and neat establishment! W...","[{'opinion': 'clean', 'aspect': 'establishment...","[[Wow, ,, this, is, a, clean, and, neat, estab..."
3,NW366Ota9w52nYofW99u1g,,-1,iwIJDqCQfxpfnKTYu6vbNQ,en,,This place was hard to find and pretty much a ...,"[{'opinion': 'hard to find', 'aspect': 'place'...","[[This, place, was, hard, to, find, and, prett..."
4,NW366Ota9w52nYofW99u1g,,-1,yq1ZN_38X3hOWrUJ_ZTKCw,en,,"Yep, read the other reviews and decided to try...","[{'opinion': 'OK', 'aspect': 'ticket prices', ...","[[Yep, ,, read, the, other, reviews, and, deci..."
...,...,...,...,...,...,...,...,...,...
1038121,6bgjcFOy4WHMyw62_1V9Pw,,-1,gTJVaTPMyVzjzDhaX447hA,en,,"We ordered two sandwiches, two sodas, and a bo...","[{'opinion': 'mediocre', 'aspect': 'Sandwich',...","[[We, ordered, two, sandwiches, ,, two, sodas,..."
1038122,6bgjcFOy4WHMyw62_1V9Pw,,-1,SzLKZVb24ZlenSBFTBGGTQ,en,,"Mehh... came in at 9pm, one hour before close....","[{'opinion': 'Saddest', 'aspect': 'grilled che...","[[Mehh, ..., came, in, at, 9, pm, ,, one, hour..."
1038123,6bgjcFOy4WHMyw62_1V9Pw,,-1,2Lh31bK4AFis5WKXcutDdQ,en,,I love coming to Panera! Ashley Love and Luke ...,"[{'opinion': 'amazing and friendly', 'aspect':...","[[I, love, coming, to, Panera, !], [Ashley, Lo..."
1038124,6bgjcFOy4WHMyw62_1V9Pw,,-1,EkC8wieEPelR_EzSmQchdQ,en,,So I walk into The Panera Rest. On Ft Apache t...,[],"[[So, I, walk, into, The, Panera, Rest, .], [O..."


## Opinion Extraction Preview

In [5]:
df['extraction_len'] = df['extractions'].apply(lambda x: len(x))
df['sentence_len'] = df['sentences'].apply(lambda x: len(x))

In [6]:
test_idx = 2
print(df['content'].iloc[test_idx])
print(len(df['sentences'].iloc[test_idx]))
df['extractions'].iloc[test_idx]

Wow, this is a clean and neat establishment! We came in for a 3:30 show on Friday and shared the ~600-person theater with 20-something people. The lobby and hall were pretty dark, and the entrance itself was sketchily situated on a backlot-like side road/street/alley, but the bathroom, theater and seating looked and smelled very clean and sterile, and the staff was very courteous. The sound might've been a bit low, but otherwise it was a clean, straightforward and intimate experience, and if you find yourself on the Vegas Strip and looking for a mainstream/Hollywood/blockbuster movie, this is a great find.
4


[{'opinion': 'clean',
  'aspect': 'establishment',
  'sid': 0,
  'asp_start': 8,
  'asp_end': 8,
  'op_start': 5,
  'op_end': 5,
  'attribute': 'restaurant -> atmosphere',
  'sentiment': 'positive'},
 {'opinion': '~600-person',
  'aspect': 'theater',
  'sid': 1,
  'asp_start': 15,
  'asp_end': 15,
  'op_start': 14,
  'op_end': 14,
  'attribute': 'restaurant -> atmosphere',
  'sentiment': 'neutral'},
 {'opinion': 'pretty dark',
  'aspect': 'hall',
  'sid': 2,
  'asp_start': 3,
  'asp_end': 3,
  'op_start': 5,
  'op_end': 6,
  'attribute': 'restaurant -> atmosphere',
  'sentiment': 'negative'},
 {'opinion': 'very courteous',
  'aspect': 'staff',
  'sid': 2,
  'asp_start': 44,
  'asp_end': 44,
  'op_start': 46,
  'op_end': 47,
  'attribute': 'staff',
  'sentiment': 'positive'},
 {'opinion': 'sketchily situated',
  'aspect': 'entrance',
  'sid': 2,
  'asp_start': 10,
  'asp_end': 10,
  'op_start': 13,
  'op_end': 14,
  'attribute': 'restaurant -> atmosphere',
  'sentiment': 'negative'},
 {

# Data Sampling

## Sample Top Business Categories

In [7]:
df_yelp_business = pd.read_json("../data/yelp/business.json", lines=True)

In [8]:
df_yelp_business = df_yelp_business.rename(columns={'business_id': 'ty_id'})
df = df.merge(df_yelp_business[['ty_id', 'name', 'categories']], on=['ty_id'])
df['categories'].value_counts().index.tolist()
df['categories_list'] = df['categories'].apply(lambda x: x.split(", ") if pd.notnull(x) else [])
df = df.reset_index()

**Get Top Businesses Categories**

In [9]:
top_domains = df['categories_list'].explode().value_counts()
temp_df = df.explode(['categories_list'])
temp_df = temp_df.set_index('categories_list').loc[top_domains.index.tolist(),].reset_index()
temp_df = temp_df.drop_duplicates(subset=['index'])

In [10]:
temp_df['categories_list'].value_counts()

Restaurants                     704679
Beauty & Spas                    73428
Food                             69920
Automotive                       38373
Shopping                         34190
Nightlife                        20970
Home Services                    19407
Event Planning & Services        18336
Active Life                      12933
Arts & Entertainment             12094
Health & Medical                 10527
Pets                              9665
Local Services                    6598
Hotels & Travel                   4739
Professional Services              706
Public Services & Government       450
Education                          404
Financial Services                 347
Religious Organizations            137
Local Flavor                       128
Mass Media                          95
Name: categories_list, dtype: int64

**Select top business categories**

In [11]:
selected_top_domains = ['Restaurants', 'Hotels', 'Beauty & Spas', 'Arts & Entertainment', 'Automotive']
selected_df = df[df['categories_list'].apply(lambda x: any([cat in selected_top_domains for cat in x]))]
selected_df['domain'] = selected_df['categories_list'].apply(lambda x: list(set(x) & set(selected_top_domains)))
selected_df = selected_df[selected_df['domain'].str.len() == 1]
selected_df['domain'] = selected_df['domain'].apply(lambda x: x[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['domain'] = selected_df['categories_list'].apply(lambda x: list(set(x) & set(selected_top_domains)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df['domain'] = selected_df['domain'].apply(lambda x: x[0])


In [12]:
selected_df['domain'].value_counts()

Restaurants             686343
Beauty & Spas            72324
Automotive               41784
Arts & Entertainment     25249
Hotels                    8658
Name: domain, dtype: int64

## Sample Top Businesses Per Category

In [13]:
k = 50
# k = 30

In [14]:
top_k_business = []
for domain in selected_df['domain'].unique():
    top_k_business += selected_df.loc[selected_df['domain'] == domain, 'ty_id'].value_counts().head(k).index.tolist()

In [15]:
selected_df = selected_df[df['ty_id'].isin(top_k_business)]

  selected_df = selected_df[df['ty_id'].isin(top_k_business)]


In [16]:
selected_df

Unnamed: 0,index,ty_id,review_date,score,author,language,title,content,extractions,sentences,extraction_len,sentence_len,name,categories,categories_list,domain
306994,306994,iCCsd62g79PDjswXaqxltw,,-1,S4nbxIsCCYj0VIhRF8BuDg,en,,Hey Mrs. Robinson! \n\nThe Graduate was a fant...,"[{'opinion': 'modern', 'aspect': 'twist', 'sid...","[[Hey, Mrs., Robinson, !, ], [The, Graduate,...",4,6,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306995,306995,iCCsd62g79PDjswXaqxltw,,-1,HNmHme33Ar8EnskRVmBypg,en,,"The rooms are decent sized, and the view is aw...","[{'opinion': 'decent sized', 'aspect': 'rooms'...","[[The, rooms, are, decent, sized, ,, and, the,...",7,6,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306996,306996,iCCsd62g79PDjswXaqxltw,,-1,AFBb3pYlCVTpBy2ZvvqsiA,en,,My son is a future sun devil and I found this ...,"[{'opinion': 'awesome', 'aspect': 'employees',...","[[My, son, is, a, future, sun, devil, and, I, ...",2,3,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306997,306997,iCCsd62g79PDjswXaqxltw,,-1,4tUGHviSJ7RPpxHhbXiuQw,en,,Awesome place to stay when near ASU. Friendly ...,"[{'opinion': 'Awesome', 'aspect': 'place', 'si...","[[Awesome, place, to, stay, when, near, ASU, ....",7,5,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306998,306998,iCCsd62g79PDjswXaqxltw,,-1,GLZP_OfjcQLWivre2uF-Zw,en,,Bad hotel. Very run down. Light broken in our ...,"[{'opinion': 'Bad', 'aspect': 'hotel', 'sid': ...","[[Bad, hotel, .], [Very, run, down, .], [Light...",2,11,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038121,1038121,6bgjcFOy4WHMyw62_1V9Pw,,-1,gTJVaTPMyVzjzDhaX447hA,en,,"We ordered two sandwiches, two sodas, and a bo...","[{'opinion': 'mediocre', 'aspect': 'Sandwich',...","[[We, ordered, two, sandwiches, ,, two, sodas,...",5,8,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038122,1038122,6bgjcFOy4WHMyw62_1V9Pw,,-1,SzLKZVb24ZlenSBFTBGGTQ,en,,"Mehh... came in at 9pm, one hour before close....","[{'opinion': 'Saddest', 'aspect': 'grilled che...","[[Mehh, ..., came, in, at, 9, pm, ,, one, hour...",2,11,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038123,1038123,6bgjcFOy4WHMyw62_1V9Pw,,-1,2Lh31bK4AFis5WKXcutDdQ,en,,I love coming to Panera! Ashley Love and Luke ...,"[{'opinion': 'amazing and friendly', 'aspect':...","[[I, love, coming, to, Panera, !], [Ashley, Lo...",3,5,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038124,1038124,6bgjcFOy4WHMyw62_1V9Pw,,-1,EkC8wieEPelR_EzSmQchdQ,en,,So I walk into The Panera Rest. On Ft Apache t...,[],"[[So, I, walk, into, The, Panera, Rest, .], [O...",0,7,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants


# Extract Comments

In [17]:
selected_df

Unnamed: 0,index,ty_id,review_date,score,author,language,title,content,extractions,sentences,extraction_len,sentence_len,name,categories,categories_list,domain
306994,306994,iCCsd62g79PDjswXaqxltw,,-1,S4nbxIsCCYj0VIhRF8BuDg,en,,Hey Mrs. Robinson! \n\nThe Graduate was a fant...,"[{'opinion': 'modern', 'aspect': 'twist', 'sid...","[[Hey, Mrs., Robinson, !, ], [The, Graduate,...",4,6,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306995,306995,iCCsd62g79PDjswXaqxltw,,-1,HNmHme33Ar8EnskRVmBypg,en,,"The rooms are decent sized, and the view is aw...","[{'opinion': 'decent sized', 'aspect': 'rooms'...","[[The, rooms, are, decent, sized, ,, and, the,...",7,6,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306996,306996,iCCsd62g79PDjswXaqxltw,,-1,AFBb3pYlCVTpBy2ZvvqsiA,en,,My son is a future sun devil and I found this ...,"[{'opinion': 'awesome', 'aspect': 'employees',...","[[My, son, is, a, future, sun, devil, and, I, ...",2,3,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306997,306997,iCCsd62g79PDjswXaqxltw,,-1,4tUGHviSJ7RPpxHhbXiuQw,en,,Awesome place to stay when near ASU. Friendly ...,"[{'opinion': 'Awesome', 'aspect': 'place', 'si...","[[Awesome, place, to, stay, when, near, ASU, ....",7,5,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
306998,306998,iCCsd62g79PDjswXaqxltw,,-1,GLZP_OfjcQLWivre2uF-Zw,en,,Bad hotel. Very run down. Light broken in our ...,"[{'opinion': 'Bad', 'aspect': 'hotel', 'sid': ...","[[Bad, hotel, .], [Very, run, down, .], [Light...",2,11,Graduate Tempe,"Venues & Event Spaces, Event Planning & Servic...","[Venues & Event Spaces, Event Planning & Servi...",Hotels
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038121,1038121,6bgjcFOy4WHMyw62_1V9Pw,,-1,gTJVaTPMyVzjzDhaX447hA,en,,"We ordered two sandwiches, two sodas, and a bo...","[{'opinion': 'mediocre', 'aspect': 'Sandwich',...","[[We, ordered, two, sandwiches, ,, two, sodas,...",5,8,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038122,1038122,6bgjcFOy4WHMyw62_1V9Pw,,-1,SzLKZVb24ZlenSBFTBGGTQ,en,,"Mehh... came in at 9pm, one hour before close....","[{'opinion': 'Saddest', 'aspect': 'grilled che...","[[Mehh, ..., came, in, at, 9, pm, ,, one, hour...",2,11,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038123,1038123,6bgjcFOy4WHMyw62_1V9Pw,,-1,2Lh31bK4AFis5WKXcutDdQ,en,,I love coming to Panera! Ashley Love and Luke ...,"[{'opinion': 'amazing and friendly', 'aspect':...","[[I, love, coming, to, Panera, !], [Ashley, Lo...",3,5,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants
1038124,1038124,6bgjcFOy4WHMyw62_1V9Pw,,-1,EkC8wieEPelR_EzSmQchdQ,en,,So I walk into The Panera Rest. On Ft Apache t...,[],"[[So, I, walk, into, The, Panera, Rest, .], [O...",0,7,Panera Bread,"Breakfast & Brunch, Soup, Food, Bagels, Salad,...","[Breakfast & Brunch, Soup, Food, Bagels, Salad...",Restaurants


**Extract comments from reviews**

In [18]:
def extract_opinionated_comments_from_review(grp):
    extractions = grp['extractions'].iloc[0]
    sentences = grp['sentences'].iloc[0]
    
    opinionated_sid_map = {}
    for opinion in (extractions):
        if opinion['sid'] in opinionated_sid_map:
            opinionated_sid_map[opinion['sid']] += [opinion]
        else:
            opinionated_sid_map[opinion['sid']] = [opinion]
    
    opinionated_sentences = [" ".join(sentences[i]) for i in opinionated_sid_map.keys()]
    opinions = []
    aspects = []
    attributes = []
    sentiments = []
    opinion_aspect_pairs = []
    for sent_opinions in opinionated_sid_map.values():
        opinions += [[opinion['opinion'] for opinion in sent_opinions]]
        aspects += [[opinion['aspect'] for opinion in sent_opinions]]
        attributes += [[opinion['attribute'] for opinion in sent_opinions]]
        sentiments += [[opinion['sentiment'] for opinion in sent_opinions]]
        opinion_aspect_pairs += [[opinion['opinion'] + " " + opinion['aspect'] for opinion in sent_opinions]]
    
    opinion_sent_df = pd.DataFrame({'sentences': opinionated_sentences})
    opinion_sent_df['opinions'] = opinions
    opinion_sent_df['aspects'] = aspects
    opinion_sent_df['opinion_aspect_pairs'] = opinion_aspect_pairs
    opinion_sent_df['sid'] = opinionated_sid_map.keys()
    opinion_sent_df['attributes'] = attributes
    opinion_sent_df['sentiments'] = sentiments
    opinion_sent_df['review_content'] = grp['content'].iloc[0]
    opinion_sent_df.insert(0, 'business_id', grp['ty_id'].iloc[0])
    opinion_sent_df.insert(0, 'name', grp['name'].iloc[0])
    opinion_sent_df.insert(0, 'categories', grp['categories'].iloc[0])
    opinion_sent_df.insert(0, 'categories_list', [grp['categories_list'].iloc[0] for i in range(len(opinion_sent_df))])
    opinion_sent_df.insert(0, 'domain', grp['domain'].iloc[0])
    
    return opinion_sent_df

In [20]:
yelp_sent_df = selected_df.reset_index().groupby(['index']).apply(extract_opinionated_comments_from_review)

In [21]:
yelp_sent_df

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,review_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
306994,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Gorgeous 70 's vintage feel with that modern t...,"[modern, boutique style, Gorgeous]","[twist, hotel, feel]","[modern twist, boutique style hotel, Gorgeous ...",2.0,"[restaurant -> atmosphere, restaurant -> atmos...","[positive, positive, positive]",Hey Mrs. Robinson! \n\nThe Graduate was a fant...
306994,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,I especially liked the convenience of the Norm...,[liked],[convenience],[liked convenience],3.0,[food -> variety],[positive],Hey Mrs. Robinson! \n\nThe Graduate was a fant...
306995,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,"The rooms are decent sized , and the view is a...","[decent sized, awesome]","[rooms, view]","[decent sized rooms, awesome view]",0.0,"[food-quantity, restaurant -> atmosphere]","[positive, positive]","The rooms are decent sized, and the view is aw..."
306995,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Employees for the most part after nice althoug...,"[metrosexual, nice]","[men, Employees]","[metrosexual men, nice Employees]",1.0,"[food -> variety, staff]","[negative, positive]","The rooms are decent sized, and the view is aw..."
306995,2,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,The ' theme ' of the hotel I found to be kind ...,[cheesy],[hotel],[cheesy hotel],3.0,[food -> quality],[neutral],"The rooms are decent sized, and the view is aw..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038122,1,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup was ok .,[ok],[Tomato soup],[ok Tomato soup],6.0,[food -> quality],[neutral],"Mehh... came in at 9pm, one hour before close...."
1038123,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,The staff here is so amazing and friendly !,[amazing and friendly],[staff],[amazing and friendly staff],2.0,[staff],[positive],I love coming to Panera! Ashley Love and Luke ...
1038123,1,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great food and wonderful service !,"[Great, wonderful]","[food, service]","[Great food, wonderful service]",4.0,"[food -> quality, staff]","[positive, positive]",I love coming to Panera! Ashley Love and Luke ...
1038125,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Love Panera bread !,[Love],[Panera],[Love Panera],0.0,[food -> quality],[positive],Love Panera bread! So lucky to have them cater...


In [22]:
yelp_sent_df.to_pickle("../data/yelp/snippext_yelp_sent_df.pkl")

# Data Filter

In [23]:
yelp_sent_df = pd.read_pickle("../data/yelp/snippext_yelp_sent_df.pkl")

In [24]:
yelp_sent_df

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,review_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
306994,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Gorgeous 70 's vintage feel with that modern t...,"[modern, boutique style, Gorgeous]","[twist, hotel, feel]","[modern twist, boutique style hotel, Gorgeous ...",2.0,"[restaurant -> atmosphere, restaurant -> atmos...","[positive, positive, positive]",Hey Mrs. Robinson! \n\nThe Graduate was a fant...
306994,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,I especially liked the convenience of the Norm...,[liked],[convenience],[liked convenience],3.0,[food -> variety],[positive],Hey Mrs. Robinson! \n\nThe Graduate was a fant...
306995,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,"The rooms are decent sized , and the view is a...","[decent sized, awesome]","[rooms, view]","[decent sized rooms, awesome view]",0.0,"[food-quantity, restaurant -> atmosphere]","[positive, positive]","The rooms are decent sized, and the view is aw..."
306995,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Employees for the most part after nice althoug...,"[metrosexual, nice]","[men, Employees]","[metrosexual men, nice Employees]",1.0,"[food -> variety, staff]","[negative, positive]","The rooms are decent sized, and the view is aw..."
306995,2,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,The ' theme ' of the hotel I found to be kind ...,[cheesy],[hotel],[cheesy hotel],3.0,[food -> quality],[neutral],"The rooms are decent sized, and the view is aw..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038122,1,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup was ok .,[ok],[Tomato soup],[ok Tomato soup],6.0,[food -> quality],[neutral],"Mehh... came in at 9pm, one hour before close...."
1038123,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,The staff here is so amazing and friendly !,[amazing and friendly],[staff],[amazing and friendly staff],2.0,[staff],[positive],I love coming to Panera! Ashley Love and Luke ...
1038123,1,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Great food and wonderful service !,"[Great, wonderful]","[food, service]","[Great food, wonderful service]",4.0,"[food -> quality, staff]","[positive, positive]",I love coming to Panera! Ashley Love and Luke ...
1038125,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Love Panera bread !,[Love],[Panera],[Love Panera],0.0,[food -> quality],[positive],Love Panera bread! So lucky to have them cater...


## Select single-aspect comments

In [25]:
yelp_sent_df = yelp_sent_df[yelp_sent_df['aspects'].str.len() == 1]

In [26]:
yelp_sent_df['sentiments'].value_counts()

[positive]    44612
[negative]    12193
[neutral]      4021
Name: sentiments, dtype: int64

## Filter neutral comments

In [27]:
yelp_sent_df = yelp_sent_df[yelp_sent_df['sentiments'].apply(lambda x: all([sentiment != 'neutral' for sentiment in x]))]

In [28]:
yelp_sent_df

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,review_content
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
306994,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,I especially liked the convenience of the Norm...,[liked],[convenience],[liked convenience],3.0,[food -> variety],[positive],Hey Mrs. Robinson! \n\nThe Graduate was a fant...
306995,3,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Oh and hallway carpet is terrible .,[terrible],[hallway carpet],[terrible hallway carpet],4.0,[restaurant -> atmosphere],[negative],"The rooms are decent sized, and the view is aw..."
306995,4,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Overall very pleasant stay and would like to r...,[very pleasant],[stay],[very pleasant stay],5.0,[wait-time],[positive],"The rooms are decent sized, and the view is aw..."
306997,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Awesome place to stay when near ASU .,[Awesome],[place],[Awesome place],0.0,[recommendation],[positive],Awesome place to stay when near ASU. Friendly ...
306997,3,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,"However , the carpet pattern on the floor leve...",[hideous],[floor levels],[hideous floor levels],3.0,[restaurant -> atmosphere],[negative],Awesome place to stay when near ASU. Friendly ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038121,4,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Table was weird , dirty , and wobbly , like it...",[weird],[Table],[weird Table],6.0,[restaurant -> comfort],[negative],"We ordered two sandwiches, two sodas, and a bo..."
1038122,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,- Saddest grilled cheese,[Saddest],[grilled cheese],[Saddest grilled cheese],4.0,[food -> quality],[negative],"Mehh... came in at 9pm, one hour before close...."
1038123,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,The staff here is so amazing and friendly !,[amazing and friendly],[staff],[amazing and friendly staff],2.0,[staff],[positive],I love coming to Panera! Ashley Love and Luke ...
1038125,0,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Love Panera bread !,[Love],[Panera],[Love Panera],0.0,[food -> quality],[positive],Love Panera bread! So lucky to have them cater...


In [29]:
yelp_sent_df['sentiments'].value_counts()

[positive]    44612
[negative]    12193
Name: sentiments, dtype: int64

In [30]:
yelp_sent_df.shape

(56805, 13)

## Select high-quality comments

**Sample top 100 business per sentiment struct & >= review sent**

In [31]:
yelp_sent_df = yelp_sent_df.reset_index()

In [32]:
yelp_sent_df = yelp_sent_df.groupby(['index']).filter(lambda grp: len(grp) > 2)

In [33]:
yelp_sent_df = yelp_sent_df.set_index(['index', 'level_1']).sort_index()

In [34]:
yelp_sent_df

Unnamed: 0_level_0,Unnamed: 1_level_0,domain,categories_list,categories,name,business_id,sentences,opinions,aspects,opinion_aspect_pairs,sid,attributes,sentiments,review_content
index,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
306997,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Awesome place to stay when near ASU .,[Awesome],[place],[Awesome place],0.0,[recommendation],[positive],Awesome place to stay when near ASU. Friendly ...
306997,3,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,"However , the carpet pattern on the floor leve...",[hideous],[floor levels],[hideous floor levels],3.0,[restaurant -> atmosphere],[negative],Awesome place to stay when near ASU. Friendly ...
306997,4,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,The bar at the bottom has decent servers .,[decent],[servers],[decent servers],4.0,[staff],[positive],Awesome place to stay when near ASU. Friendly ...
307005,0,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,Reasonable price .,[Reasonable],[price],[Reasonable price],1.0,[value-for-money],[positive],"Didn't know what to expect, but we were pleas..."
307005,1,Hotels,"[Venues & Event Spaces, Event Planning & Servi...","Venues & Event Spaces, Event Planning & Servic...",Graduate Tempe,iCCsd62g79PDjswXaqxltw,The decor is unjust and college themed .,[unjust],[decor],[unjust decor],2.0,[restaurant -> atmosphere],[negative],"Didn't know what to expect, but we were pleas..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038117,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Tomato soup is always perfect .,[always perfect],[Tomato soup],[always perfect Tomato soup],4.0,[food -> quality],[positive],This location is closest to my place in Spring...
1038117,4,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,My favorite location is the Boca Park just bec...,[favorite],[location],[favorite location],5.0,[recommendation],[positive],This location is closest to my place in Spring...
1038121,2,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,Place felt dirty and disorganized .,[dirty],[Place],[dirty Place],4.0,[restaurant -> atmosphere],[negative],"We ordered two sandwiches, two sodas, and a bo..."
1038121,3,Restaurants,"[Breakfast & Brunch, Soup, Food, Bagels, Salad...","Breakfast & Brunch, Soup, Food, Bagels, Salad,...",Panera Bread,6bgjcFOy4WHMyw62_1V9Pw,"Place was deserted on Saturday afternoon , now...",[deserted],[Place],[deserted Place],5.0,[restaurant -> atmosphere],[negative],"We ordered two sandwiches, two sodas, and a bo..."


In [35]:
yelp_sent_df.to_pickle("../data/yelp/snippext_yelp_sent_df_single_aspect_5_domains_top_50.pkl")