In [2]:
# Imports
import pandas as pd
import numpy as np
import spacy
import json
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
#import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
#from tqdm import tqdm


# Apply matplotlib's built-in 'ggplot' style sheet to every figure in this notebook.
plt.style.use('ggplot')
# Render figures inline, directly below the cell that creates them.
# NOTE(review): statement order is kept as-is; activating the inline backend may
# apply its own rc settings depending on the ipykernel version - confirm before reordering.
%matplotlib inline

In [9]:
# Load the review dataset from its parquet file, then preview the first ten rows.
reviews_path = 'file/yelp_foods_mentioned.parquet'
df_1 = pd.read_parquet(reviews_path)
df_1.head(n=10)

Unnamed: 0,id,stars,useful,funny,cool,text,sentiment.negative,sentiment.neutral,sentiment.positive,food mentioned
1251197,2980811,4.0,0,0,0,we landed at the reno airport for our honeymoo...,0.030769,0.244984,0.724247,"[pizza, egg, beef, cake, coffee, tea, pancake,..."
197231,467079,4.0,12,3,7,this atmosphere is certainly geared towards am...,0.032657,0.175753,0.791589,"[pork, shrimp, ribs, bbq, ramen, potato]"
226443,543661,1.0,3,0,0,food and service are key components to enjoy a...,0.891778,0.097133,0.011089,"[rice, dessert]"
263340,638958,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,0.002409,0.013464,0.984127,"[rice, bread, crab, cake, dessert, cocktail]"
151897,352283,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,0.002309,0.009757,0.987934,[egg]
2306988,5523173,5.0,0,0,0,its quite a drive to sparks but there are only...,0.001669,0.02589,0.972441,"[soup, pie, falafel]"
1354912,3227020,5.0,1,0,1,my husband and i visited this location for lun...,0.005359,0.015456,0.979185,"[steak, salad, cheese, shrimp, tea, taco]"
1679076,4020477,4.0,7,0,1,finally stopped by here after getting tired of...,0.012973,0.136857,0.85017,"[chicken, fish, salad, rice, bread, egg, pork,..."
27814,63876,1.0,1,1,0,i went here for lunch after an extremely cold ...,0.937443,0.056307,0.00625,"[soup, sandwich, meat]"
1893515,4523556,5.0,0,0,0,called and ordered the chicken tikka pizza es...,0.007335,0.073568,0.919097,"[pizza, chicken]"


In [14]:
# Label each review with the sentiment class that has the highest score.
# idxmax(axis=1) returns, per row, the *name* of the column holding the maximum,
# e.g. 'sentiment.positive'.
score_columns = ['sentiment.negative', 'sentiment.neutral', 'sentiment.positive']

# Strip the 'sentiment.' prefix so only the class name remains.
# regex=False makes the '.' a literal dot; without it the result depends on the
# installed pandas version (the default for `regex` changed in pandas 2.0).
df_1['sentiment'] = (
    df_1[score_columns]
    .idxmax(axis=1)
    .str.replace('sentiment.', '', regex=False)
)

# Move the current index into a regular column named 'id_review' and replace it
# with a fresh 0..n-1 index. The guard keeps the cell idempotent: re-running it
# would otherwise insert another 'index' column and produce duplicate
# 'id_review' columns.
if 'id_review' not in df_1.columns:
    df_1 = df_1.reset_index().rename(columns={'index': 'id_review'})

# Show the new dataframe structure
df_1.head(10)

Unnamed: 0,id_review,id,id.1,stars,useful,funny,cool,text,sentiment.negative,sentiment.neutral,sentiment.positive,food mentioned,sentiment
0,0,1251197,2980811,4.0,0,0,0,we landed at the reno airport for our honeymoo...,0.030769,0.244984,0.724247,"[pizza, egg, beef, cake, coffee, tea, pancake,...",positive
1,1,197231,467079,4.0,12,3,7,this atmosphere is certainly geared towards am...,0.032657,0.175753,0.791589,"[pork, shrimp, ribs, bbq, ramen, potato]",positive
2,2,226443,543661,1.0,3,0,0,food and service are key components to enjoy a...,0.891778,0.097133,0.011089,"[rice, dessert]",negative
3,3,263340,638958,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,0.002409,0.013464,0.984127,"[rice, bread, crab, cake, dessert, cocktail]",positive
4,4,151897,352283,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,0.002309,0.009757,0.987934,[egg],positive
5,5,2306988,5523173,5.0,0,0,0,its quite a drive to sparks but there are only...,0.001669,0.02589,0.972441,"[soup, pie, falafel]",positive
6,6,1354912,3227020,5.0,1,0,1,my husband and i visited this location for lun...,0.005359,0.015456,0.979185,"[steak, salad, cheese, shrimp, tea, taco]",positive
7,7,1679076,4020477,4.0,7,0,1,finally stopped by here after getting tired of...,0.012973,0.136857,0.85017,"[chicken, fish, salad, rice, bread, egg, pork,...",positive
8,8,27814,63876,1.0,1,1,0,i went here for lunch after an extremely cold ...,0.937443,0.056307,0.00625,"[soup, sandwich, meat]",negative
9,9,1893515,4523556,5.0,0,0,0,called and ordered the chicken tikka pizza es...,0.007335,0.073568,0.919097,"[pizza, chicken]",positive


In [15]:
# Work on an independent (deep) copy so the column drops below leave df_1 intact.
df_new = df_1.copy(deep=True)

In [16]:
# Remove the raw per-class score columns and the 'id' column from the copy;
# the derived 'sentiment' label is kept.
columns_to_remove = [
    'sentiment.negative',
    'sentiment.neutral',
    'sentiment.positive',
    'id',
]
df_new.drop(columns=columns_to_remove, inplace=True)

In [17]:
# Preview the first ten rows of the trimmed frame.
df_new.head(n=10)

Unnamed: 0,id_review,stars,useful,funny,cool,text,food mentioned,sentiment
0,0,4.0,0,0,0,we landed at the reno airport for our honeymoo...,"[pizza, egg, beef, cake, coffee, tea, pancake,...",positive
1,1,4.0,12,3,7,this atmosphere is certainly geared towards am...,"[pork, shrimp, ribs, bbq, ramen, potato]",positive
2,2,1.0,3,0,0,food and service are key components to enjoy a...,"[rice, dessert]",negative
3,3,5.0,4,2,4,cyranos is a hidden gem that i wish i had know...,"[rice, bread, crab, cake, dessert, cocktail]",positive
4,4,5.0,1,0,2,wow kekes truly impressed me this afternoon m...,[egg],positive
5,5,5.0,0,0,0,its quite a drive to sparks but there are only...,"[soup, pie, falafel]",positive
6,6,5.0,1,0,1,my husband and i visited this location for lun...,"[steak, salad, cheese, shrimp, tea, taco]",positive
7,7,4.0,7,0,1,finally stopped by here after getting tired of...,"[chicken, fish, salad, rice, bread, egg, pork,...",positive
8,8,1.0,1,1,0,i went here for lunch after an extremely cold ...,"[soup, sandwich, meat]",negative
9,9,5.0,0,0,0,called and ordered the chicken tikka pizza es...,"[pizza, chicken]",positive


In [18]:
# (rows, columns) of the trimmed frame.
df_new.shape

(10000, 8)

In [19]:
# (rows, columns) of the full frame, for comparison with df_new.
df_1.shape

(10000, 13)

In [20]:
# Write the trimmed frame to a parquet file in the current working directory.
output_path = 'yelp_dataset.parquet'
df_new.to_parquet(output_path)