In [1]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Function to clean the review text
def clean_text(text):
      
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove unnecessary characters like special symbols
    text = re.sub(r'[^A-Za-z0-9\s]+.,', '', text)  # Keep only alphanumeric characters and spaces
    # Convert to lowercase
    #text = text.lower()
    return text

# Function to standardize the review dates
def standardize_date(unix_timestamp):
    try:
        return pd.to_datetime(unix_timestamp, unit='s', errors='coerce').date() # 'coerce' returns NaT for invalid dates
    except:
        return None

# Function to clean restaurant ratings (ensure ratings are between 1 and 5)
def clean_rating(rating):
    try:
        if pd.isna(rating):  # Handle missing ratings
            return None
        elif 1 <= rating <= 5:  # Ensure rating is within the valid range
            return rating
        else:
            return None  # Handle out-of-range ratings
    except:
        return None

In [3]:
# Load the CSV file into a DataFrame
csv_file = '../data/austin_restaurant_reviews.csv'
df = pd.read_csv(csv_file)


In [4]:


# Step 1: Load the existing CSV file into a DataFrame

# Step 2: Load the data from the data.pkl file
pkl_file_path = '../data/data.pkl'
df_additional_data = pd.read_pickle(pkl_file_path)

# Step 3: Check the shape of both DataFrames to ensure alignment
print("Shape of reviews DataFrame:", df.shape)
print("Shape of additional data DataFrame:", df_additional_data.shape)

# Assuming you want to add a column from df_additional_data to df
# Ensure that df_additional_data contains a column that can be used to merge (like 'Restaurant ID' or similar)
# For example, let's say we want to add a column named 'New_Column' from df_additional_data


# Option B: If the DataFrames align and you simply want to add a column
# Ensure both DataFrames have the same number of rows before using this method
if df.shape[0] == df_additional_data.shape[0]:
    df['Review_Date'] = df_additional_data['New_Column']  
else:
    print("DataFrames do not have the same number of rows. Cannot add column directly.")

# Step 4: Save the updated DataFrame back to a CSV file
df.to_csv('updated_austin_restaurant_reviews.csv', index=False)

print("Updated CSV saved as 'updated_austin_restaurant_reviews.csv'.")


Shape of reviews DataFrame: (300, 7)
Shape of additional data DataFrame: (60, 1)
DataFrames do not have the same number of rows. Cannot add column directly.
Updated CSV saved as 'updated_austin_restaurant_reviews.csv'.


In [7]:
dfp = pd.read_pickle('../data/data.pkl')
dfp.head()

Unnamed: 0,details
0,"{'formatted_address': '900 E 11th St, Austin, ..."
1,"{'formatted_address': '1316 S Congress Ave, Au..."
2,"{'formatted_address': '801 Red River St, Austi..."
3,"{'formatted_address': '303 Red River St, Austi..."
4,"{'formatted_address': '1917 Manor Rd, Austin, ..."


In [8]:
dfp[dfp['details'].astype(str).str.contains('Beware of this place', case=False)]

Unnamed: 0,details
49,"{'formatted_address': '1701 Toomey Rd, Austin,..."


In [9]:
dfp.iloc[49].values[0]

{'formatted_address': '1701 Toomey Rd, Austin, TX 78704, USA',
 'name': 'Casa de Luz Village',
 'price_level': 1,
 'rating': 4.7,
 'reviews': [{'author_name': 'Gabby',
   'author_url': 'https://www.google.com/maps/contrib/112169170583194282872/reviews',
   'language': 'en',
   'original_language': 'en',
   'profile_photo_url': 'https://lh3.googleusercontent.com/a/ACg8ocKnyr-EEvYHSF9i0_uMpwHQ1F7orbsM_GYF1wQGnxexN1I3CBU=s128-c0x00000000-cc-rp-mo',
   'rating': 5,
   'relative_time_description': 'a month ago',
   'text': 'Cannot say enough good things about this place. This is my most favorite place to eat in Austin. I love the atmosphere, the food, the workers, everything. I’ve eaten here countless times and it’s always been perfect. The food is high quality, unprocessed and all plant based whole foods. I tell everyone I know to come here. I’ve recently moved out of state and everytime I come back to Austin, this is where I go. They have a beautiful outdoor seating area where you are can

In [9]:
# Function to extract and organize reviews for each restaurant
def collect_reviews(d):
    reviews_list = []
    
    for details in d:
        if details:
            reviews = details.get('reviews', [])
            for review in reviews:
                print(review.get('rating', 'No review rating provided'))
                reviews_list.append({
                    'Restaurant': details.get('name', 'Unknown'),
                    'Address': details.get('formatted_address', 'Unknown'),
                    'Rating': details.get('rating', 'No rating provided'),
                    'Review Text': review.get('text', 'No review text available'),
                    'Review Rating': review.get('rating', 'No review rating provided'),
                    'Review Time': review.get('relative_time_description', 'No time information'),
                    'Time': review.get('time', 'No time information')
                })
    
    return reviews_list


In [13]:
# reviews = collect_reviews(dfp['details'].values)
df.head()


Unnamed: 0,Restaurant,Address,Rating,Review Text,Review Rating,Review Time,Time
0,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,We got this delivered to our house in Washingt...,5,a month ago,1725127412
1,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,100/100 hands down the best bbq in the country...,5,a week ago,1727201997
2,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Franklin Barbecue truly lives up to the hype! ...,5,a month ago,1725236925
3,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Going to Franklin takes a level of commitment ...,5,a week ago,1727041818
4,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Without a doubt the Greatest of All Time!\n\nB...,5,2 weeks ago,1726585483


In [15]:
reviews_df = pd.DataFrame(reviews)

In [16]:
filename = '../data/austin_restaurant_reviews.csv'
reviews_df.to_csv(filename, index=False)

In [17]:
reviews_df

Unnamed: 0,Restaurant,Address,Rating,Review Text,Review Rating,Review Time,Time
0,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,We got this delivered to our house in Washingt...,5,a month ago,1725127412
1,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,100/100 hands down the best bbq in the country...,5,a week ago,1727201997
2,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Franklin Barbecue truly lives up to the hype! ...,5,a month ago,1725236925
3,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Going to Franklin takes a level of commitment ...,5,a week ago,1727041818
4,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA",4.7,Without a doubt the Greatest of All Time!\n\nB...,5,2 weeks ago,1726585483
...,...,...,...,...,...,...,...
295,Musashino Sushi Dokoro,"2905 San Gabriel St Suite 200, Austin, TX 7870...",4.5,My husband and I absolutely love Musashino and...,5,2 weeks ago,1726175098
296,Musashino Sushi Dokoro,"2905 San Gabriel St Suite 200, Austin, TX 7870...",4.5,Do not eat here as it is far inferior to the b...,1,2 months ago,1722558335
297,Musashino Sushi Dokoro,"2905 San Gabriel St Suite 200, Austin, TX 7870...",4.5,"Not sure about the hype, the place has nice de...",2,a week ago,1726874664
298,Musashino Sushi Dokoro,"2905 San Gabriel St Suite 200, Austin, TX 7870...",4.5,"Excellent, fresh sushi at a reasonable price! ...",5,2 months ago,1722536140


In [18]:
# Apply the cleaning functions to the DataFrame
# df['cleaned_review'] = reviews_df['Review Text'].apply(clean_text)           # Clean review text
df['standardized_date'] = reviews_df['Time'].apply(standardize_date)  # Standardize review date
df['cleaned_rating'] = reviews_df['Review Rating'].apply(clean_rating)              # Clean and standardize ratings

# Handle missing data
# Option 1: Drop rows with any missing data
df_cleaned = df.dropna(subset=['standardized_date', 'cleaned_rating'])

# Option 2: You can choose to fill missing ratings with the mean (comment/uncomment as per need)
# df['cleaned_rating'] = df['cleaned_rating'].fillna(df['cleaned_rating'].mean())

# Organize the cleaned data into a new DataFrame
df_cleaned = df[['Review Text', 'standardized_date', 'cleaned_rating', 'Restaurant', 'Address']]

# Save the cleaned DataFrame into a new CSV file
df_cleaned.to_csv('cleaned_restaurant_reviews.csv', index=False)

# Preview the final cleaned data
df_cleaned.head()




Unnamed: 0,Review Text,standardized_date,cleaned_rating,Restaurant,Address
0,We got this delivered to our house in Washingt...,2024-08-31,5,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA"
1,100/100 hands down the best bbq in the country...,2024-09-24,5,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA"
2,Franklin Barbecue truly lives up to the hype! ...,2024-09-02,5,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA"
3,Going to Franklin takes a level of commitment ...,2024-09-22,5,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA"
4,Without a doubt the Greatest of All Time!\n\nB...,2024-09-17,5,Franklin Barbecue,"900 E 11th St, Austin, TX 78702, USA"


In [19]:
df_cleaned.to_csv('cleaned_restaurant_reviews.csv', index=False)

In [131]:
df.iloc[122]['Review Text']

'Lovely cafe with tasty crepes. We had the La Provencale and the Chloe crepes. Great service. Small on the inside and plenty of seating on the covered patio.'