# Exploring the data

#### Run the cell below to download the data

In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path

load_dotenv()

# 1. Set your Kaggle credentials
kaggle_username = os.getenv('KAGGLE_USERNAME')
kaggle_key = os.getenv('KAGGLE_KEY')

# 2. Define the dataset and target file
dataset_slug = 'denizbilginn/google-maps-restaurant-reviews'
target_file = 'reviews.csv' 
data_dir = Path('../data/raw/')

# 3. Create directory if it doesn't exist
data_dir.mkdir(parents=True, exist_ok=True)

# 4. Download ONLY the specific file
!kaggle datasets download -d {dataset_slug} -f {target_file} -p {data_dir} --unzip -q

Dataset URL: https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews
License(s): ODbL-1.0
^C


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visual style
%matplotlib inline
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)

data_path = "../data/raw/reviews.csv"
df = pd.read_csv(data_path)

print("Dataset Shape:", df.shape)
df.info()
df.head(15)

Dataset Shape: (1100, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1100 entries, 0 to 1099
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   business_name    1100 non-null   object
 1   author_name      1100 non-null   object
 2   text             1100 non-null   object
 3   photo            1100 non-null   object
 4   rating           1100 non-null   int64 
 5   rating_category  1100 non-null   object
dtypes: int64(1), object(5)
memory usage: 51.7+ KB


Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,dataset/taste/hacinin_yeri_gulsum_akar.png,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,dataset/menu/hacinin_yeri_oguzhan_cetin.png,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,dataset/outdoor_atmosphere/hacinin_yeri_yasin_...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,dataset/indoor_atmosphere/hacinin_yeri_orhan_k...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,dataset/menu/hacinin_yeri_ozgur_sati.png,3,menu
5,Haci'nin Yeri - Yigit Lokantasi,Arda Karaca,Generally good.,dataset/indoor_atmosphere/hacinin_yeri_arda_ka...,4,indoor_atmosphere
6,Haci'nin Yeri - Yigit Lokantasi,İrem Eren,What you see is 125 TL in total. It's a pretty...,dataset/taste/hacinin_yeri_irem_eren.png,5,taste
7,Haci'nin Yeri - Yigit Lokantasi,Nadia Salim,Delicious food at rock bottom prices. Friendly...,dataset/taste/hacinin_yeri_nadia_salim.png,5,taste
8,Haci'nin Yeri - Yigit Lokantasi,Mehmet Eser,"Every time I go, I still experience the amazem...",dataset/outdoor_atmosphere/hacinin_yeri_mehmet...,5,outdoor_atmosphere
9,Haci'nin Yeri - Yigit Lokantasi,Celal Ozer,The most f/p of all businesses I've seen.,dataset/indoor_atmosphere/hacinin_yeri_celal_o...,5,indoor_atmosphere


In [3]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

Unnamed: 0,business_name,author_name,text,photo,rating,rating_category
count,1100,1100,1100,1100,1100.0,1100
unique,100,1074,1088,1100,,4
top,Haci'nin Yeri - Yigit Lokantasi,Nihat Karabiber,Great.,dataset/taste/hacinin_yeri_gulsum_akar.png,,taste
freq,11,3,3,1,,330
mean,,,,,3.912727,
std,,,,,1.218459,
min,,,,,1.0,
25%,,,,,3.0,
50%,,,,,4.0,
75%,,,,,5.0,


In [4]:
# Number of missing values in each column.
df.isna().sum()

business_name      0
author_name        0
text               0
photo              0
rating             0
rating_category    0
dtype: int64

In [5]:
# Cast the text-like columns to str and remove the image-path column.
# Built as a non-mutating chain with errors="ignore" so the cell is idempotent:
# re-running it on the same kernel no longer raises KeyError for the
# already-dropped "photo" column.
text_columns = ["business_name", "author_name", "text", "rating_category"]
df = (
    df
    .astype({column: str for column in text_columns})
    .drop(columns=["photo"], errors="ignore")
)

df

Unnamed: 0,business_name,author_name,text,rating,rating_category
0,Haci'nin Yeri - Yigit Lokantasi,Gulsum Akar,We went to Marmaris with my wife for a holiday...,5,taste
1,Haci'nin Yeri - Yigit Lokantasi,Oguzhan Cetin,During my holiday in Marmaris we ate here to f...,4,menu
2,Haci'nin Yeri - Yigit Lokantasi,Yasin Kuyu,Prices are very affordable. The menu in the ph...,3,outdoor_atmosphere
3,Haci'nin Yeri - Yigit Lokantasi,Orhan Kapu,Turkey's cheapest artisan restaurant and its f...,5,indoor_atmosphere
4,Haci'nin Yeri - Yigit Lokantasi,Ozgur Sati,I don't know what you will look for in terms o...,3,menu
...,...,...,...,...,...
1095,Miss Pizza,Salih Gursoy,There are so many types of pizza; you are surp...,5,taste
1096,Miss Pizza,Kemal Amangeldi,I tried the smoked ribeye pizza; the dough is ...,5,indoor_atmosphere
1097,Miss Pizza,Ulkem Esen,Crowded and expensive place.,3,menu
1098,Miss Pizza,Ilkin Saymaz,No bad. It was very crowded; there was no ligh...,3,taste


In [None]:
# Look for reviews that might be advertisements by searching the review text
# for promotional keywords. No such reviews were found when this was first tried.
# The match is case-insensitive and on whole words, so "Promotion" is caught
# while "ad" does not match inside words such as "bad" or "had".
AD_KEYWORDS = ["promotion", "promo code", "ad", "ads", "www"]
ad_pattern = r"\b(?:" + "|".join(AD_KEYWORDS) + r")\b"

# na=False treats missing text as "no match" instead of propagating NaN into the mask.
ad_mask = df["text"].str.contains(ad_pattern, case=False, regex=True, na=False)

for text in df.loc[ad_mask, "text"]:
    print(text)
 

In [15]:
# Write the cleaned reviews to CSV in the working directory, omitting the index column.
df.to_csv(path_or_buf="clean_data.csv", index=False)