# Case Study 3

In [6]:
import duckdb
import pandas as pd

### Creating Table Source for review_specific_id

In [50]:
# Baseline table: the reviews already loaded for one business inside the date window.
# It is the reference set the incremental step compares a fresh load against.
# CREATE OR REPLACE keeps this cell re-runnable; a plain CREATE TABLE raises
# "table already exists" the second time the cell is executed in the same kernel.
duckdb.sql("""
    CREATE OR REPLACE TABLE review_specific_id AS
    SELECT *
    FROM read_json_auto('yelp_academic_dataset_review.json')
    WHERE business_id = '7ATYjTIgM3jUlt4UM3IypQ'
      AND date BETWEEN '2018-01-01' AND '2019-01-01';
""")

### Ingesting the JSON into a temp table: this makes it easier to query the 5 GB of JSON data, and it runs quickly (about 5 seconds)

In [17]:
# Fresh load of the same business/date slice into a scratch table.
# The table must be named review_temp: every later query in this notebook reads
# from review_temp (the previous name, review_taemp, was a typo and left those
# queries without a table on a clean run).
# CREATE OR REPLACE lets the cell be re-executed without a "table already exists" error.
duckdb.sql("""
    CREATE OR REPLACE TEMP TABLE review_temp AS
    SELECT *
    FROM read_json_auto('yelp_academic_dataset_review.json')
    WHERE business_id = '7ATYjTIgM3jUlt4UM3IypQ'
      AND date BETWEEN '2018-01-01' AND '2019-01-01';
""")

--------------------------------------------

### This is for incremental mode

In [54]:
# Incremental mode only (not used for the backfill): keep the rows of the fresh
# load whose review_id is not yet present in the baseline table.
# NOT EXISTS is used instead of NOT IN because NOT IN evaluates to NULL for every
# row as soon as the subquery contains a single NULL review_id, which would
# silently return zero rows.
duckdb.sql("""
    SELECT *
    FROM review_temp AS fresh
    WHERE NOT EXISTS (
        SELECT 1
        FROM review_specific_id AS loaded
        WHERE loaded.review_id = fresh.review_id
    );
""")

┌───────────┬─────────┬─────────────┬────────┬────────┬───────┬───────┬─────────┬───────────┐
│ review_id │ user_id │ business_id │ stars  │ useful │ funny │ cool  │  text   │   date    │
│  varchar  │ varchar │   varchar   │ double │ int64  │ int64 │ int64 │ varchar │ timestamp │
├───────────┴─────────┴─────────────┴────────┴────────┴───────┴───────┴─────────┴───────────┤
│                                          0 rows                                           │
└───────────────────────────────────────────────────────────────────────────────────────────┘

Because the new JSON contains no new data, we use all of the data here, just as an example of ingesting data into the spreadsheet.

In [18]:
# Pull every row of the scratch table into pandas for the aggregation steps below.
review_relation = duckdb.sql("SELECT * FROM review_temp")
df = review_relation.fetchdf()

In [21]:
# Inspect column dtypes and non-null counts of the loaded reviews.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    4 non-null      object        
 1   user_id      4 non-null      object        
 2   business_id  4 non-null      object        
 3   stars        4 non-null      float64       
 4   useful       4 non-null      int64         
 5   funny        4 non-null      int64         
 6   cool         4 non-null      int64         
 7   text         4 non-null      object        
 8   date         4 non-null      datetime64[us]
dtypes: datetime64[us](1), float64(1), int64(3), object(4)
memory usage: 416.0+ bytes


In [23]:
# Derive a calendar-date column (time of day dropped) to group reviews per day.
review_days = df['date'].dt.date
df['date_format'] = review_days

In [24]:
# Display the reviews together with the derived date_format column.
df

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,date_format
0,OmKVEVJmVPImlQDK_wCn2g,AhLubOI-czV3DeKs9s4yUQ,7ATYjTIgM3jUlt4UM3IypQ,5.0,0,0,0,Seriously kicked my butt. Great bikes and ins...,2018-02-13 21:55:20,2018-02-13
1,gRCKV0tmSf046wio5ocPDw,thlXLsfsScDqsD8Jehc1_g,7ATYjTIgM3jUlt4UM3IypQ,5.0,0,0,0,New to Body Cycle and spinning. I could not ha...,2018-04-10 21:21:44,2018-04-10
2,kaqHqKN30CyOxt2jIniwBQ,wZWB41wmnv4pPsetuIN5Gw,7ATYjTIgM3jUlt4UM3IypQ,5.0,0,0,0,(one day later) Holy hell. My booty hurts. \n\...,2018-04-23 12:45:37,2018-04-23
3,XwivkWLE63_Ya8kBAbGA-w,KTxe33TKY86VJRm41iBm1A,7ATYjTIgM3jUlt4UM3IypQ,5.0,2,0,1,I've been to this studio many times through cl...,2018-02-21 13:56:52,2018-02-21


In [31]:
# Per-day metrics: output column name -> (source column, aggregation).
daily_aggregations = {
    'count_row': ('business_id', 'count'),
    'avg_stars': ('stars', 'mean'),
    'avg_useful': ('useful', 'mean'),
    'avg_funny': ('funny', 'mean'),
    'avg_cool': ('cool', 'mean'),
}

# One row per calendar day; reset_index turns date_format back into a regular column.
reviews_by_day = df.groupby('date_format')
grouped_df = reviews_by_day.agg(**daily_aggregations).reset_index()

In [45]:
# Store the dates as plain strings before the rows are sent to the spreadsheet.
date_labels = grouped_df['date_format'].astype(str)
grouped_df['date_format'] = date_labels

In [46]:
# Display the per-day aggregates that will be written to the spreadsheet.
grouped_df

Unnamed: 0,date_format,count_row,avg_stars,avg_useful,avg_funny,avg_cool
0,2018-02-13,1,5.0,0.0,0.0,0.0
1,2018-02-21,1,5.0,2.0,0.0,1.0
2,2018-04-10,1,5.0,0.0,0.0,0.0
3,2018-04-23,1,5.0,0.0,0.0,0.0


In [48]:
# Authenticate against Google Sheets with a service account and open the target worksheet.
# NOTE(review): these imports sit mid-notebook; consider moving them to the top import cell.
# NOTE(review): oauth2client is deprecated upstream; gspread offers its own
# service-account helper — confirm against the installed gspread version before switching.
import gspread
from oauth2client.service_account import ServiceAccountCredentials


# Scopes requested for the service account (Sheets feed and Drive).
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
# The key file is read from the working directory; keep it out of version control.
creds = ServiceAccountCredentials.from_json_keyfile_name('service-account.json', scope)
client = gspread.authorize(creds)

# The spreadsheet is looked up by title; presumably it must be shared with the
# service account's e-mail address — verify.
spreadsheet = client.open("Azhar Question Number 3 Dealls")  # Replace with your spreadsheet name
worksheet = spreadsheet.sheet1  # first worksheet (tab) of the spreadsheet

In [49]:
# Write the header plus every aggregated day to the top of the worksheet.
# The rows are collected first and sent with a single insert_rows call instead of
# one insert_row call per row: that keeps the load within the Sheets API write
# quota and no longer relies on grouped_df having a 0..n-1 index to compute the
# target row number.
sheet_rows = [grouped_df.columns.tolist()]
for _, daily_record in grouped_df.iterrows():
    row_values = daily_record.tolist()
    print(row_values)
    sheet_rows.append(row_values)

# Header lands on row 1, data rows follow in order; existing content is shifted down.
worksheet.insert_rows(sheet_rows, 1)

print("Data inserted successfully!")

['2018-02-13', 1, 5.0, 0.0, 0.0, 0.0]
['2018-02-21', 1, 5.0, 2.0, 0.0, 1.0]
['2018-04-10', 1, 5.0, 0.0, 0.0, 0.0]
['2018-04-23', 1, 5.0, 0.0, 0.0, 0.0]
Data inserted successfully!


### Successfully inserted for the initial load and incremental mode

---------------------------------------------------------

### Now this is for the backfill

In [74]:
# Recompute the daily aggregates for the single date being backfilled.
backfill_sql = (
    "SELECT date::date date, COUNT(business_id) count_row, AVG(stars) stars, "
    "AVG(useful) useful, AVG(funny) funny, AVG(cool) cool  "
    "FROM review_temp WHERE date::date = '2018-04-10' "
    "GROUP BY date::date;"
)
df_backfill = duckdb.sql(backfill_sql).fetchdf()

In [75]:
df_backfill

Unnamed: 0,date,count_row,stars,useful,funny,cool
0,2018-04-10,1,5.0,0.0,0.0,0.0


In [76]:
# Snapshot of the whole sheet as a list of rows (each row is a list of cell values).
data = worksheet.get_all_values()

# Date key of the row that the backfill will replace.
id_to_delete = '2018-04-10'

# 1-based sheet position of the first row whose first column equals the key,
# or None when no row matches. Rows are numbered from 1 because gspread row
# indices are 1-based. The `row and ...` guard skips empty rows, which would
# otherwise raise IndexError on row[0] (e.g. when the sheet has no content).
row_index_to_delete = next(
    (
        position
        for position, row in enumerate(data, start=1)
        if row and row[0] == id_to_delete
    ),
    None,
)

In [77]:
# Show the 1-based sheet row that matched (None when nothing matched).
row_index_to_delete

5

In [78]:
# Remove the stale row only when the lookup found one; a falsy index means no match.
if not row_index_to_delete:
    print(f"No row found with ID: {id_to_delete}")
else:
    worksheet.delete_rows(row_index_to_delete)
    print(f"Deleted row with ID: {id_to_delete}")

Deleted row with ID: 2018-04-10


In [83]:
# Store the backfill date as a plain string before it is sent to the spreadsheet.
backfill_date_labels = df_backfill['date'].astype(str)
df_backfill['date'] = backfill_date_labels

In [84]:
# Write each recomputed daily aggregate back to the sheet, one row per API call.
# NOTE(review): append_row is expected to add the row after the existing data, so the
# backfilled date would no longer sit in date order — confirm that is acceptable.
for _, backfill_record in df_backfill.iterrows():
    new_row = backfill_record.tolist()
    worksheet.append_row(new_row)

### Backfill completed successfully