# Importing and Checking Data

## Packages

In [1]:
import pandas as pd

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import json

/kaggle/input/yelp-dataset/Dataset_User_Agreement.pdf
/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_checkin.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json
/kaggle/input/yelp-dataset/yelp_academic_dataset_user.json


## Import Data and Specify Chunksize:

Here we set nrows (i.e. chunksize) = 5000

In [2]:
checkin = pd.read_json('/kaggle/input/yelp-dataset/yelp_academic_dataset_checkin.json', lines=True, nrows = 5000)
review = pd.read_json('/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json', lines=True, nrows = 5000)
business = pd.read_json('/kaggle/input/yelp-dataset/yelp_academic_dataset_business.json', lines=True, nrows = 5000)
tip = pd.read_json('/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json', lines=True, nrows = 5000)
user= pd.read_json('/kaggle/input/yelp-dataset/yelp_academic_dataset_user.json', lines=True, nrows = 5000)

## Check in 

In [3]:
checkin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   business_id  5000 non-null   object
 1   date         5000 non-null   object
dtypes: object(2)
memory usage: 78.2+ KB


In [4]:
checkin.head()

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


Add checkin_id so it could be set as PK in SQL:

In [5]:
checkin['checkin_id'] = range(1, len(checkin) + 1)

In [6]:
earliest_date = str(checkin['date'].min())[:10]
latest_date = checkin['date'].max()

print("Check-in table earliest date:", earliest_date)
print("Check-in table latest date:", latest_date)

Check-in table earliest date: 2010-01-16
Check-in table latest date: 2022-01-16 16:19:09


## Review

In [7]:
review.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30
3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,5,1,0,1,"Wow! Yummy, different, delicious. Our favo...",2015-01-04 00:01:03
4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,4,1,0,1,Cute interior and owner (?) gave us tour of up...,2017-01-14 20:54:15


In [8]:
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   review_id    5000 non-null   object        
 1   user_id      5000 non-null   object        
 2   business_id  5000 non-null   object        
 3   stars        5000 non-null   int64         
 4   useful       5000 non-null   int64         
 5   funny        5000 non-null   int64         
 6   cool         5000 non-null   int64         
 7   text         5000 non-null   object        
 8   date         5000 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 351.7+ KB


In [9]:
earliest_date = review['date'].min()
latest_date = review['date'].max()

print("Review table earliest date:", earliest_date)
print("Review table latest date:", latest_date)

Review table earliest date: 2005-03-12 03:47:06
Review table latest date: 2018-10-04 18:10:01


## Business

In [10]:
business.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [11]:
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   5000 non-null   object 
 1   name          5000 non-null   object 
 2   address       5000 non-null   object 
 3   city          5000 non-null   object 
 4   state         5000 non-null   object 
 5   postal_code   5000 non-null   object 
 6   latitude      5000 non-null   float64
 7   longitude     5000 non-null   float64
 8   stars         5000 non-null   float64
 9   review_count  5000 non-null   int64  
 10  is_open       5000 non-null   int64  
 11  attributes    4559 non-null   object 
 12  categories    4995 non-null   object 
 13  hours         4218 non-null   object 
dtypes: float64(3), int64(2), object(9)
memory usage: 547.0+ KB


Here we adjust datatype of some columns so that the data format is consistent:

In [12]:
if 'attributes' in business.columns:
    business['attributes'] = business['attributes'].astype(str)
else:
    print("Warning: 'attributes' column not found in the data.")

if 'hours' in business.columns:
    business['hours'] = business['hours'].astype(str)
else:
    print("Warning: 'hours' column not found in the data.")

## Tip

Add tip_id so it could be set as PK in SQL:

In [13]:
tip['tip_id'] = range(1, len(tip) + 1)

In [14]:
tip.head()

Unnamed: 0,user_id,business_id,text,date,compliment_count,tip_id
0,AGNUgVwnZUey3gcPCJ76iw,3uLgwr0qeCNMjKenHJwPGQ,Avengers time with the ladies.,2012-05-18 02:17:21,0,1
1,NBN4MgHP9D3cw--SnauTkA,QoezRbYQncpRqyrLH6Iqjg,They have lots of good deserts and tasty cuban...,2013-02-05 18:35:10,0,2
2,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,It's open even when you think it isn't,2013-08-18 00:56:08,0,3
3,FjMQVZjSqY8syIO-53KFKw,hV-bABTK-glh5wj31ps_Jw,Very decent fried chicken,2017-06-27 23:05:38,0,4
4,ld0AperBXk1h6UbqmM80zw,_uN0OudeJ3Zl_tf6nxg5ww,Appetizers.. platter special for lunch,2012-10-06 19:43:09,0,5


In [15]:
tip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   user_id           5000 non-null   object        
 1   business_id       5000 non-null   object        
 2   text              5000 non-null   object        
 3   date              5000 non-null   datetime64[ns]
 4   compliment_count  5000 non-null   int64         
 5   tip_id            5000 non-null   int64         
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 234.5+ KB


In [16]:
earliest_date = tip['date'].min()
latest_date = tip['date'].max()

print("Tip table earliest date:", earliest_date)
print("Tip table latest date:", latest_date)

Tip table earliest date: 2009-04-24 04:59:59
Tip table latest date: 2018-05-04 22:32:49


## User

In [17]:
user.head()

Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,585,2007-01-25 16:47:26,7217,1259,5994,2007,"NSCy54eWehBJyZdG2iE84w, pe42u7DcCH2QmI81NX-8qA...",267,...,65,55,56,18,232,844,467,467,239,180
1,j14WgRoU_-2ZE1aw1dXrJg,Daniel,4333,2009-01-25 04:35:42,43091,13066,27281,"2009,2010,2011,2012,2013,2014,2015,2016,2017,2...","ueRPE0CX75ePGMqOFVj6IQ, 52oH4DrRvzzl8wh5UXyU0A...",3138,...,264,184,157,251,1847,7054,3131,3131,1521,1946
2,2WnXYQFK0hXEoTxPtV2zvg,Steph,665,2008-07-25 10:41:00,2086,1010,1003,20092010201120122013,"LuO3Bn4f3rlhyHIaNfTlnA, j9B4XdHUhDfTKVecyWQgyA...",52,...,13,10,17,3,66,96,119,119,35,18
3,SZDeASXq7o05mMNLshsdIA,Gwen,224,2005-11-29 04:38:33,512,330,299,200920102011,"enx1vVPnfdNUdPho6PH_wg, 4wOcvMLtU6a9Lslggq74Vg...",28,...,4,1,6,2,12,16,26,26,10,9
4,hA5lMy-EnncsH4JoR-hFGQ,Karen,79,2007-01-05 19:40:59,29,15,7,,"PBK4q9KEEBHhFvSXCUirIw, 3FWPpM7KU1gXeOM_ZbYMbA...",1,...,1,0,0,0,1,1,0,0,0,0


In [18]:
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   user_id             5000 non-null   object 
 1   name                5000 non-null   object 
 2   review_count        5000 non-null   int64  
 3   yelping_since       5000 non-null   object 
 4   useful              5000 non-null   int64  
 5   funny               5000 non-null   int64  
 6   cool                5000 non-null   int64  
 7   elite               5000 non-null   object 
 8   friends             5000 non-null   object 
 9   fans                5000 non-null   int64  
 10  average_stars       5000 non-null   float64
 11  compliment_hot      5000 non-null   int64  
 12  compliment_more     5000 non-null   int64  
 13  compliment_profile  5000 non-null   int64  
 14  compliment_cute     5000 non-null   int64  
 15  compliment_list     5000 non-null   int64  
 16  compli

In [19]:
earliest_date = user['yelping_since'].min()
latest_date = user['yelping_since'].max()

print("User table earliest date:", earliest_date)
print("User table latest date:", latest_date)

User table earliest date: 2005-01-03 18:01:35
User table latest date: 2013-07-30 17:49:45


# Connecting to Azure SQL Server

In [21]:
!pip install PyMySql
!pip install pyodbc
import sqlalchemy
from sqlalchemy import create_engine, types, inspect



In [22]:
!apt-get update
!apt-get install -y unixodbc unixodbc-dev

# Add Microsoft's signing key and repository for SQL Server ODBC Driver
!curl https://packages.microsoft.com/keys/microsoft.asc | apt-key add -
!curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list | tee /etc/apt/sources.list.d/mssql-release.list

# Update the package list again after adding Microsoft's repository
!apt-get update

# Install Microsoft SQL Server ODBC Driver 17
!ACCEPT_EULA=Y apt-get install -y msodbcsql17

Get:1 https://packages.cloud.google.com/apt gcsfuse-focal InRelease [1227 B]
Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [128 kB]      
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease                         
Get:4 https://packages.cloud.google.com/apt cloud-sdk InRelease [1618 B]       
Get:5 https://packages.cloud.google.com/apt gcsfuse-focal/main amd64 Packages [28.6 kB]
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [128 kB]
Get:7 https://packages.cloud.google.com/apt cloud-sdk/main amd64 Packages [3375 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [4105 kB]
Get:9 https://packages.cloud.google.com/apt cloud-sdk/main all Packages [1564 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal-backports InRelease [128 kB]     
Get:11 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1275 kB]
Get:12 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [4081 kB]
Get:13 htt

### Connection Details

In [23]:
username = 'yh69-admin'
password = 'nimda-96hy'
server = 'yh69-server.database.windows.net'
database = 'badm554minicase1'

### Setup Connection

In [24]:
connection_string = f"mssql+pyodbc://{username}:{password}@{server}:1433/{database}?driver=ODBC+Driver+17+for+SQL+Server"
engine = create_engine(connection_string)

### Test Connection

In [26]:
try:
    with engine.connect() as connection:
        print("Connection success!")
except Exception as e:
    print("Failed to connect:", e)

Connection success!


## to.sql()

### Business

In [27]:
business.to_sql(
    name="business",
    con=engine,
    if_exists='replace',
    index=True,
    dtype={'attributes': sqlalchemy.types.NVARCHAR, 'hours': sqlalchemy.types.NVARCHAR}
)

135

### Review

In [28]:
review.to_sql(
    name="review",
    con=engine,
    if_exists='replace',
    index=True
)

193

### Checkin

In [29]:
checkin.to_sql(
    name="checkin",
    con=engine,
    if_exists='replace',
    index=True
)

284

### Tip

In [30]:
tip.to_sql(
    name="tip",
    con=engine,
    if_exists='replace',
    index=True
)

216

### User

In [31]:
user.to_sql(
    name="user",
    con=engine,
    if_exists='replace',
    index=True
)

86

## Execute SQL Syntax

In [32]:
!pip install pyodbc sqlalchemy ipython-sql



In [33]:
%load_ext sql

%sql $connection_string

## Check and Set PKs/FKs

* In this part, to ensure that every table in the database is well-structured, we need to examine each table to check for PKs and FKs

p.s: In this part, if setting is not working in the notebook, you should directly execute these codes **in Azure SQL Server**

First, Check all tables to see FKs and PKs:

In [34]:
inspector = inspect(engine)

tables = inspector.get_table_names()

for table in tables:
    print(f"Table: {table}")
    
    pk = inspector.get_pk_constraint(table)
    print("Primary Key(s):", pk['constrained_columns'])
    
    fks = inspector.get_foreign_keys(table)
    print("Foreign Keys:", fks)

Table: business
Primary Key(s): []
Foreign Keys: []
Table: checkin
Primary Key(s): []
Foreign Keys: []
Table: review
Primary Key(s): []
Foreign Keys: []
Table: tip
Primary Key(s): []
Foreign Keys: []
Table: user
Primary Key(s): []
Foreign Keys: []


Please note that these are the very default settings, so it’s normal for them to be empty (unless you have already configured them in Azure)

The following code can be executed in: **Azure Database - Query editor (preview) - Login - Query**

### Business

In [35]:
%%sql
ALTER TABLE business
DROP CONSTRAINT IF EXISTS PK_business; 

--set pk:
ALTER TABLE business
ALTER COLUMN business_id VARCHAR(50) NOT NULL; 

--set fk:
ALTER TABLE business
ADD CONSTRAINT PK_business PRIMARY KEY (business_id); 

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
Done.
Done.
Done.


[]

### Checkin

In [37]:
%%sql
ALTER TABLE checkin
ALTER COLUMN checkin_id INT NOT NULL;
ALTER TABLE checkin
ALTER COLUMN business_id VARCHAR(50) NULL;

--set pk:
ALTER TABLE checkin
ADD CONSTRAINT PK_checkin PRIMARY KEY (checkin_id);

--set fk:
ALTER TABLE checkin
ADD CONSTRAINT FK_checkin_business FOREIGN KEY (business_id) REFERENCES business(business_id);

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
Done.
Done.
Done.
(pyodbc.ProgrammingError) ('42000', "[42000] [Microsoft][ODBC Driver 17 for SQL Server][SQL Server]There are no primary or candidate keys in the referenced table 'business' that match the referencing column list in the foreign key 'FK_checkin_business'. (1776) (SQLExecDirectW)")
[SQL: --set fk:
ALTER TABLE checkin
ADD CONSTRAINT FK_checkin_business FOREIGN KEY (business_id) REFERENCES business(business_id);]
(Background on this error at: https://sqlalche.me/e/20/f405)


### Review

In [38]:
%%sql
ALTER TABLE review
ALTER COLUMN review_id VARCHAR(50) NOT NULL;
ALTER TABLE review
ALTER COLUMN business_id VARCHAR(50) NULL;
ALTER TABLE review
ALTER COLUMN user_id VARCHAR(50) NULL;

--set pk
ALTER TABLE review
ADD CONSTRAINT PK_review PRIMARY KEY (review_id);

--set fk
ALTER TABLE review
ADD CONSTRAINT FK_review_business FOREIGN KEY (business_id) REFERENCES business(business_id);
ALTER TABLE review
ADD CONSTRAINT FK_review FOREIGN KEY (user_id) REFERENCES [user](user_id);

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
Done.
Done.
Done.
Done.
Done.


[]

### Tips

In [39]:
%%sql
ALTER TABLE tip
ALTER COLUMN tip_id INT NOT NULL;
ALTER TABLE tip
ALTER COLUMN business_id VARCHAR(50) NULL;
ALTER TABLE tip
ALTER COLUMN user_id VARCHAR(50) NULL;

--set pk:
ALTER TABLE tip
ADD CONSTRAINT PK_tip PRIMARY KEY (tip_id);

--set fk:
ALTER TABLE tip
ADD CONSTRAINT FK_tip FOREIGN KEY (business_id) REFERENCES business(business_id);

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
Done.
Done.
Done.
Done.
Done.


[]

### User

In [40]:
%%sql
ALTER TABLE [user]
ALTER COLUMN user_id VARCHAR(50) NOT NULL;

--set pk:
ALTER TABLE [user]
ADD CONSTRAINT PK_user PRIMARY KEY (user_id);

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
Done.
Done.


[]

To check if the PKs and FKs are being set properly:

In [43]:
%%sql
SELECT kcu.COLUMN_NAME AS ColumnName
FROM INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc
JOIN INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu
ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
WHERE tc.TABLE_NAME = 'business' AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY';

 * mssql+pyodbc://yh69-admin:***@yh69-server.database.windows.net:1433/badm554minicase1?driver=ODBC+Driver+17+for+SQL+Server
0 rows affected.


ColumnName
