This notebook explores cold-start methodologies in recommender systems, using the Yelp dataset.

In [20]:
import pandas as pd
import os
import tarfile
import json

from sqlalchemy import create_engine, MetaData, Table, Column, String, Integer, Numeric, DateTime

from sqlalchemy.dialects.mysql import \
        CHAR, DATE, DATETIME, DECIMAL, \
        MEDIUMTEXT, NUMERIC, VARCHAR, YEAR


# Import JSON and add to database

In [2]:
# Show what is in the raw data directory before unpacking anything.
os.listdir(path='data')

['yelp_dataset.tar', '.DS_Store', 'yelp_dataset']

In [3]:
# Unpack the Yelp archive once; later runs reuse the already-extracted directory.
if not os.path.exists('data/yelp_dataset'):
    # The context manager closes the archive handle even if extraction fails
    # part-way (the handle was previously left open).
    with tarfile.open(os.path.join('data', 'yelp_dataset.tar')) as tar:
        tar.extractall('data/yelp_dataset')
else:
    print("Destination already exists.")

Destination already exists.


In [4]:
# Directory containing the extracted Yelp JSON files; the loaders below build
# their file paths from this constant.
DATA_PATH = 'data/yelp_dataset'
os.listdir(path=DATA_PATH)

['yelp_academic_dataset_checkin.json',
 'Dataset_User_Agreement.pdf',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_user.json']

# Create Database

In [32]:
# Migrate to MySQL for faster queries.
# The connection URL can be overridden with the YELP_DB_URL environment
# variable so that real credentials never have to be saved in the notebook.
# The fallback is the original local-development URL.
DB_URL = os.environ.get('YELP_DB_URL', 'mysql+pymysql://root:password@localhost/yelp')
engine = create_engine(DB_URL)
#engine = create_engine("sqlite:///database")

# Bind the metadata to the engine and load the tables that already exist in
# the database.
# NOTE(review): passing the engine positionally to MetaData relies on "bound
# metadata", which SQLAlchemy 2.0 removed -- confirm the pinned version is < 2.0.
metadata = MetaData(engine)
metadata.reflect()

In [27]:
# Display the tables currently registered on `metadata` (empty when nothing
# was reflected and no Table has been declared yet).
metadata.tables

FacadeDict({})

In [24]:
# Start Over: drop every table registered on `metadata`.
# Guarded so that "Restart & Run All" does not silently wipe the long-running
# review import. Set RESET_DATABASE to True only when a clean slate is wanted.
RESET_DATABASE = False
if RESET_DATABASE:
    metadata.drop_all()

## Reviews
The reviews file is too large to hold in memory, so we need to write it to the database incrementally instead of loading it all at once.

In [8]:
# Kept for reference only: the review file is too large, and loading it in one call runs out of memory.
# review_df = pd.read_json(os.path.join(DATA_PATH, 'yelp_academic_dataset_review.json'), lines=True)

In [33]:
# Explicitly create the review table with MySQL-specific column types.
#
# extend_existing=True lets this cell run even when `metadata.reflect()` has
# already loaded a `review` table from the database (or the cell is re-run);
# without it SQLAlchemy raises InvalidRequestError because the name is already
# registered. The columns declared here take precedence over reflected ones.
review_table = Table('review',
                     metadata,
                     Column('review_id', VARCHAR(256), primary_key=True),  # one row per review
                     Column('user_id', VARCHAR(256)),
                     Column('business_id', VARCHAR(256)),
                     Column('stars', NUMERIC),
                     Column('useful', NUMERIC),
                     Column('funny', NUMERIC),
                     Column('cool', NUMERIC),
                     Column('text', MEDIUMTEXT),  # free-form review body
                     Column('date', DATETIME),
                     extend_existing=True
                    )

In [34]:
# Create in the database the tables declared on `metadata` (here: `review`).
metadata.create_all()

In [12]:
# Import reviews to database (single-record trial of the JSON -> SQL path)

# Only import if the table doesn't exist.
# NOTE(review): this tests the in-memory MetaData registry, not the database
# itself, so it is False whenever a `review` Table has already been declared
# or reflected in this session -- confirm that this is the intended guard.
if 'review' not in metadata.tables:
    review_file = os.path.join(DATA_PATH, 'yelp_academic_dataset_review.json')

    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(review_file, 'r', encoding='utf-8') as f:
        # Each line of the file is parsed as one standalone JSON object.
        record = json.loads(f.readline())

    # One-row frame built from the parsed record. This previously referenced
    # the undefined name `lines`, which raises NameError on a fresh kernel.
    line_df = pd.DataFrame([record])

    # index=False: the DataFrame index is not review data and must not be
    # written as an extra `index` column.
    line_df.to_sql('review', engine, if_exists='append', index=False)

In [13]:
# Explicitly create the review table using SQLAlchemy's generic column types
# (String / Numeric / DateTime) rather than the MySQL-specific ones.
# NOTE(review): `review` is already registered on this `metadata` by the
# earlier MySQL-typed definition, so this cell raises InvalidRequestError (see
# the output below). It looks like a leftover from the SQLite variant of the
# notebook -- confirm whether it should be removed or given
# `extend_existing=True`.
review_table = Table('review', 
                     metadata,
                     Column('review_id', String),
                     Column('user_id', String),
                     Column('business_id', String),
                     Column('stars', Numeric),
                     Column('useful', Numeric),
                     Column('funny', Numeric),
                     Column('cool', Numeric),
                     Column('text', String),
                     Column('date', DateTime)
                    )

InvalidRequestError: Table 'review' is already defined for this MetaData instance.  Specify 'extend_existing=True' to redefine options and columns on an existing Table object.

In [14]:
# Import reviews to database in fixed-size batches.
# The review file is too large to load at once, so it is streamed line by line
# and flushed to SQL every REVIEW_BATCH_SIZE records. This takes a long time.
REVIEW_BATCH_SIZE = 10 ** 6  # parsed records held in memory per insert


def _write_review_batch(records):
    """Append one batch of parsed review dicts to the `review` table.

    Returns the DataFrame that was written so the caller can inspect it.
    """
    batch_df = pd.DataFrame(records)
    # index=False: the DataFrame index is not review data and must not be
    # written as an extra `index` column.
    batch_df.to_sql('review', engine, if_exists='append', index=False)
    return batch_df


review_file = os.path.join(DATA_PATH, 'yelp_academic_dataset_review.json')

line_df = pd.DataFrame()  # stays empty if the file contains no records
lines = []

# JSON text is UTF-8; do not depend on the platform default encoding.
with open(review_file, 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines instead of failing in json.loads
        lines.append(json.loads(line))

        if len(lines) >= REVIEW_BATCH_SIZE:
            line_df = _write_review_batch(lines)
            lines = []

# Flush the final partial batch. Skipped when the file ended exactly on a batch
# boundary, so an empty DataFrame is never sent to the database.
if lines:
    line_df = _write_review_batch(lines)

KeyboardInterrupt: 

In [None]:
# Preview the most recently built batch; head() keeps the output small.
line_df.head()

## Business

In [None]:
# Load the business file (one JSON object per line) into a DataFrame in one call.
business_path = os.path.join(DATA_PATH, 'yelp_academic_dataset_business.json')
business_df = pd.read_json(business_path, lines=True)

In [None]:
# (rows, columns) of the loaded business table.
business_df.shape

In [None]:
# Peek at the first rows to see which columns are available.
business_df.head(n=5)

In [None]:
# Count the businesses whose `categories` string mentions "Italian".
is_italian = business_df['categories'].str.contains('Italian')
is_italian.sum()