<a href="https://colab.research.google.com/github/carlosbarrone/public_notebooks/blob/dev/unsupervised_text_classification_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
try:
  from tqdm import tqdm
except Exception as e:
  print(f'tqdmNotInstalled_{e}')
  !pip install tqdm
  from tqdm import tqdm

In [2]:
import gzip
import json
import gc
import re
import pickle

import pandas as pd

from collections import defaultdict
from google.colab import drive
from datetime import date

In [3]:
# Mount Google Drive at /content/drive; DATA_FOLDER_PATH points inside this mount.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Folder on the mounted Drive that holds the raw gzip dumps and the cached CSV/pickle files.
DATA_FOLDER_PATH = '/content/drive/MyDrive/Colab Notebooks/data'

# Expected record counts; passed to tqdm as `total` so the progress bars can show a percentage.
TOTAL_REVIEWS = 5_748_920
TOTAL_PRODUCTS = 1_503_384

In [16]:
# Running tally: cleaned category token -> number of products that mention it.
# Updated as a side effect of every process_categories() call.
all_categories = defaultdict(int)

def process_categories(categories: list[list[str]]) -> str:
  """Flatten a product's nested category lists into one normalised string.

  Each category name is lower-cased, stripped, and reduced to letters,
  digits and spaces. Duplicates within the product are dropped, and every
  distinct cleaned name increments its counter in the module-level
  ``all_categories`` tally exactly once per call.

  Args:
    categories: List of category paths, each a list of category names.
      ``None`` or an empty list is treated as "no categories".

  Returns:
    The distinct cleaned names in sorted order, joined by single spaces
    (runs of whitespace are collapsed). Empty string when there are none.
  """
  if not categories:
    return ''
  cleaned = {
      re.sub('[^A-Za-z0-9 ]+', '', name.lower().strip())
      for path in categories
      for name in path
  }
  # A name made only of punctuation cleans down to '' and is not a category.
  cleaned.discard('')
  # Sort so the output does not depend on set iteration order, which
  # varies between interpreter runs for strings.
  ordered = sorted(cleaned)
  for name in ordered:
    all_categories[name] += 1
  # split()/join collapses the double spaces left behind by removed punctuation.
  return ' '.join(' '.join(ordered).split())

class Review:
  """Flat, DataFrame-friendly view of one raw review record.

  Every attribute is always set; a field missing from the record becomes
  ``None`` instead of leaving the object partially initialised, so all
  rows produced by ``to_dict`` share the same keys.

  Args:
    props: Parsed JSON object for one review. Keys read: ``reviewerID``,
      ``asin``, ``reviewerName``, ``helpful``, ``reviewText``, ``overall``,
      ``summary``, ``unixReviewTime``.
    index: Caller-supplied row identifier stored as ``idx``.
  """

  def __init__(self, props: dict, index = False) -> None:
    # `helpful` is read as a two-element sequence
    # (presumably [helpful votes, total votes] -- confirm against the dataset docs).
    helpful = props.get('helpful')
    if not isinstance(helpful, (list, tuple)):
      helpful = ()

    # Assignment order is kept stable because it defines the column order
    # of the DataFrame built from to_dict().
    self.idx = index
    self.reviewer_id = props.get('reviewerID')
    self.asin = props.get('asin')
    self.reviewer_name = props.get('reviewerName')
    self.helpful_0 = helpful[0] if len(helpful) > 0 else None
    self.helpful_1 = helpful[1] if len(helpful) > 1 else None
    self.review_text = props.get('reviewText')
    self.overall = props.get('overall')
    self.summary = props.get('summary')
    self.unix_review_time = props.get('unixReviewTime')
    self.review_time = None
    if self.unix_review_time is not None:
      try:
        # date.fromtimestamp interprets the timestamp in the local timezone.
        self.review_time = date.fromtimestamp(self.unix_review_time)
      except (TypeError, ValueError, OverflowError, OSError) as e:
        # Malformed or out-of-range timestamp: keep the row, leave the date empty.
        print(e)

  def to_dict(self):
    """Return the attributes as a new dict (a copy, not the live ``__dict__``)."""
    return dict(self.__dict__)

class Product_Metadata:
  """Flat, DataFrame-friendly view of one raw product-metadata record.

  Every attribute is always set; absent fields become ``None``.

  Args:
    props: Parsed JSON object for one product. Keys read: ``categories``,
      ``salesRank``, ``asin``, ``brand``, ``title``, ``price``.
    index: Caller-supplied row identifier stored as ``idx``.
  """

  def __init__(self, props: dict, index = False) -> None:
    # process_categories is defined elsewhere in this notebook. Any error it
    # raises propagates to the caller, as it did before this rewrite.
    cat = process_categories(props.get('categories'))

    # salesRank is read as a mapping of category name -> rank; only its
    # first entry is kept. Anything else (missing, empty, not a dict) is
    # recorded as "no rank" rather than producing an empty row.
    sr = props.get('salesRank')
    if isinstance(sr, dict) and sr:
      first_category, first_rank = next(iter(sr.items()))
      self.primary_category = re.sub('[^A-Za-z0-9 ]', '', first_category)
      self.sales_rank = first_rank
    else:
      self.primary_category = None
      self.sales_rank = None

    # Assignment order defines the column order of the resulting DataFrame.
    self.idx = index
    self.asin = props.get('asin')
    self.brand = props.get('brand')
    self.title = props.get('title')
    self.price = props.get('price')
    self.categories = cat

  def to_dict(self):
    """Return the instance attribute dictionary (one DataFrame row)."""
    return self.__dict__

def build_reviews_df(load_path: str, save_path: str = False)->pd.DataFrame:
  """Parse a gzip file of one-JSON-object-per-line reviews into a DataFrame.

  Args:
    load_path: Path of the gzip-compressed input file.
    save_path: Optional CSV path. When truthy, the finished frame is also
      written there (without the index); a write failure is reported but
      does not discard the frame.

  Returns:
    One row per successfully parsed line. If the run is interrupted with
    Ctrl-C / "interrupt execution" while loading, the rows parsed so far
    are returned and nothing is written to ``save_path``.
  """
  reviews = []
  try:
    with gzip.open(load_path, 'r') as file:
      print('Loading data...')
      for idx, r in enumerate(tqdm(file, total=TOTAL_REVIEWS)):
        try:
          reviews.append(Review(json.loads(r), idx).to_dict())
        except Exception as e:
          # A bad line is reported and skipped; loading continues.
          print(e,type(e))
  except KeyboardInterrupt:
    # The handler wraps the whole loop so an interrupt that lands while
    # gzip/tqdm fetch the next line also yields the partial result.
    return pd.DataFrame(reviews)
  print('Loading data done.')
  print('Building Data Frame...')
  final_df: pd.DataFrame = pd.DataFrame(reviews)
  print('Building Data Frame done.')
  if save_path:
    print('Writing CSV to drive...')
    try:
      final_df.to_csv(save_path, index=False)
      print('Data Frame written to google drive.')
    except Exception as e:
      print('build_reviews_df:ERROR_WRITING_FILE:',e)
  return final_df

def build_product_metadata_df(load_path: str, save_path: str = False)->pd.DataFrame:
  """Parse a gzip file of one-JSON-object-per-line product metadata into a DataFrame.

  Args:
    load_path: Path of the gzip-compressed input file.
    save_path: Optional CSV path. When truthy, the finished frame is also
      written there (without the index); a write failure is reported but
      does not discard the frame.

  Returns:
    One row per successfully parsed line. If the run is interrupted with
    Ctrl-C / "interrupt execution" while loading, the rows parsed so far
    are returned and nothing is written to ``save_path``.
  """
  products = []
  try:
    with gzip.open(load_path, 'r') as file:
      print('Loading data...')
      for idx, p in enumerate(tqdm(file, total=TOTAL_PRODUCTS)):
        try:
          products.append(Product_Metadata(json.loads(p), idx).to_dict())
        except Exception as e:
          # A bad line is reported and skipped; loading continues.
          print(e,type(e))
  except KeyboardInterrupt:
    # The handler wraps the whole loop so an interrupt that lands while
    # gzip/tqdm fetch the next line also yields the partial result.
    return pd.DataFrame(products)
  print('Loading data done.')
  print('Building Data Frame...')
  final_df: pd.DataFrame = pd.DataFrame(products)
  print('Building Data Frame done.')
  if save_path:
    print('Writing CSV to drive...')
    try:
      final_df.to_csv(save_path, index=False)
      print('Data Frame written to google drive.')
    except Exception as e:
      # Label names this function (it previously carried the reviews builder's name).
      print('build_product_metadata_df:ERROR_WRITING_FILE:',e)
  return final_df

In [6]:
# Load the cached reviews CSV; on a cache miss, rebuild it from the raw
# gzip dump (which also writes the CSV for next time).
reviews_base_path = f'{DATA_FOLDER_PATH}/reviews_Clothing_Shoes_and_Jewelry'
try:
  print('Reading from Google Drive...')
  reviews_df = pd.read_csv(f'{reviews_base_path}.csv')
except FileNotFoundError:
  print('File does not exist, building reviews Data Frame...')
  reviews_df = build_reviews_df(
      f'{reviews_base_path}.json.gz',
      f'{reviews_base_path}.csv',
  )
except Exception as e:
  print(e,type(e))
  # Re-raise: continuing would leave `reviews_df` undefined and the failure
  # would only show up later as an unrelated NameError.
  raise

Reading from Google Drive...


In [17]:
# Load the cached product-metadata CSV; on a cache miss, rebuild it from the
# raw gzip dump (which also writes the CSV for next time).
products_base_path = f'{DATA_FOLDER_PATH}/metadata_Clothing_Shoes_and_Jewelry'
try:
  print('Reading from Google Drive...')
  products_df = pd.read_csv(f'{products_base_path}.csv')
except FileNotFoundError:
  print('File does not exist, building products Data Frame...')
  products_df = build_product_metadata_df(
      f'{products_base_path}.jsonl.gz',
      f'{products_base_path}.csv',
  )
except Exception as e:
  print(e,type(e))
  # Re-raise: continuing would leave `products_df` undefined and the failure
  # would only show up later as an unrelated NameError.
  raise

Reading from Google Drive...
File does not exist, building reviews Data Frame...
Loading data...


100%|██████████| 1503384/1503384 [01:23<00:00, 17980.95it/s]


Loading data done.
Building Data Frame...
Building Data Frame done.
Writing CSV to drive...
Data Frame written to google drive.


In [19]:
# Restore the category tally from its pickle cache, or create the cache from
# the tally collected while building the product metadata.
categories_pickle_path = f'{DATA_FOLDER_PATH}/metadata_all_categories_clothing_shoes_jewelry.pickle'
try:
  with open(categories_pickle_path, 'rb') as f:
    # NOTE: pickle.load can execute arbitrary code; only load files written
    # by this notebook.
    # The loaded counts are kept (the result used to be discarded).
    all_categories = defaultdict(int, pickle.load(f))
except FileNotFoundError:
  if all_categories:
    print('File does not exist, writing category counts...')
    with open(categories_pickle_path, 'wb') as f:
      pickle.dump(dict(all_categories), f, protocol=pickle.HIGHEST_PROTOCOL)
  else:
    # The tally is only filled while product metadata is parsed from the raw
    # dump; when products came from the CSV cache it is empty, and writing
    # it would persist an empty cache.
    print('File does not exist and no category counts were collected; rebuild the product metadata to create it.')

In [83]:
# Count the reviews that belong to Nike products with a sales rank of 3000 or better.
NIKE_KEYWORD = 'nike'
NIKE_MAX_SALES_RANK = 3000

# na=False: products without categories (NaN after a CSV round trip) count as
# "not Nike", so the mask is plain boolean. regex=False: literal substring match.
is_nike = products_df.categories.str.contains(NIKE_KEYWORD, na=False, regex=False)
is_top_ranked = products_df.sales_rank <= NIKE_MAX_SALES_RANK
nike_top_3000_asins = list(products_df.loc[is_nike & is_top_ranked, 'asin'].unique())
reviews_df.asin.isin(nike_top_3000_asins).sum()

3549