Cleaning up and simplifying the dataframe to reduce memory and runtime

In [None]:
# If not already installed, do: pip install pandas fastparquet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import dask.dataframe as ddf
from dask.diagnostics import ProgressBar

In [None]:

# Remote parquet lookup tables (item and premise metadata).
URL_LOOKUP = 'https://storage.googleapis.com/dosm-public-pricecatcher/lookup_item.parquet'
URL_PREMISE = 'https://storage.googleapis.com/dosm-public-pricecatcher/lookup_premise.parquet'
# Local JSON manifest that is opened and parsed to find the price parquet URLs.
url_file = 'pricecatcher/pricecatcher/price_urls.json'

# Numeric code for each state, numbered 1..14 in alphabetical order.
# Every state must map to a distinct code.
malaysia_states = {
    "Johor": 1,
    "Kedah": 2,
    "Kelantan": 3,
    "Melaka": 4,
    "Negeri Sembilan": 5,
    "Pahang": 6,
    "Perak": 7,
    "Perlis": 8,
    "Pulau Pinang": 9,
    "Sabah": 10,
    "Sarawak": 11,
    "Selangor": 12,
    "Terengganu": 13,  # was 14, which collided with Wilayah Persekutuan
    "Wilayah Persekutuan": 14,
}

In [None]:
# Build one Dask DataFrame (`price`) from every parquet file listed in the
# JSON manifest, then join it with the premise lookup table.
price_dfs = []

# The manifest is a list of entries, each carrying a 'parquet_files' list of URLs.
with open(url_file, 'r') as json_file: 
    price_urls_data = json.load(json_file)

for entry in price_urls_data:
        parquet_urls = entry['parquet_files']
        for url in parquet_urls:
            # NOTE(review): confirm that `npartitions` is an argument accepted by
            # read_parquet in the installed dask version.
            df = ddf.read_parquet(url, blocksize = '1GB', npartitions = 8)
            if 'date' in df.columns: df['date'] = ddf.to_datetime(df['date'])
            # Keeps a row when EITHER code differs from -1, i.e. only rows where
            # both codes are -1 are removed.
            # NOTE(review): if -1 marks an invalid code in either column, `&` may
            # have been intended here -- confirm against the data.
            df = df[(df['item_code'] != -1) | (df['premise_code'] != -1)]
            price_dfs.append(df)

# Stack the per-file frames into a single frame.
price = ddf.concat(price_dfs, ignore_index = True)

# Load both lookup tables and drop any row containing a missing value.
premise = ddf.read_parquet(URL_PREMISE, npartitions = 8)
lookup = ddf.read_parquet(URL_LOOKUP, npartitions = 8)
premise = premise.dropna()
lookup = lookup.dropna()
if 'date' in lookup.columns: lookup['date'] = ddf.to_datetime(lookup['date'])
if 'date' in premise.columns: premise['date'] = ddf.to_datetime(premise['date'])

#premise = premise.drop(columns = ['premise_type', 'address'])
# Cast the join key to the same dtype on both sides before merging.
price['premise_code'] = price['premise_code'].astype('int32')
premise['premise_code'] = premise['premise_code'].astype('int32')
# Left join with `premise` on the left: every premise row is kept, including
# premises that have no matching price rows.
# NOTE(review): confirm this direction is intended rather than price-left.
price = premise.merge(price, on='premise_code',how='left',indicator=False)
# Drop premise columns that are not used further in the visible cells.
price = price.drop(columns = ['premise_type', 'address', 'premise', 'premise_code', 'district'])

# Remove loop/temporary names from the notebook namespace.
# NOTE(review): df, entry, parquet_urls and url are only bound if the loops ran
# at least once; with an empty manifest these `del` statements raise NameError.
del price_dfs
del df
del entry
del json_file
del parquet_urls
del url


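Create a dict mapping each item name to its item code, so the user can access items by code instead of by name (only the item lookup is built in this cell; no premise-code lookup is created here)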

In [None]:
# Item name -> item code, materialised from the lazy lookup table into a
# plain in-memory dict. No intermediate Series is left in the namespace.
lookup_dict = (
    lookup
    .set_index('item')['item_code']
    .compute()
    .to_dict()
)

In [None]:
#print(price.head())

Filter the DDF based on user input

In [None]:
import dask.dataframe as dd
from fuzzywuzzy import fuzz
import ipywidgets as widgets
from IPython.display import display, clear_output
import time
import threading

In [None]:
# Notebook-level state read and written by the cells below.
user_input, item_keys = [], []      # two distinct, initially empty lists
filtered_data = state_key = None    # nothing filtered / selected yet

In [None]:
def widget():
    """Display item/state text boxes with Submit and Cancel buttons.

    Submit stores the stripped, lower-cased entries in the global
    ``user_input`` list as ``[item]`` or ``[item, state]``. The list is
    replaced on every submit, so index 0 is always the item and index 1
    (when present) is the state. A blank item leaves ``user_input`` empty,
    and a state is only stored together with an item.

    Cancel clears both text boxes and ``user_input``.
    """
    state_input = widgets.Text(description="State: ")
    item_input = widgets.Text(description="Item: ")
    submit_button = widgets.Button(description="Submit")
    cancel_button = widgets.Button(description="Cancel")
    buttons_box = widgets.HBox([submit_button, cancel_button])

    display(item_input)
    display(state_input)
    display(buttons_box)

    def on_submit(button):
        item = item_input.value.strip().lower()
        state = state_input.value.strip().lower()

        # .strip().lower() always yields a str, never None, so test for
        # emptiness instead of `is not None`.
        entries = []
        if item:
            entries.append(item)
            if state:
                entries.append(state)

        # Replace the contents in place (rather than appending) so repeated
        # submits cannot leave stale values at positions 0 and 1.
        user_input[:] = entries

    def on_cancel(button):
        item_input.value = ""
        state_input.value = ""
        user_input.clear()

    submit_button.on_click(on_submit)
    cancel_button.on_click(on_cancel)

In [None]:
def match(input, choices, threshold=80):
    """Return the choice that best fuzzy-matches ``input``, or None.

    Args:
        input: Search text, compared as given against each lower-cased choice.
        choices: Iterable of candidates; entries that are not ``str`` are skipped.
        threshold: Minimum ``fuzz.partial_ratio`` score required to accept
            the best candidate.

    Returns:
        The best-scoring choice in its original casing, or None when there
        is no string candidate or the best score is below ``threshold``.
    """
    scored = [
        (choice, fuzz.partial_ratio(input, choice.lower()))
        for choice in choices
        if isinstance(choice, str)
    ]
    # With no string candidates there is nothing to rank; return None instead
    # of indexing into a missing result.
    if not scored:
        return None

    best_choice, best_score = max(scored, key=lambda pair: pair[1])
    return best_choice if best_score >= threshold else None

def identify_item_code(user_input):
    """Filter ``price`` to the rows whose item matches the user's search text.

    The text is fuzzy-matched (via ``match``) against the ``item`` and
    ``item_category`` columns of ``lookup``. The item codes belonging to the
    best match in each column are combined, stored in the global
    ``item_keys``, and used to filter ``price``.

    Args:
        user_input: Free-text item or category name.

    Returns:
        The rows of ``price`` whose ``item_code`` is in ``item_keys``
        (no rows when nothing matched).
    """
    global item_keys
    user_item_name = user_input.strip().lower()
    matching_item_codes = set()

    for column in ('item', 'item_category'):
        candidates = lookup[column]
        # Materialise a lazy (Dask) column once so it can be iterated directly.
        if hasattr(candidates, 'compute'):
            candidates = candidates.compute()

        matched_value = match(user_item_name, candidates, threshold=80)
        if matched_value is None:
            continue

        codes = lookup[lookup[column] == matched_value]['item_code']
        # A lazy Series must be computed before it can be turned into a list.
        if hasattr(codes, 'compute'):
            codes = codes.compute()
        matching_item_codes.update(codes.tolist())

    item_keys = list(matching_item_codes)

    return price[price['item_code'].isin(item_keys)]

def identify_state(user_input, df):
    """Filter ``df`` to the rows whose ``state`` fuzzy-matches the user's text.

    The best-matching state name (or None) is stored in the global
    ``state_key``.

    Args:
        user_input: Free-text state name.
        df: Frame with a ``state`` column (pandas or Dask).

    Returns:
        The rows of ``df`` whose ``state`` equals the matched state
        (no rows when nothing matched).
    """
    global state_key
    user_state_name = user_input.strip().lower()

    # Score each distinct state name once instead of every row of the frame.
    states = df['state'].dropna().unique()
    if hasattr(states, 'compute'):
        states = states.compute()

    state_key = match(user_state_name, states, threshold=80)
    matched_states = [] if state_key is None else [state_key]

    return df[df['state'].isin(matched_states)]


In [None]:
# Preview the first rows of the merged price table. A bare last expression
# uses the notebook's rich table display instead of plain-text print output.
price.head()

In [None]:
# Render the input form; its Submit handler writes into the global user_input list.
widget()

In [None]:
# Show what the form has collected so far.
print(user_input)


In [None]:
# Empty the shared user_input list in place, discarding earlier submissions.
# NOTE(review): in a top-to-bottom run this executes before the filtering cell
# that reads user_input -- confirm the intended cell order.
user_input.clear()

In [None]:
# Filter by item first, then narrow by state when one was supplied.
# The state step is nested rather than an `elif`: an `elif len(...) > 1` can
# never run, because `len(...) > 0` is already true whenever it would apply.
if len(user_input) > 0:
    filtered_data = identify_item_code(user_input=user_input[0])
    if len(user_input) > 1:
        filtered_data = identify_state(user_input[1], filtered_data)

print(filtered_data)