Cleaning up and simplifying the dataframe to reduce memory and runtime

In [1]:
# If not already installed, do: pip install pandas fastparquet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import dask.dataframe as ddf
from dask.diagnostics import ProgressBar
from tqdm.auto import tqdm

In [2]:

# Remote Parquet lookup tables (item and premise metadata) read by the loading cell.
URL_LOOKUP = 'https://storage.googleapis.com/dosm-public-pricecatcher/lookup_item.parquet'
URL_PREMISE = 'https://storage.googleapis.com/dosm-public-pricecatcher/lookup_premise.parquet'
# Local JSON manifest: a list of entries, each carrying a 'parquet_files' list of URLs.
url_file = 'pricecatcher/pricecatcher/price_urls.json'

# State name -> numeric code. Codes run 1..14 with no gaps or duplicates
# ("Terengganu" previously shared code 14 with "Wilayah Persekutuan").
malaysia_states = {
    "Johor": 1,
    "Kedah": 2,
    "Kelantan": 3,
    "Melaka": 4,
    "Negeri Sembilan": 5,
    "Pahang": 6,
    "Perak": 7,
    "Perlis": 8,
    "Pulau Pinang": 9,
    "Sabah": 10,
    "Sarawak": 11,
    "Selangor": 12,
    "Terengganu": 13,
    "Wilayah Persekutuan": 14,
}

In [3]:
def _load_price_frames(manifest_path):
    """Build one lazy Dask frame per Parquet URL listed in a JSON manifest.

    Parameters
    ----------
    manifest_path : str
        Path to a JSON file holding a list of entries; each entry must have a
        'parquet_files' key with a list of Parquet URLs.

    Returns
    -------
    list
        One Dask DataFrame per URL, with 'date' parsed to datetime (when the
        column exists) and rows carrying a -1 code removed.

    Raises
    ------
    ValueError
        If the manifest lists no Parquet URLs at all.
    """
    with open(manifest_path, 'r') as manifest:
        manifest_entries = json.load(manifest)

    frames = []
    for manifest_entry in manifest_entries:
        for parquet_url in manifest_entry['parquet_files']:
            # NOTE(review): `npartitions` is kept from the original call, but it is
            # not among read_parquet's documented parameters -- confirm it has an effect.
            frame = ddf.read_parquet(parquet_url, blocksize = '1GB', npartitions = 8)
            if 'date' in frame.columns:
                frame['date'] = ddf.to_datetime(frame['date'])
            # Keep a row only when BOTH codes are real. The previous `|` kept rows
            # in which just one of the two codes was the -1 sentinel.
            frame = frame[(frame['item_code'] != -1) & (frame['premise_code'] != -1)]
            frames.append(frame)

    if not frames:
        raise ValueError(f'no parquet URLs found in {manifest_path!r}')
    return frames


def _load_lookup_frame(url):
    """Read a lookup Parquet file lazily, drop rows with missing values, parse 'date' if present."""
    frame = ddf.read_parquet(url, npartitions = 8).dropna()
    if 'date' in frame.columns:
        frame['date'] = ddf.to_datetime(frame['date'])
    return frame


# All loop temporaries live inside the helpers above, so nothing has to be
# `del`-ed afterwards and the configuration constants stay defined: this cell
# can be re-run without first re-running the configuration cell.
price = ddf.concat(_load_price_frames(url_file), ignore_index = True)

premise = _load_lookup_frame(URL_PREMISE)
lookup = _load_lookup_frame(URL_LOOKUP)

# Align the join key dtype on both sides before merging.
price['premise_code'] = price['premise_code'].astype('int32')
premise['premise_code'] = premise['premise_code'].astype('int32')

# Left join starting from the premise table, then drop the premise columns
# that the rest of the notebook does not use.
price = premise.merge(price, on='premise_code',how='left',indicator=False)
price = price.drop(columns = ['premise_type', 'address', 'premise', 'premise_code', 'district'])

# The premise table is no longer needed once it has been merged into `price`.
del premise


Create a dict keyed by (item name, item category) so that a name typed by the user can be translated into the matching item code

In [None]:
#print(lookup.head())

In [4]:
# Compute the lazy lookup frame once. Calling len() and then iterrows() on the
# Dask frame would each run their own computation of it.
lookup_rows = lookup.compute()

# Maps (item name, item category) -> item_code.
lookup_dict = {}

for item_name, item_category, item_code in tqdm(
    zip(lookup_rows['item'], lookup_rows['item_category'], lookup_rows['item_code']),
    total=len(lookup_rows),
):
    lookup_dict[(item_name, item_category)] = item_code


  0%|          | 0/756 [00:00<?, ?it/s]

In [None]:
#lookup_dict.clear()

In [None]:
#print(price.head())

Filter the Dask DataFrame (DDF) based on user input

In [None]:
import dask.dataframe as dd
from fuzzywuzzy import fuzz
import ipywidgets as widgets
from IPython.display import display, clear_output
import concurrent.futures

In [None]:
# Notebook-level state used by the search cells:
#   user_input    -- list of search terms (starts empty)
#   filtered_data -- result of the latest filtering step (None until one has run)
user_input, filtered_data = [], None

In [None]:
def match(input, choices, threshold=80):
    """Return the choice that best fuzzy-matches ``input``, or None.

    Every string in ``choices`` is lower-cased and scored against ``input``
    with ``fuzz.partial_ratio``; non-string choices are skipped. ``input`` is
    used as given, so callers should lower-case it themselves.

    Parameters
    ----------
    input : str
        Text to look for.
    choices : iterable
        Candidate values; only ``str`` items are considered.
    threshold : int, default 80
        Minimum score the best candidate must reach.

    Returns
    -------
    str or None
        The highest-scoring choice (the first one on ties) when its score is
        at least ``threshold``; None when the score is too low or when there
        is no string candidate at all.
    """
    scored = [
        (choice, fuzz.partial_ratio(input, choice.lower()))
        for choice in choices
        if isinstance(choice, str)
    ]
    # Nothing to compare against: previously max() returned None here and the
    # subscript below raised TypeError.
    if not scored:
        return None

    best_choice, best_score = max(scored, key=lambda pair: pair[1])
    return best_choice if best_score >= threshold else None

def identify_item_code(user_input):
    """Select the rows of the global ``price`` frame that match a search term.

    The lower-cased term is fuzzy-matched (threshold 80) against both the item
    name and the item category of every key in the global ``lookup_dict``.
    The item codes of all matching keys are collected, de-duplicated, and used
    to filter ``price`` on its 'item_code' column.

    Parameters
    ----------
    user_input : str
        Search term typed by the user.

    Returns
    -------
    The subset of ``price`` whose 'item_code' is among the matched codes.
    """
    query = user_input.lower()
    codes = set()

    for (item_name, item_category), code in lookup_dict.items():
        name_hit = match(query, [item_name], threshold=80)
        category_hit = match(query, [item_category], threshold=80)
        if name_hit is not None or category_hit is not None:
            codes.add(code)

    return price[price['item_code'].isin(list(codes))]

def identify_state(user_input, df):
    """Keep only the rows of ``df`` whose 'state' best fuzzy-matches ``user_input``.

    The distinct state names are computed once and the lower-cased search term
    is fuzzy-matched against them (threshold 80). The frame is then filtered
    lazily on the single best-matching state.

    Parameters
    ----------
    user_input : str
        State name (or part of it) typed by the user.
    df : dask DataFrame
        Frame with a 'state' column.

    Returns
    -------
    dask DataFrame
        Rows of ``df`` for the matched state; an empty selection when no state
        name reaches the threshold.
    """
    state_name = user_input.lower()

    # Match against the distinct state names only. The previous version
    # re-scanned every row of the whole frame for each 10,000-row chunk, and
    # relied on len(df) and positional row slicing, which force a full compute
    # or are unsupported on a Dask frame.
    known_states = [
        candidate
        for candidate in df['state'].unique().compute()
        if isinstance(candidate, str)
    ]
    matched_state = match(state_name, known_states, threshold=80) if known_states else None

    # isin() requires a list-like; a bare string (or None) is rejected.
    wanted_states = [] if matched_state is None else [matched_state]
    return df[df['state'].isin(wanted_states)]


In [None]:
# Preview the first rows only: price.compute() would load the entire merged
# dataset into memory just to print it.
price.head()

To search, set `item` and `state` in the cell below, then run it together with the cell after it

In [None]:
# Search terms. `item` is required; leave `state` as '' to search every state.
item = 'ayam'
state = 'johor'

# Rebuild user_input from scratch on every run, so a blank term can never
# leave the values of a previous search behind.
item_term, state_term = item.strip(), state.strip()
if item_term and state_term:
    user_input = [item_term, state_term]
elif item_term:
    user_input = [item_term]
else:
    user_input = []

In [None]:
# Start from a clean slate so a skipped search cannot show a stale result.
filtered_data = None

# First term filters by item, the optional second term narrows by state.
if len(user_input) >= 1:
    filtered_data = identify_item_code(user_input[0])

if len(user_input) >= 2:
    filtered_data = identify_state(user_input[1], filtered_data)

# Without any search term there is nothing to compute; calling .compute() on
# None would raise AttributeError.
if filtered_data is None:
    print('No search term given: set `item` (and optionally `state`) and re-run.')
else:
    print(filtered_data.compute())


