In [2]:
import streamlit as st
import pandas as pd
from azure.identity import ClientSecretCredential
from azure.storage.blob import BlobServiceClient
import plotly.express as px
import altair as alt
import io

# Fetch the prepared used-car listings view (a parquet file) from Azure Blob Storage.
container_name = st.secrets["AZURE_CONTAINER_NAME"]

adls_conn_string = st.secrets["AZURE_DATA_LAKE_CONNECTION_STRING"]
# Indexing st.secrets raises when the key is absent, so the only case left to
# guard here is a key that exists but holds an empty value.
if not adls_conn_string:
    raise ValueError("ADLS connection string not found")

# Create a BlobServiceClient from the connection string
service_client = BlobServiceClient.from_connection_string(adls_conn_string)

# Get a client scoped to the configured container
container_client = service_client.get_container_client(container_name)

# Name of the parquet blob that holds the listings view
blob_name = 'used-car-data-view.parquet'
blob_client = container_client.get_blob_client(blob_name)

# Download the whole blob into memory
downloaded_blob = blob_client.download_blob()
bytes_data = downloaded_blob.readall()

# Parse the parquet bytes and keep only the columns this notebook uses.
# .copy() gives an independent frame, so the column assignment below never
# writes into a slice of the full frame.
df = pd.read_parquet(io.BytesIO(bytes_data))
df = df[['car_model','year','price','variant','mileage','state','location']].copy()

# Normalise the one known casing variant of the model name ('myvi' -> 'Myvi')
df['car_model'] = df['car_model'].replace('myvi', 'Myvi')


In [10]:
# Tally listings per state into a two-column frame: state, count
state_counts = df['state'].value_counts().reset_index()
state_counts.columns = ['state', 'count']

# Describe each visual channel separately, then assemble the bar chart
state_axis = alt.X('state:N', title='State')
listings_axis = alt.Y('count:Q', title='Number of Listings')
bar_shading = alt.Color('count:Q', scale=alt.Scale(scheme='blues'))

chart = (
    alt.Chart(state_counts)
    .mark_bar()
    .encode(x=state_axis, y=listings_axis, color=bar_shading)
    .properties(
        title='Distribution of Car Listings by State',
        width=600,
        height=400,
    )
)

# Last expression of the cell: renders the chart
chart

# Print the actual counts and percentages
# print("\nState Distribution:")
# print(df['state'].value_counts())
# print("\nPercentages:")
# print((df['state'].value_counts() / len(df) * 100).round(2), "%")

In [25]:
# Remove Altair's row limit so larger frames can be charted
alt.data_transformers.disable_max_rows()

# Colour schemes available for the heatmap, and the one in use
color_theme_list = ['blues', 'cividis', 'greens', 'inferno', 'magma', 'plasma', 'reds', 'rainbow', 'turbo', 'viridis']
selected_color_theme = 'blues'

# Restrict the listings to a single car model
selected_car_model = 'Vios'
df_selected_car_model = df.loc[df['car_model'] == selected_car_model]

def make_heatmap(input_df, input_y, input_x, input_color, input_color_theme):
    """Build a heatmap of listing counts per ``input_y`` value and price band.

    Prices are coerced to numbers, bucketed into RM 10k-wide bands plus an
    open-ended ``50k+`` band, and counted per (``input_y``, band) pair.
    Rows whose price is missing, non-numeric or negative fall outside every
    band and are therefore not counted.

    Args:
        input_df: DataFrame holding a ``price`` column and the ``input_y``
            column. It is copied, never modified.
        input_y: Name of the column used for grouping and shown on the y-axis.
        input_x: Not used; the x-axis is always the derived ``price_range``
            band. Kept so existing calls keep working.
        input_color: Not used; colour always encodes the ``count`` column.
            Kept so existing calls keep working.
        input_color_theme: Colour scheme name handed to ``alt.Scale``.

    Returns:
        The configured Altair chart object.
    """
    # Work on a copy; anything that cannot be parsed as a number becomes NaN
    df_with_bins = input_df.copy()
    df_with_bins['price'] = pd.to_numeric(df_with_bins['price'], errors='coerce')

    # Band edges in RM. Intervals are right-closed, and include_lowest below
    # makes the first band contain 0 itself.
    price_bins = [0, 10000, 20000, 30000, 40000, 50000, float('inf')]
    price_labels = ['0-10k', '10k-20k', '20k-30k', '30k-40k', '40k-50k', '50k+']

    # Add the price band column
    df_with_bins['price_range'] = pd.cut(df_with_bins['price'],
                                         bins=price_bins,
                                         labels=price_labels,
                                         include_lowest=True)

    # Count listings per (input_y, price band). observed=False is spelled out
    # so empty bands still appear with a count of 0 and pandas does not warn
    # about the changing default for categorical groupers.
    count_matrix = (
        df_with_bins
        .groupby([input_y, 'price_range'], observed=False)
        .size()
        .reset_index(name='count')
    )

    # Derive the y-axis title from the grouping column, e.g. 'state' -> 'State'
    y_title = input_y.replace('_', ' ').title()

    heatmap = alt.Chart(count_matrix).mark_rect().encode(
            y=alt.Y(f'{input_y}:O', axis=alt.Axis(title=y_title, titleFontSize=18, titlePadding=15, titleFontWeight=900, labelAngle=0)),
            # Explicit sort keeps the bands in ascending price order
            x=alt.X('price_range:O', sort=price_labels, axis=alt.Axis(title="Price Range (RM)", titleFontSize=18, titlePadding=15, titleFontWeight=900)),
            color=alt.Color('count:Q',
                             legend=alt.Legend(title="Number of Cars"),
                             scale=alt.Scale(scheme=input_color_theme)),
            stroke=alt.value('black'),
            strokeWidth=alt.value(0.25),
        ).properties(width=900
        ).configure_axis(
        labelFontSize=12,
        titleFontSize=12
        )
    return heatmap

# Build the state-by-price-band heatmap for the selected model
heatmap = make_heatmap(
    input_df=df_selected_car_model,
    input_y='state',
    input_x='price',
    input_color='count',
    input_color_theme=selected_color_theme,
)

# Last expression of the cell: renders the heatmap
heatmap

  count_matrix = df_with_bins.groupby(['state', 'price_range']).size().reset_index(name='count')


In [24]:
# Peek at the first five listings of the selected model
df_selected_car_model.head(5)

Unnamed: 0,car_model,year,price,variant,mileage,state,location
365,Vios,2014,34800,G,162500,Pahang,Temerloh
428,Vios,2017,40700,J,177500,Selangor,Cheras
429,Vios,2011,23800,G,172500,Johor,Ulu Tiram
494,Vios,2016,46800,J,82500,Johor,Johor Bahru
542,Vios,2021,66800,G,82500,Perak,Ipoh
