In [2]:
import pandas as pd
import plotly.graph_objects as go

# Load data
df = pd.read_csv('dataMCMC.csv')
df['source'] = df['CREATED_BY'].apply(lambda x: 'API' if 'API' in str(x).upper() else 'MANUAL')

# Helper function to build a Sankey diagram
def create_sankey(df, cols, title="Sankey Diagram"):
    labels = []
    label_index = {}
    source = []
    target = []
    value = []

    for i in range(len(cols) - 1):
        group = df.groupby([cols[i], cols[i+1]]).size().reset_index(name='count')
        for _, row in group.iterrows():
            src, tgt, cnt = row[cols[i]], row[cols[i+1]], row['count']

            for val in [src, tgt]:
                if val not in label_index:
                    label_index[val] = len(labels)
                    labels.append(val)

            source.append(label_index[src])
            target.append(label_index[tgt])
            value.append(cnt)

    # Create Sankey plot
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            color="blue"
        ),
        link=dict(
            source=source,
            target=target,
            value=value
        ))])

    fig.update_layout(title_text=title, font_size=12)
    fig.write_html('sankey1.html')
    fig.show()


# 📊 Sankey 1: UPDATED_BY → STATE → SERVICE_PROVIDER
create_sankey(
    df,
    cols=['UPDATED_BY', 'source', 'SERVICE_PROVIDER', 'STATE'],
    title="Flow from UPDATED_BY to STATE to SERVICE_PROVIDER"
)



  df = pd.read_csv('dataMCMC.csv')


In [3]:
import pandas as pd
import plotly.graph_objects as go

# Load data
df = pd.read_csv('dataMCMC.csv')
df['source'] = df['CREATED_BY'].apply(lambda x: 'API' if 'API' in str(x).upper() else 'MANUAL')

# Helper function to build a Sankey diagram
def create_sankey(df, cols, title="Sankey Diagram"):
    labels = []
    label_index = {}
    source = []
    target = []
    value = []

    for i in range(len(cols) - 1):
        group = df.groupby([cols[i], cols[i+1]]).size().reset_index(name='count')
        for _, row in group.iterrows():
            src, tgt, cnt = row[cols[i]], row[cols[i+1]], row['count']

            for val in [src, tgt]:
                if val not in label_index:
                    label_index[val] = len(labels)
                    labels.append(val)

            source.append(label_index[src])
            target.append(label_index[tgt])
            value.append(cnt)

    # Create Sankey plot
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=labels,
            color="blue"
        ),
        link=dict(
            source=source,
            target=target,
            value=value
        ))])

    fig.update_layout(title_text=title, font_size=12)
    fig.write_html('sankey2.html')
    fig.show()


# 📊 Sankey 2: UPDATED_BY → GENDER → RACE → OCCUPATION → OKU_STATUS
create_sankey(
    df,
    cols=['UPDATED_BY', 'GENDER', 'RACE', 'OCCUPATION', 'OKU_STATUS'],
    title="Flow from UPDATED_BY to GENDER to RACE to OCCUPATION to OKU_STATUS"
)




Columns (0) have mixed types. Specify dtype option on import or set low_memory=False.



In [4]:
df.columns

Index(['MEMBER_IC', 'MEMBER_NAME', 'STATE', 'BIRTHDATE', 'ID',
       'SERVICE_PROVIDER', 'MEMBER_STATUS', 'OKU_STATUS', 'MEMBER_ID',
       'GENDER', 'AGE COUNT', 'AGE', 'RACE', 'OCCUPATION', 'LAST_LOGIN',
       'CREATED_BY', 'DATE_CREATED', 'UPDATED_BY', 'UPDATED', 'PI1M_REFID',
       'SITE_NAME', 'MEMBER_TYPE', 'STATE_CODE', 'source'],
      dtype='object')

In [5]:
# Group by both columns and count duplicates
duplicate_counts = df.groupby(['MEMBER_IC', 'MEMBER_NAME']).size().reset_index(name='count')

# Filter only duplicates (count > 1)
duplicates_only = duplicate_counts[duplicate_counts['count'] > 1]

In [6]:
duplicates_only

Unnamed: 0,MEMBER_IC,MEMBER_NAME,count
12805,30704050135,MUHAMMAD SHAFIE,2
14022,30714121129,FAKRUL,2
18610,30821081117,AMIRUDDIN BIN CHE LAH,2
28786,31107140102,NURUL PUTRI SYAMIRA,2
30533,31120110442,SUFIA NATASHA BINTI KASSIM,2
...,...,...,...
1253226,991013026163,MAHYUDDIN BIN ABDULLAH,2
1262241,991231146948,NUR AIMAN SYAFIQAH,2
1283452,CE 66818,FARAHIYAH DAMIA BINTI MOHD HUSAINI,2
1284063,CH 82311,NUR ALYAA HANANI BT HUDRUS,2


In [2]:
df.head()

Unnamed: 0,MEMBER_IC,MEMBER_NAME,STATE,BIRTHDATE,ID,SERVICE_PROVIDER,MEMBER_STATUS,OKU_STATUS,MEMBER_ID,GENDER,...,OCCUPATION,LAST_LOGIN,CREATED_BY,DATE_CREATED,UPDATED_BY,UPDATED,PI1M_REFID,SITE_NAME,MEMBER_TYPE,STATE_CODE
0,----------,MOHAMAD HAZIM MUAZ,KELANTAN,1/1/2009 12:00:00 AM,886567,TM,VISITOR,YES,520879,MALE,...,STUDENTS,7/9/2014 11:07:00 AM,TM_NMS_API,22/8/2017 3:04:39 PM,TM_NMS_API,22/8/2017 3:04:39 PM,D03C013,KAMPUNG PAHI,BIRTH CERT,3
1,-----------,MUHAMAD EMIRUL ABBASY,KELANTAN,1/1/2009 12:00:00 AM,886570,TM,VISITOR,YES,520882,MALE,...,STUDENTS,7/9/2014 11:07:00 AM,TM_NMS_API,22/8/2017 3:04:40 PM,TM_NMS_API,22/8/2017 3:04:40 PM,D03C013,KAMPUNG PAHI,BIRTH CERT,3
2,--------,NUR ANIS QAILISYA CHE MOHD KHAIRI,KELANTAN,1/1/2009 12:00:00 AM,886568,TM,VISITOR,YES,520873,FEMALE,...,STUDENTS,7/9/2014 11:07:00 AM,TM_NMS_API,22/8/2017 3:04:39 PM,TM_NMS_API,22/8/2017 3:04:39 PM,D03C013,KAMPUNG PAHI,BIRTH CERT,3
3,-------,NUR ANIS QAISARA CHE MOHD KHAIRI,KELANTAN,1/1/2009 12:00:00 AM,886566,TM,VISITOR,YES,520872,FEMALE,...,STUDENTS,7/9/2014 11:07:00 AM,TM_NMS_API,22/8/2017 3:04:38 PM,TM_NMS_API,22/8/2017 3:04:38 PM,D03C013,KAMPUNG PAHI,BIRTH CERT,3
4,---------,NURUL ALIA MAISARAH,KELANTAN,1/1/2009 12:00:00 AM,886569,TM,VISITOR,YES,520877,FEMALE,...,STUDENTS,7/9/2014 11:07:00 AM,TM_NMS_API,22/8/2017 3:04:39 PM,TM_NMS_API,22/8/2017 3:04:39 PM,D03C013,KAMPUNG PAHI,BIRTH CERT,3


In [7]:
!pip install sidetable


Collecting sidetable


[notice] A new release of pip is available: 24.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading sidetable-0.9.1-py3-none-any.whl.metadata (45 kB)
     ---------------------------------------- 0.0/45.4 kB ? eta -:--:--
     --------------------------- ------------ 30.7/45.4 kB 1.3 MB/s eta 0:00:01
     ---------------------------------- --- 41.0/45.4 kB 653.6 kB/s eta 0:00:01
     -------------------------------------- 45.4/45.4 kB 445.6 kB/s eta 0:00:00
Downloading sidetable-0.9.1-py3-none-any.whl (19 kB)
Installing collected packages: sidetable
Successfully installed sidetable-0.9.1


In [8]:
import sidetable

# Generate frequency + percentage table
df_source_summary = df.stb.freq(['source'])

# Display the summary
print(df_source_summary)


   source    count    percent  cumulative_count  cumulative_percent
0     API  1094037  83.573925           1094037           83.573925
1  MANUAL   215028  16.426075           1309065          100.000000


In [12]:
import pandas as pd

url = 'https://ejoraovtxlbskfcxnkyd.supabase.co/storage/v1/object/sign/sampledt/dataMCMC_API_CELCOM_NMSCMS.csv?token=eyJraWQiOiJzdG9yYWdlLXVybC1zaWduaW5nLWtleV9mZGQ3NTM2Yy01MGJhLTQ5NjEtYTVhYy0yMTBiZjYxNGEyNjgiLCJhbGciOiJIUzI1NiJ9.eyJ1cmwiOiJzYW1wbGVkdC9kYXRhTUNNQ19BUElfQ0VMQ09NX05NU0NNUy5jc3YiLCJpYXQiOjE3NTEzNDcwNzMsImV4cCI6MTc1MTk1MTg3M30.Vrnd9JvrEGTnGfcbijweQufroixHge1kRSVkyFE7XHU'

# Read the CSV directly
df = pd.read_csv(url)

# Show the first few rows
print(df.head())


      MEMBER_IC                  MEMBER_NAME       STATE  \
0  000101010102  NUR SABRINA BINTI MOHD JERY       JOHOR   
1  000101010107       ZARAY REDZUAB BIN AZNI       JOHOR   
2  000101120101                       DEXTER       SABAH   
3  000109121931                  MOHD KHAIRY       SABAH   
4  000112030810                   NURAINNISA  TERENGGANU   

               BIRTHDATE       ID SERVICE_PROVIDER MEMBER_STATUS OKU_STATUS  \
0   1/1/2000 12:00:00 AM   787699           CELCOM        MEMBER         NO   
1   1/1/2000 12:00:00 AM   825648           CELCOM        MEMBER         NO   
2   1/1/2000 12:00:00 AM   801654           CELCOM        MEMBER        YES   
3  9/10/2000 12:00:00 AM  1047123           CELCOM        MEMBER         NO   
4  12/1/2000 12:00:00 AM   790531           CELCOM        MEMBER        YES   

              MEMBER_ID  GENDER  ...  OCCUPATION             LAST_LOGIN  \
0  PI1M_CELCOM_00028819  FEMALE  ...    STUDENTS                    NaN   
1  PI1M_CELCOM

In [14]:
import pandas as pd
import os
from dotenv import load_dotenv

# Load .env variables
load_dotenv()

# Retrieve the token securely
token = os.getenv('SUPABASE_FILE_TOKEN')

# Construct full signed URL
base_url = "https://ejoraovtxlbskfcxnkyd.supabase.co/storage/v1/object/sign/sampledt/dataMCMC_API_CELCOM_NMSCMS.csv"
signed_url = f"{base_url}?token={token}"

# Read the CSV directly
df = pd.read_csv(signed_url)

# Show the data
df.head()


Unnamed: 0,MEMBER_IC,MEMBER_NAME,STATE,BIRTHDATE,ID,SERVICE_PROVIDER,MEMBER_STATUS,OKU_STATUS,MEMBER_ID,GENDER,...,OCCUPATION,LAST_LOGIN,CREATED_BY,DATE_CREATED,UPDATED_BY,UPDATED,PI1M_REFID,SITE_NAME,MEMBER_TYPE,STATE_CODE
0,101010102,NUR SABRINA BINTI MOHD JERY,JOHOR,1/1/2000 12:00:00 AM,787699,CELCOM,MEMBER,NO,PI1M_CELCOM_00028819,FEMALE,...,STUDENTS,,API_CELCOM_NMSCMS,12/5/2017 10:27:49 AM,API_CELCOM_NMSCMS,22/5/2017 5:21:02 PM,CELCOM-022,KG MENSUDUT LAMA,IC,1
1,101010107,ZARAY REDZUAB BIN AZNI,JOHOR,1/1/2000 12:00:00 AM,825648,CELCOM,MEMBER,NO,PI1M_CELCOM_00028810,MALE,...,STUDENTS,,API_CELCOM_NMSCMS,22/5/2017 5:21:02 PM,API_CELCOM_NMSCMS,22/5/2017 5:21:02 PM,CELCOM-022,KG MENSUDUT LAMA,IC,1
2,101120101,DEXTER,SABAH,1/1/2000 12:00:00 AM,801654,CELCOM,MEMBER,YES,PI1M_CELCOM_00049977,MALE,...,STUDENTS,,API_CELCOM_NMSCMS,16/5/2017 3:29:36 PM,API_CELCOM_NMSCMS,22/5/2017 6:37:16 PM,CELCOM-057,PI1M KG. TOBOH,IC,12
3,109121931,MOHD KHAIRY,SABAH,9/10/2000 12:00:00 AM,1047123,CELCOM,MEMBER,NO,PI1M_CELCOM_00134538,MALE,...,STUDENTS,19/5/2018 11:00:45 AM,API_CELCOM_NMSCMS,3/4/2018 11:52:29 AM,API_CELCOM_NMSCMS,2/10/2018 4:20:58 PM,CELCOM-058,PI1M PEJABAT DAERAH KUDAT,IC,12
4,112030810,NURAINNISA,TERENGGANU,12/1/2000 12:00:00 AM,790531,CELCOM,MEMBER,YES,PI1M_CELCOM_00103647,FEMALE,...,STUDENTS,4/7/2017 2:02:57 PM,API_CELCOM_NMSCMS,15/5/2017 5:53:23 PM,API_CELCOM_NMSCMS,20/3/2018 12:36:33 PM,CELCOM-104,KG TOK DOR,IC,11


In [1]:
df.columns

NameError: name 'df' is not defined

In [5]:
from taipy import Core

print("✅ Taipy Core import works!")


✅ Taipy Core import works!


In [None]:
# !pip install taipy --quiet

ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\ahmadnajmi.ariffin\\AppData\\Local\\Programs\\Python\\Python311\\Lib\\site-packages\\pandas-2.2.2.dist-info\\INSTALLERpm4bazwl.tmp'


[notice] A new release of pip is available: 24.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
# from taipy.gui import Gui
# import taipy.gui.builder as tgb
# from math import cos, exp

# value = 10

# def compute_data(decay:int)->list:
#     return [cos(i/6) * exp(-i*decay/600) for i in range(100)]

# def slider_moved(state):
#     state.data = compute_data(state.value)

# with tgb.Page() as page:
#     tgb.text(value="# Taipy Getting Started", mode="md")
#     tgb.text(value="Value: {value}")
#     tgb.slider(value="{value}", on_change=slider_moved)
#     tgb.chart(data="{data}")

# data = compute_data(value)

# if __name__ == "__main__":
#     Gui(page=page).run(title="Dynamic chart")