In [1]:
import pandas as pd
import polars as pl
from google.cloud import storage
import io
import numpy as np
import math
from tqdm import tqdm
from pprint import pprint
import datetime
import numpy as np

In [2]:
bucket_name = "perqara-dendrobium"

# Explicitly use service account credentials by specifying the private key file.
storage_client = storage.Client.from_service_account_json('/content/perqara-data-532572ce4996.json')

# Get the bucket and blob objects
bucket = storage_client.get_bucket(bucket_name)

# Download the contents of the blob as a string
source1 = "raw/postgres/csv/consultations/consultations.csv" # gs uri
blob1 = bucket.blob(source1)
str_data1 = io.BytesIO(blob1.download_as_string())

source2 = "raw/postgres/csv/lawyer_ratings/lawyer_ratings.csv" # gs uri
blob2 = bucket.blob(source2)
str_data2 = io.BytesIO(blob2.download_as_string())

source3 = "raw/mongodb/csv/m_chat_messages/m_chat_messages.csv" # gs uri
blob3 = bucket.blob(source3)
str_data3 = io.BytesIO(blob3.download_as_string())

In [3]:
df_cons = pd.read_csv(str_data1, sep='|', low_memory=False)
df_cons.head()

Unnamed: 0,id,parent_id,lawyer_id,client_id,skill_id,description,lawyer_attendance,client_attendance,room_key,lawyer_approved,...,stop_time,paid_at,retries,last_call,is_client_rated,summary_sent_at,matter,legal_basis,analysis,conclusion
0,12,,48,44,2,haha,,,eyJpdiI6IjZ5T0s0bzVQcyszbk9RQnVWOEowYnc9PSIsIn...,1,...,2023-05-02 11:40:28,2023-05-02 11:09:59,0,2023-05-02 11:10:28,t,,asdasd,asdasd,asdasd,asdad
1,16,,48,43,2,testing,,,eyJpdiI6ImYyVERnS0ZjRjIvVHlFWlpoZU56bXc9PSIsIn...,1,...,2023-04-11 13:56:02,2023-04-11 13:24:57,0,2023-04-11 13:25:12,t,,asdasd,asdasd,asdasd,asdasd
2,17,,48,43,2,test,,,eyJpdiI6ImRlM0RwMnM5U2N1M2N6NHRhdVpiN0E9PSIsIn...,1,...,2023-04-12 09:39:02,2023-04-12 09:07:50,0,2023-04-12 09:08:33,f,,asdads,asdasda,sdasd,asdasd
3,18,,48,43,2,pringgo test,,,eyJpdiI6IjJGLzhTdGlOVGZydTFDb1BXQXlqZEE9PSIsIn...,1,...,2023-04-12 11:39:29,2023-04-12 11:09:09,0,2023-04-12 11:09:29,t,,asdasd,asdasd,asdasd,asdasd
4,19,,48,49,6,testing,,,eyJpdiI6Ik1jeVg0S2k3TzFodDJrQVRxczN4Wnc9PSIsIn...,1,...,,2023-04-12 12:53:14,0,2023-04-12 12:56:14,f,,,,,


In [4]:
df_lr = pd.read_csv(str_data2, sep=',', low_memory=False)
df_lr.head()

Unnamed: 0,id,lawyer_id,consultation_id,client_id,rating,description,status,validated_by_id,created_at,updated_at,deleted_at
0,1,48,16,43,5,asdasdasd sadasdasd asdasd,1,,2023-04-11 14:03:27,2023-04-11 14:03:27,
1,2,48,18,43,5,asdasasd,1,,2023-04-12 11:35:28,2023-04-12 11:35:28,
2,3,48,21,49,5,,1,,2023-04-12 13:10:40,2023-04-12 13:10:40,
3,4,48,20,53,5,bagus sekali,1,,2023-04-12 13:15:05,2023-04-12 13:15:05,
4,5,48,24,43,5,sdadasd,1,,2023-04-12 23:09:07,2023-04-12 23:09:07,


In [5]:
df_mchat = pd.read_csv(str_data3, sep='|', low_memory=False)
df_mchat.head()

Unnamed: 0,consultation_id,user_name,sender_id,message,sent_at,delivered_at,read_at,version,object_id,file_url,file_size,file_name,file_extension,notify_type,notify_description
0,4265,"Pringgo Jr, S.H., S.E., M.H., M.M.",LAWYER,hellooo,2023-10-11 08:06:34.683,0,2023-10-11 08:06:34.756,0,6526578a00331947f8ffc18e,,,,,,
1,4265,Daniel Test,CLIENT,kekirim bang,2023-10-11 08:06:44.204,0,2023-10-11 08:06:44.248,0,6526579400331947f8ffc192,,,,,,
2,4265,Daniel Test,CLIENT,masuk ga?,2023-10-11 08:06:47.395,0,2023-10-11 08:06:47.438,0,6526579700331947f8ffc196,,,,,,
3,4265,"Pringgo Jr, S.H., S.E., M.H., M.M.",LAWYER,samuk,2023-10-11 08:06:54.623,0,2023-10-11 08:06:54.687,0,6526579e00331947f8ffc19a,,,,,,
4,4265,Daniel Test,CLIENT,pecah,2023-10-11 08:06:58.586,0,2023-10-11 08:06:58.631,0,652657a200331947f8ffc19e,,,,,,


In [6]:
# set to datetime
df_lr['created_at'] = pd.to_datetime(df_lr['created_at'], format='%Y-%m-%d %H:%M:%S', errors='coerce')

In [7]:
excluded_client_ids = [
    25,
    42,
    43,
    44,
    45,
    49,
    54,
    56,
    58,
    62,
    63,
    66,
    85,
    146,
    157,
    295,
    959,
    1609,
    1655,
    2421,
    2526,
    3180,
    4205,
    49,
    56,
    157,
    2995,
    7569,
    5804,
    5717,
    5716,
    1821,
    7749,
    7743,
    54,
    78,
    11746,
]
excluded_lawyer_ids = [36, 38, 48, 120, 192, 195]
filtered_df_lr = df_lr[~df_lr["lawyer_id"].isin(excluded_lawyer_ids)]
filtered_df_lr = filtered_df_lr[~filtered_df_lr["client_id"].isin(excluded_client_ids)]
filtered_df_lr["description"] = filtered_df_lr["description"].apply(lambda x: x.strip() if isinstance(x, str) else x)
filtered_df_lr.head()

Unnamed: 0,id,lawyer_id,consultation_id,client_id,rating,description,status,validated_by_id,created_at,updated_at,deleted_at
18,19,33,111,165,5,,1,,2023-05-16 15:24:55,2023-05-16 15:24:55,
19,20,17,115,165,5,,1,,2023-05-16 16:59:32,2023-05-16 17:07:50,
25,26,33,202,288,5,Sangat mudah di mengerti penjelasan dari ibu C...,1,,2023-05-25 14:59:42,2023-05-25 14:59:42,
28,29,73,254,345,5,,1,,2023-06-01 14:55:57,2023-06-01 14:55:57,
30,31,94,277,388,5,,1,,2023-06-05 11:53:48,2023-06-05 11:53:48,


In [8]:
filtered_df_cons = df_cons[~df_cons["lawyer_id"].isin(excluded_lawyer_ids)]
filtered_df_cons = filtered_df_cons[~df_cons["client_id"].isin(excluded_client_ids)]
filtered_df_cons['created_at'] = pd.to_datetime(filtered_df_cons['created_at'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
filtered_df_cons['stop_time'] = pd.to_datetime(filtered_df_cons['stop_time'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
filtered_df_cons['updated_at'] = pd.to_datetime(filtered_df_cons['updated_at'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
filtered_df_cons = filtered_df_cons[filtered_df_cons["status"] == 600]
filtered_df_cons.head()

  filtered_df_cons = filtered_df_cons[~df_cons["client_id"].isin(excluded_client_ids)]


Unnamed: 0,id,parent_id,lawyer_id,client_id,skill_id,description,lawyer_attendance,client_attendance,room_key,lawyer_approved,...,stop_time,paid_at,retries,last_call,is_client_rated,summary_sent_at,matter,legal_basis,analysis,conclusion
37,50,,31,72,3,"Saya ingin menanyakan masalah teman saya, suda...",,,eyJpdiI6Im5oVFNQQ0hjWitIcFEwMjZDeGt3bUE9PSIsIn...,1,...,NaT,2023-05-06 20:27:09,0,2023-05-07 08:51:05,f,,-,-,-,-
48,61,,31,100,3,Hallo pak saya mau konsultasi. \n1. Saya terli...,,,eyJpdiI6IkpiOXNJRjcvNDdMOEljNk83MTc2d3c9PSIsIn...,1,...,NaT,2023-05-09 12:20:25,0,2023-05-11 06:03:16,f,,-,-,-,-
49,62,,31,101,6,Masalah pidana,,,eyJpdiI6IjVvWU1yTjNYZk5Fc1hWN1dJRzFWN3c9PSIsIn...,1,...,NaT,2023-05-09 12:20:52,0,2023-05-11 06:01:56,f,,-,-,-,-
53,66,,17,79,5,"Selamat sore,\n\nSaya ingin melakukan konsulta...",,,eyJpdiI6IktFS0xhTW5XU3Jxa2w4OXI4M0RlR1E9PSIsIn...,1,...,NaT,2023-05-09 16:39:47,0,2023-05-09 16:43:19,f,,Istri hendak mengajukan gugatan cerai kepada s...,Pasal 19 Undang-Undang Nomor 1 Tahun 1974 Jo P...,Gugatan sudah memenuhi syarat2 gugatan.,Waktunya kurang.
55,68,,33,155,3,Selamat siang Ibu. Saya Tungga.\nSaya ingin ko...,,,eyJpdiI6IkhrZ3JOVllEWElRNHN3L0VzNHZvUVE9PSIsIn...,1,...,NaT,2023-05-15 10:58:01,0,2023-05-15 11:21:10,f,,Ibu Tungga ingin membatalkan transaksi pembeli...,kitab undang-undang hukum perdata,Pembatalan perjanjian dapat diajukan oleh masi...,Ibu Tungga dapat mengajukan permohonan pembata...


In [9]:
cleaned_consultations_df = filtered_df_cons[["id", "created_at", "stop_time", "updated_at"]]
merged_m_chat_messages_df = df_mchat[["consultation_id", "sender_id", "message", "sent_at", "read_at"]]
df_chat_filtered = merged_m_chat_messages_df[merged_m_chat_messages_df["consultation_id"].isin(cleaned_consultations_df["id"])]
cleaned_consultations_df.rename(columns={'id':'consultation_id'}, inplace=True)
df_chat_final = pd.merge(df_chat_filtered, cleaned_consultations_df, on='consultation_id', how='left')
df_chat_final.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_consultations_df.rename(columns={'id':'consultation_id'}, inplace=True)


Unnamed: 0,consultation_id,sender_id,message,sent_at,read_at,created_at,stop_time,updated_at
0,4268,LAWYER,ad yg bisa saya bantu?,2023-10-11 08:18:37.413,2023-10-11 08:18:37.478,NaT,NaT,NaT
1,4269,LAWYER,ada yg bisa dibantu,2023-10-11 08:47:39.837,1970-01-01 00:00:00.000,NaT,NaT,NaT
2,4270,CLIENT,"Assalamualaikum bapak/ibu, saya Indah Agustina...",2023-10-11 08:37:13.528,2023-10-11 08:37:13.630,NaT,NaT,NaT
3,4272,LAWYER,"selamat sore , perkenalkan saya Suhartawan Hut...",2023-10-11 08:34:32.378,2023-10-11 08:39:28.223,NaT,NaT,NaT
4,4272,LAWYER,ada yg bisa kami bantu ibu,2023-10-11 08:34:43.864,2023-10-11 08:39:28.223,NaT,NaT,NaT


In [10]:
# set to datetime
df_chat_final['read_at'] = pd.to_datetime(df_chat_final['read_at'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')
df_chat_final['sent_at'] = pd.to_datetime(df_chat_final['sent_at'], format='%Y-%m-%d %H:%M:%S.%f', errors='coerce')

In [11]:
def calculate_conversation_duration(df_input: pd.DataFrame) -> pd.DataFrame:
    # get unique consultation_id
    consultation_unique_id = df_input["consultation_id"].unique()

    col_consul_id = []
    col_client_read_avg = []
    col_lawyer_read_avg = []
    col_last_time = []
    col_lawyer_messages_count = []
    col_client_messages_count = []
    col_system_messages_count = []
    col_last_sender = []

    # START
    for i in tqdm(consultation_unique_id):
        # get data for each consultation
        df = df_input[df_input["consultation_id"] == i]

        # initial variables
        timestamps = []
        current_sender = None
        prev_sender = None
        prev_time = None

        # get first and last sender
        first_sender = df["sender_id"].iloc[0]
        last_sender = df["sender_id"].iloc[-1]

        # START
        for _, row in df.iterrows():
            current_sender = row['sender_id']
            if current_sender != prev_sender:
                if prev_time is not None:
                    timestamps.append(prev_time)

            prev_sender = current_sender
            prev_time = row['sent_at']

        timestamps.append(row['sent_at'])

        #remove first element
        timestamps = timestamps[1:]

        # Check if sent_at and stop_time are not None
        if pd.notnull(row["stop_time"]) and pd.notnull(row["sent_at"]):
            last_time = (row["stop_time"] - row["sent_at"]).total_seconds()
        else:
            last_time = np.nan  # Assign NaN (Not a Number) if list is empty

        # Filter out None values and ensure all are datetime objects
        timestamps = [ts for ts in timestamps if pd.notnull(ts)]

        # Check if there are valid timestamps
        if timestamps:
            # Convert the timestamps to numpy datetime64 objects
            np_timestamps = np.array(timestamps, dtype="datetime64[s]")

            # Check if datetime_objects is not empty and has more than one element
            if len(np_timestamps) > 1:
                # Calculate time differences in seconds
                time_diff_seconds = np.diff(np_timestamps) / np.timedelta64(1, "s")
            else:
                # Handle cases where the array is empty or has only one element
                time_diff_seconds = np.array([])  # Empty array
        else:
            np_timestamps = np.array([], dtype="datetime64[s]")
            time_diff_seconds = np.array([])

        # Calculate average time between messages
        if first_sender == "LAWYER":
            client_to_lawyer = [time_diff_seconds[i] for i in range(len(time_diff_seconds)) if i % 2 == 0]
            lawyer_to_client = [time_diff_seconds[i] for i in range(len(time_diff_seconds)) if i % 2 != 0]
        else:
            lawyer_to_client = [time_diff_seconds[i] for i in range(len(time_diff_seconds)) if i % 2 == 0]
            client_to_lawyer = [time_diff_seconds[i] for i in range(len(time_diff_seconds)) if i % 2 != 0]

        # Calculate mean only if the lists are not empty
        if client_to_lawyer:
            client_read_avg = np.mean(client_to_lawyer)
        else:
            client_read_avg = np.nan  # Assign NaN (Not a Number) if list is empty

        if lawyer_to_client:
            lawyer_read_avg = np.mean(lawyer_to_client)
        else:
            lawyer_read_avg = np.nan  # Assign NaN (Not a Number) if list is empty

        # Append values
        col_consul_id.append(i)
        col_client_read_avg.append(client_read_avg)
        col_lawyer_read_avg.append(lawyer_read_avg)
        col_last_time.append(last_time)
        col_client_messages_count.append(df[df["sender_id"] == "CLIENT"].shape[0])
        col_lawyer_messages_count.append(df[df["sender_id"] == "LAWYER"].shape[0])
        col_system_messages_count.append(df[df["sender_id"] == "SYSTEM"].shape[0])
        col_last_sender.append(last_sender)
    # END

    # Round values to 2 decimal places
    col_client_read_avg_rounded = [
        round(value, 2) if not math.isnan(value) else value for value in col_client_read_avg
    ]
    col_lawyer_read_avg_rounded = [
        round(value, 2) if not math.isnan(value) else value for value in col_lawyer_read_avg
    ]

    # Create a dataframe
    data = {
        "consultation_id": col_consul_id,
        "client_msg_cnt": col_client_messages_count,
        "lawyer_msg_cnt": col_lawyer_messages_count,
        "system_msg_cnt": col_system_messages_count,
        "client_read_avg": col_client_read_avg_rounded,
        "lawyer_read_avg": col_lawyer_read_avg_rounded,
        "last_time": col_last_time,
        "last_sender": col_last_sender,
    }

    df_result = pd.DataFrame(data)
    return df_result

In [12]:
df_result = calculate_conversation_duration(df_chat_final)
df_result.head()

100%|██████████| 13787/13787 [01:45<00:00, 130.17it/s]


Unnamed: 0,consultation_id,client_msg_cnt,lawyer_msg_cnt,system_msg_cnt,client_read_avg,lawyer_read_avg,last_time,last_sender
0,4268,0,1,0,,,,LAWYER
1,4269,0,1,0,,,,LAWYER
2,4270,1,0,0,,,,CLIENT
3,4272,9,4,0,265.0,216.0,,CLIENT
4,4274,3,0,0,,,,CLIENT


In [13]:
df_result.describe()

Unnamed: 0,consultation_id,client_msg_cnt,lawyer_msg_cnt,system_msg_cnt,client_read_avg,lawyer_read_avg,last_time
count,13787.0,13787.0,13787.0,13787.0,12226.0,11889.0,0.0
mean,17701.64967,13.550156,13.673388,0.248422,174.647573,45.316218,
std,7393.02479,11.658626,9.93897,1.233807,7081.288573,7286.010378,
min,4268.0,0.0,0.0,0.0,-45898.0,-256144.83,
25%,11031.0,5.0,6.0,0.0,39.455,40.5,
50%,18146.0,11.0,12.0,0.0,64.565,64.0,
75%,24203.5,19.0,19.0,0.0,116.6525,109.5,
max,29893.0,149.0,112.0,60.0,256509.5,169809.0,
