In [None]:
import os
import time
from io import StringIO
from google.cloud import storage

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

from tqdm import tqdm
from transformers import pipeline
from tqdm import tqdm
import math

import re

In [None]:
def read_data_from_gcs(bucket_name, folder, filename, delimiter=','):
    """Download ``{folder}/{filename}`` from a GCS bucket and parse it as CSV.

    Uses the module-level ``storage_client``. The whole object is downloaded
    as text and parsed in memory with ``pd.read_csv``.

    Args:
        bucket_name: Name of the bucket to read from.
        folder: Object prefix (joined to ``filename`` with ``/``).
        filename: Object name inside ``folder``; also used in the log line.
        delimiter: Field separator passed to ``pd.read_csv``.

    Returns:
        The parsed ``pd.DataFrame``.
    """
    started = time.time()

    bucket = storage_client.get_bucket(bucket_name)
    raw_text = bucket.blob(f'{folder}/{filename}').download_as_text()
    frame = pd.read_csv(StringIO(raw_text), delimiter=delimiter, low_memory=False)

    # Report how long download + parse took.
    print(f"Read {filename} complete. Elapsed time: {time.time() - started:.2f} seconds")
    return frame

def remove_users(df, ids, column):
    """Return the rows of ``df`` whose ``column`` value is not listed in ``ids``."""
    keep = ~df[column].isin(ids)
    return df[keep]

def filter_dataframe_by_column(df, column, status):
    """Return the rows of ``df`` where ``column`` equals ``status``."""
    matches = df[column].eq(status)
    return df[matches]

def filter_and_process_ratings(df_rating, df_consultations_filtered):
    """Keep only the ratings that belong to a consultation in ``df_consultations_filtered``.

    A rating is kept when its ``consultation_id`` appears in the ``id`` column
    of ``df_consultations_filtered``.
    """
    kept_consultation_ids = df_consultations_filtered['id'].values
    belongs_to_kept = df_rating['consultation_id'].isin(kept_consultation_ids)
    return df_rating[belongs_to_kept]

def filter_ratings_below_4(df_rating_filtered):
    """Return the rows whose ``rating`` is strictly less than 4."""
    is_low = df_rating_filtered['rating'].lt(4)
    return df_rating_filtered[is_low]

def generate_final_output_1(df_rating_filtered_rat_below_4):
    """Build the REJECT output for low ratings that carry no description.

    A rating counts as "no description" when its ``description`` is exactly a
    single space (``" "``), the placeholder this notebook compares against.

    Args:
        df_rating_filtered_rat_below_4: Low-rating frame with at least the
            ``description`` and ``consultation_id`` columns. Not modified.

    Returns:
        A new frame with columns ``consultation_id``, ``recommendation``
        (``'REJECT'``) and ``reason`` (``'No Description'``).
    """
    no_description = df_rating_filtered_rat_below_4['description'] == " "
    # Take an explicit copy so the new columns are written to an independent
    # frame rather than to a slice of the caller's data (SettingWithCopy).
    final_output_1 = df_rating_filtered_rat_below_4.loc[no_description, ['consultation_id']].copy()
    final_output_1['recommendation'] = 'REJECT'
    final_output_1['reason'] = 'No Description'
    return final_output_1

# Loaded Hugging Face pipelines keyed by model name, so the model is loaded
# once per session instead of on every call.
_SENTIMENT_PIPELINES = {}


def perform_sentiment_analysis(text):
    """Classify the sentiment of one text or of a batch of texts.

    Args:
        text: Either a single string, or an iterable (list, tuple, Series, ...)
            of strings.

    Returns:
        * For a single string: the predicted label, or ``''`` when the text is
          empty / whitespace-only / not a string (e.g. ``None`` or NaN).
        * For an iterable: a list with one label per input element, in input
          order, using ``''`` for elements that have no text to classify.

    Previously a list input returned only the label of the FIRST element
    (``nlp(text)[0]['label']``), which callers then broadcast to every row.
    """
    model = 'ayameRushia/bert-base-indonesian-1.5G-sentiment-analysis-smsa'

    is_batch = not isinstance(text, str) and hasattr(text, '__iter__')
    texts = list(text) if is_batch else [text]

    labels = [''] * len(texts)
    # Only real, non-blank strings are sent to the model.
    to_score = [(pos, item) for pos, item in enumerate(texts)
                if isinstance(item, str) and item.strip()]

    if to_score:
        nlp = _SENTIMENT_PIPELINES.get(model)
        if nlp is None:
            nlp = pipeline("sentiment-analysis", model=model, tokenizer=model)
            _SENTIMENT_PIPELINES[model] = nlp
        # truncation=True keeps over-long descriptions within the model's
        # maximum sequence length instead of failing inside the model.
        results = nlp([item for _, item in to_score], truncation=True)
        for (pos, _), result in zip(to_score, results):
            labels[pos] = result['label']

    return labels if is_batch else labels[0]

def add_sentiment_column(df):
    """Return a copy of ``df`` with a per-row ``sentiment`` column.

    Each ``description`` is classified individually with
    ``perform_sentiment_analysis`` (identical descriptions are classified only
    once), so every row receives the label of its own text. The previous
    implementation passed the whole list in one call and broadcast the single
    returned label to all rows.

    Args:
        df: Frame with a ``description`` column. Not modified.

    Returns:
        A new frame equal to ``df`` plus the ``sentiment`` column.

    NOTE(review): rows whose description is the blank placeholder ``" "`` are
    still sent through sentiment analysis here, as before; confirm whether
    they should be excluded upstream.
    """
    result = df.copy()

    label_by_text = {}
    sentiments = []
    for text in result["description"].tolist():
        if text not in label_by_text:
            label_by_text[text] = perform_sentiment_analysis(text)
        sentiments.append(label_by_text[text])

    result["sentiment"] = sentiments
    return result

def generate_output_2(result_df):
    """Build the REJECT output for low ratings whose description is positive.

    Args:
        result_df: Frame with ``consultation_id`` and ``sentiment`` columns.
            Not modified (the previous version wrote ``recommendation`` and
            ``reason='Positive Sentiment'`` onto every row of the caller's
            frame, including non-positive ones).

    Returns:
        A new frame with columns ``consultation_id``, ``recommendation``
        (``'REJECT'``) and ``reason`` (``'Positive Sentiment'``), one row per
        rating whose sentiment is exactly ``'Positive'``.
    """
    is_positive = result_df['sentiment'] == 'Positive'
    final_output_2 = result_df.loc[is_positive, ['consultation_id']].copy()
    final_output_2['recommendation'] = 'REJECT'
    final_output_2['reason'] = 'Positive Sentiment'
    return final_output_2

def filter_neg_neu_sentiment(result_df):
    """Return ``id`` and ``consultation_id`` of the rows whose sentiment is not ``'Positive'``."""
    not_positive = result_df['sentiment'].ne('Positive')
    return result_df[not_positive][['id', 'consultation_id']]

def filter_chat_messages_by_consultation_id(rating_id_neg_neu_sentiment, df_chat):
    """Select the chat messages of the consultations listed in ``rating_id_neg_neu_sentiment``.

    Only the columns ``consultation_id``, ``sender_id``, ``message``,
    ``sent_at`` and ``read_at`` of ``df_chat`` are returned.
    """
    wanted_ids = rating_id_neg_neu_sentiment['consultation_id']
    chat_columns = ['consultation_id', 'sender_id', 'message', 'sent_at', 'read_at']

    chat_subset = df_chat[chat_columns]
    in_wanted = chat_subset['consultation_id'].isin(wanted_ids)
    return chat_subset[in_wanted]

def extract_consultation_stop_time(rating_id_neg_neu_sentiment, df_consultations_filtered):
    """Return ``consultation_id`` / ``stop_time`` for the consultations referenced by the ratings.

    The consultation ``id`` column is renamed to ``consultation_id`` and
    ``stop_time`` is converted with ``pd.to_datetime``.
    """
    wanted = df_consultations_filtered['id'].isin(rating_id_neg_neu_sentiment['consultation_id'])
    selected = df_consultations_filtered[wanted]

    stop_times = selected[['id', 'stop_time']].rename(columns={'id': 'consultation_id'})
    stop_times['stop_time'] = pd.to_datetime(stop_times['stop_time'])
    return stop_times

def merge_chat_messages_with_stop_time(df_chat_filt_neg_neu, to_merge_consultation):
    """Left-join the consultation columns onto every chat message via ``consultation_id``."""
    return df_chat_filt_neg_neu.merge(to_merge_consultation, how='left', on='consultation_id')

# Function to clean the rating data
def clean_rating_data(df_rating, testing_user, client_user):
    """Parse ``created_at`` and drop ratings that involve test accounts.

    ``df_rating['created_at']`` is converted to datetime in place on the
    passed frame; rows whose ``lawyer_id`` is in ``testing_user`` or whose
    ``client_id`` is in ``client_user`` are then removed via ``remove_users``.

    Returns:
        The filtered ratings frame.
    """
    df_rating['created_at'] = pd.to_datetime(df_rating['created_at'])

    without_test_lawyers = remove_users(df_rating, testing_user, 'lawyer_id')
    return remove_users(without_test_lawyers, client_user, 'client_id')

def change_datetime_cons(df, column):
    """Return a copy of ``df`` with ``column`` converted by ``pd.to_datetime``.

    The input frame is left untouched. The notebook passes a filtered slice
    here, and assigning into that slice directly is what triggers pandas'
    SettingWithCopy behaviour and silently changes the caller's data.

    Args:
        df: Frame containing ``column``.
        column: Name of the column to convert.

    Returns:
        A new frame with ``column`` as datetimes.
    """
    converted = df.copy()
    converted[column] = pd.to_datetime(converted[column])
    return converted

def change_datetime_chat(df, column):
    """Convert a UTC timestamp column to naive Asia/Jakarta wall-clock time.

    The column is parsed as UTC, shifted to ``Asia/Jakarta``, stripped of its
    timezone and truncated to whole seconds, then written back to ``df`` in
    place (``df`` is also returned, as before).

    Compared with the previous implementation this
      * accepts both timezone-naive values (treated as UTC) and
        timezone-aware values (``tz_localize('UTC')`` raised on the latter);
      * drops the timezone with ``tz_localize(None)`` instead of formatting
        every value to a string and parsing it again.

    NOTE: the conversion is not idempotent; calling it twice on the same
    column shifts the times twice.

    Args:
        df: Frame containing ``column``. Modified in place.
        column: Name of the timestamp column.

    Returns:
        ``df`` itself.
    """
    utc_times = pd.to_datetime(df[column], utc=True)
    jakarta_wall_clock = utc_times.dt.tz_convert('Asia/Jakarta').dt.tz_localize(None)
    # The old strftime('%Y-%m-%d %H:%M:%S') round trip discarded sub-second
    # precision; flooring to seconds keeps the resulting values the same.
    df[column] = jakarta_wall_clock.dt.floor('s')
    return df

def _rounded_mean_seconds(gaps):
    """Mean of ``gaps`` rounded to 2 decimals; NaN when there are no gaps."""
    if len(gaps) == 0:
        # np.mean([]) also yields NaN but emits a RuntimeWarning.
        return np.nan
    value = np.mean(gaps)
    return value if math.isnan(value) else round(value, 2)


def calculate_conversation_duration(df_input):
    """Summarise the chat of every consultation in ``df_input``.

    For each ``consultation_id`` the messages are taken in frame order and
    split into "runs" of consecutive messages from the same ``sender_id``.
    The time of the last message of each run is recorded, and the gaps (in
    seconds) between successive run-end times are split alternately into two
    groups:

      * first sender is ``'LAWYER'``: gaps 0, 2, 4, ... -> ``client_read_avg``,
        gaps 1, 3, 5, ... -> ``lawyer_read_avg``;
      * any other first sender: gaps 0, 2, 4, ... -> ``lawyer_read_avg``,
        gaps 1, 3, 5, ... -> ``client_read_avg``.

    NOTE(review): this alternation assumes exactly two parties taking turns.
    ``'SYSTEM'`` messages (counted separately below) also start/end runs and
    therefore shift which party a gap is attributed to. Behaviour is kept as
    is so results stay comparable with existing thresholds; confirm whether
    SYSTEM messages should be excluded first.

    Args:
        df_input: Frame with columns ``consultation_id``, ``sender_id``,
            ``sent_at`` (datetime) and ``stop_time``. Rows are assumed to be
            in chronological order within a consultation; no sorting is done.

    Returns:
        A frame with one row per consultation (in order of first appearance)
        and the columns ``consultation_id``, ``client_msg_cnt``,
        ``lawyer_msg_cnt``, ``system_msg_cnt``, ``client_read_avg``,
        ``lawyer_read_avg`` (seconds, rounded to 2 decimals, NaN when there is
        no gap of that kind), ``last_time`` (seconds between the last message
        and ``stop_time``) and ``last_sender``.
    """
    columns = ['consultation_id', 'client_msg_cnt', 'lawyer_msg_cnt', 'system_msg_cnt',
               'client_read_avg', 'lawyer_read_avg', 'last_time', 'last_sender']
    records = []

    # groupby visits each consultation once (sort=False keeps first-appearance
    # order) instead of re-filtering the whole frame for every id.
    for consultation_id, df in tqdm(df_input.groupby('consultation_id', sort=False)):
        senders = df['sender_id'].tolist()
        sent_times = df['sent_at'].tolist()
        last_position = len(senders) - 1

        # Time of the last message in each run of same-sender messages.
        run_end_times = [
            sent_at
            for position, sent_at in enumerate(sent_times)
            if position == last_position or senders[position + 1] != senders[position]
        ]

        # Seconds between the final message and the consultation's stop time.
        last_time = (pd.to_datetime(df['stop_time'].iloc[-1]) - sent_times[-1]) / np.timedelta64(1, 's')

        run_ends = np.array([np.datetime64(timestamp) for timestamp in run_end_times])
        gaps_seconds = np.diff(run_ends) / np.timedelta64(1, 's')
        even_gaps, odd_gaps = gaps_seconds[0::2], gaps_seconds[1::2]

        if senders[0] == 'LAWYER':
            client_gaps, lawyer_gaps = even_gaps, odd_gaps
        else:
            lawyer_gaps, client_gaps = even_gaps, odd_gaps

        records.append({
            'consultation_id': consultation_id,
            'client_msg_cnt': senders.count('CLIENT'),
            'lawyer_msg_cnt': senders.count('LAWYER'),
            'system_msg_cnt': senders.count('SYSTEM'),
            'client_read_avg': _rounded_mean_seconds(client_gaps),
            'lawyer_read_avg': _rounded_mean_seconds(lawyer_gaps),
            'last_time': last_time,
            'last_sender': senders[-1],
        })

    # Passing ``columns`` keeps the schema stable even when there are no rows.
    df_result = pd.DataFrame(records, columns=columns)
    print(f'df_result shape: {df_result.shape}')

    return df_result


# --- Helpers for step 9: keyword filtering and timestamp-issue detection ---
def create_regex_pattern(expected_words):
    """Build one alternation regex that matches any of ``expected_words`` as whole words.

    Each word is regex-escaped, wrapped in ``\\b`` word boundaries and followed
    by an optional ``(?:\\s\\w+)?`` group; the pieces are joined with ``|``.
    """
    alternatives = []
    for word in expected_words:
        alternatives.append(r'\b' + re.escape(word) + r'\b(?:\s\w+)?')
    return '|'.join(alternatives)

def filter_cons_id_contain_slow_desc(df, pattern):
    """Return the ``consultation_id`` values whose ``description`` matches ``pattern``.

    Missing descriptions (NaN/None) count as "no match". Without ``na=False``
    ``str.contains`` returns NaN for them and boolean masking then raises
    ``ValueError``.

    Args:
        df: Frame with ``description`` and ``consultation_id`` columns.
        pattern: Regular expression passed to ``Series.str.contains``.

    Returns:
        A NumPy array of matching consultation ids.
    """
    has_keyword = df['description'].str.contains(pattern, na=False)
    return df[has_keyword]['consultation_id'].values

def filter_cons_id_without_slow_desc(df, pattern):
    """Return the ``consultation_id`` values whose ``description`` does NOT match ``pattern``.

    Missing descriptions (NaN/None) count as "no match" and are therefore
    included in the result. Without ``na=False`` ``str.contains`` returns NaN
    for them and the ``~`` / boolean masking step fails.

    Args:
        df: Frame with ``description`` and ``consultation_id`` columns.
        pattern: Regular expression passed to ``Series.str.contains``.

    Returns:
        A NumPy array of non-matching consultation ids.
    """
    has_keyword = df['description'].str.contains(pattern, na=False)
    return df[~has_keyword]['consultation_id'].values

def filter_duration_data_by_cons_id(df_duration_summary, cons_id):
    """Return the duration-summary rows whose ``consultation_id`` is in ``cons_id``."""
    is_selected = df_duration_summary['consultation_id'].isin(cons_id)
    return df_duration_summary[is_selected]

def identify_consultations_with_issues(df_dur_sum_2):
    """Return the ``consultation_id`` of rows with suspicious duration values.

    A row is flagged when ``client_read_avg``, ``lawyer_read_avg`` or
    ``last_time`` is negative, or when any column of the row is missing.
    """
    negative_client = df_dur_sum_2['client_read_avg'] < 0
    negative_lawyer = df_dur_sum_2['lawyer_read_avg'] < 0
    negative_last = df_dur_sum_2['last_time'] < 0
    has_missing = df_dur_sum_2.isna().any(axis=1)

    flagged = negative_client | negative_lawyer | negative_last | has_missing
    return df_dur_sum_2[flagged][['consultation_id']]

def gen_output_for_issues(cons_id_with_neg_value):
    """Build the INVESTIGATE output for consultations with timestamp problems.

    Args:
        cons_id_with_neg_value: Frame holding the flagged ``consultation_id``
            values. Not modified (the previous version added the columns to
            the caller's slice first and only then copied).

    Returns:
        A new frame with the input columns plus ``recommendation``
        (``"INVESTIGATE"``) and ``reason`` (``"Tech issue with timestamps"``).
    """
    flagged = cons_id_with_neg_value.copy()
    flagged['recommendation'] = "INVESTIGATE"
    flagged['reason'] = "Tech issue with timestamps"
    return flagged

def gen_output_unrelated_time(df):
    """Build the INVESTIGATE output for consultations whose complaint is not about time.

    Returns a new frame with ``consultation_id``, ``recommendation`` and
    ``reason``; the input frame is not modified.
    """
    return df[['consultation_id']].assign(
        recommendation="INVESTIGATE",
        reason="Not related with time",
    )

# --- Helpers for step 10: outlier detection and recommendation rules ---


def check_outlier(value, column):
    """Tell whether ``value`` lies outside the 1.5*IQR fences stored for ``column``.

    The quartiles are read from the module-level ``df_outlier_thres`` frame,
    using the first row whose ``metrics`` equals ``column``.
    """
    is_metric = df_outlier_thres['metrics'] == column
    q1 = df_outlier_thres.loc[is_metric, 'Q1'].values[0]
    q3 = df_outlier_thres.loc[is_metric, 'Q3'].values[0]
    iqr = df_outlier_thres.loc[is_metric, 'IQR'].values[0]

    fence = 1.5 * iqr
    too_low = value < (q1 - fence)
    return too_low or (value > (q3 + fence))

def add_outlier_columns(df):
    """Return a copy of ``df`` with a boolean ``<metric>_outlier`` column per metric.

    For each of ``client_read_avg``, ``lawyer_read_avg`` and ``last_time`` the
    values are passed through ``check_outlier`` and the result is stored in
    ``client_read_avg_outlier`` / ``lawyer_read_avg_outlier`` /
    ``last_time_outlier``.

    Args:
        df: Duration summary frame. Not modified (the notebook passes a
            filtered slice, and writing columns onto that slice is what
            triggers pandas' SettingWithCopy behaviour).

    Returns:
        A new frame with the three extra columns.
    """
    result = df.copy()
    for column in ('client_read_avg', 'lawyer_read_avg', 'last_time'):
        result[column + '_outlier'] = result[column].apply(lambda x, metric=column: check_outlier(x, metric))
    return result

# Apply rules and create the 'reason' column
def apply_rules(row):
    """Map one consultation's outlier flags to a ``(recommendation, reason)`` pair.

    ``row`` must provide ``last_time_outlier``, ``last_sender``,
    ``lawyer_read_avg_outlier`` and ``client_read_avg_outlier``.

    When ``last_time_outlier`` is set and the last sender is the lawyer or the
    client, the decision depends on that sender's own outlier flag. In every
    other case the decision comes from the two read-average flags alone.
    """
    last_time_outlier = row['last_time_outlier']
    last_sender = row['last_sender']
    lawyer_read_outlier = row['lawyer_read_avg_outlier']
    client_read_outlier = row['client_read_avg_outlier']

    both_slow = ('ACCEPT', 'Both parties respond slowly')  # Need improvement in the future

    # Lawyer wrote last: probably the client did not reply to the last message.
    if last_time_outlier and last_sender == 'LAWYER':
        return both_slow if lawyer_read_outlier else ('REJECT', 'Lawyer responds promptly')

    # Client wrote last: probably the lawyer did not reply to the last message.
    if last_time_outlier and last_sender == 'CLIENT':
        if client_read_outlier:
            return both_slow
        return ('ACCEPT', 'Lawyer hasnt replied to clients last message')

    # Fallback: decide from the average response-time flags only.
    if client_read_outlier:
        return both_slow if lawyer_read_outlier else ('REJECT', 'Client responds slowly')
    if lawyer_read_outlier:
        return ('ACCEPT', 'Lawyer responds slowly')
    return ('REJECT', 'Messages answered promptly')

def filter_and_select_columns(df_dur_sum_4):
    """Return an independent copy holding only the columns the decision rules need."""
    wanted = [
        'consultation_id',
        'client_read_avg_outlier',
        'lawyer_read_avg_outlier',
        'last_time_outlier',
        'last_sender',
    ]
    return df_dur_sum_4.loc[:, wanted].copy()

def create_final_output(df_dur_sum_4_filtered):
    """Apply ``apply_rules`` to every row and return the resulting recommendations.

    The decisions are collected row by row and attached as two explicit
    columns. The previous ``DataFrame.apply(..., result_type='expand')`` form
    returns a frame shaped like the input when there are no rows, which made
    the two-column assignment fail with
    ``ValueError: Columns must be same length as key`` on an empty summary.

    Args:
        df_dur_sum_4_filtered: Frame with ``consultation_id`` and the columns
            read by ``apply_rules``. Not modified.

    Returns:
        A new frame with columns ``consultation_id``, ``recommendation`` and
        ``reason`` (zero rows when the input is empty).
    """
    decisions = [apply_rules(row) for _, row in df_dur_sum_4_filtered.iterrows()]

    final_output_4 = df_dur_sum_4_filtered[['consultation_id']].copy()
    final_output_4['recommendation'] = pd.Series(
        [recommendation for recommendation, _ in decisions],
        index=final_output_4.index, dtype=object)
    final_output_4['reason'] = pd.Series(
        [reason for _, reason in decisions],
        index=final_output_4.index, dtype=object)
    return final_output_4

def merge_all_outputs(final_output_1, final_output_2, final_output_3, final_output_4, final_output_5):
    """Stack the five partial outputs into one frame ordered by ``consultation_id``.

    The frames are concatenated with a fresh 0..n-1 index and then sorted by
    ``consultation_id`` (the index labels follow their rows).
    """
    parts = [final_output_1, final_output_2, final_output_3, final_output_4, final_output_5]
    stacked = pd.concat(parts, ignore_index=True)
    return stacked.sort_values('consultation_id')

def filter_consultations_without_bug(df_dur_sum_2, cons_id_with_neg_value):
    """Drop the rows whose ``consultation_id`` is listed in ``cons_id_with_neg_value``."""
    is_flagged = df_dur_sum_2['consultation_id'].isin(cons_id_with_neg_value['consultation_id'])
    return df_dur_sum_2[~is_flagged]


In [None]:
# Set path
# Default location of the service-account key file. Despite the variable name
# this is an absolute path, so os.path.abspath() leaves it unchanged.
relative_path = '/content/perqara-data-532572ce4996.json'
# Respect credentials that are already configured in the environment and only
# fall back to the hard-coded key file when none are set, so the notebook also
# runs where the key lives somewhere else.
file_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') or os.path.abspath(relative_path)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = file_path
storage_client = storage.Client()

In [None]:
#Pipeline

In [None]:
# 1. Get data from GCS
# Each table is downloaded once (lawyer_ratings.csv used to be fetched twice).
# Note the delimiter differs per file.
df_consultations = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/consultations', 'consultations.csv', delimiter='|')
df_rating = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/lawyer_ratings', 'lawyer_ratings.csv', delimiter=',')
df_lawyers = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/lawyers', 'lawyers.csv', delimiter='|')
df_chat = read_data_from_gcs('perqara-dendrobium', 'raw/mongodb/csv/m_chat_messages', 'm_chat_messages.csv', delimiter='|')
df_outlier_thres = read_data_from_gcs('perqara-dendrobium', 'lawyer-low-rating', 'outlier_thres_2024-05.csv', delimiter=',')

Read consultations.csv complete. Elapsed time: 2.26 seconds
Read lawyer_ratings.csv complete. Elapsed time: 0.32 seconds
Read lawyers.csv complete. Elapsed time: 0.17 seconds
Read lawyer_ratings.csv complete. Elapsed time: 0.28 seconds
Read m_chat_messages.csv complete. Elapsed time: 10.93 seconds
Read outlier_thres_2024-05.csv complete. Elapsed time: 0.11 seconds


In [None]:
# 2. Clean consultation and chat data
# Ids of internal test accounts that must be excluded.
lawyer_id_testing = [36, 38, 48, 120, 192, 195]
client_id_testing = [
    25, 42, 43, 44, 45, 49, 54, 56, 58, 62, 63, 66, 85, 146, 157, 295,
    959, 1609, 1655, 1821, 2421, 2526, 2995, 3180,
    4205, 5804, 5717, 5716, 7569, 7749, 7743,
]

# Consultations: drop test lawyers and clients, keep status 600, parse stop_time.
df_consultations_filtered = (
    df_consultations
    .pipe(remove_users, lawyer_id_testing, 'lawyer_id')
    .pipe(remove_users, client_id_testing, 'client_id')
    .pipe(filter_dataframe_by_column, 'status', 600)
    .pipe(change_datetime_cons, 'stop_time')
)

# Chat: convert both timestamp columns from UTC to Asia/Jakarta wall-clock time.
# NOTE: this reassigns df_chat from itself, so re-running the cell without
# reloading df_chat converts the timestamps again.
df_chat = change_datetime_chat(df_chat, 'sent_at')
df_chat = change_datetime_chat(df_chat, 'read_at')

In [None]:
# 3. Keep only the ratings that belong to a consultation kept in step 2
df_rating_filtered = filter_and_process_ratings(df_rating, df_consultations_filtered)

In [None]:
# 4. Get low ratings (rating below 4)
df_rating_filtered_rat_below_4 = filter_ratings_below_4(df_rating_filtered)

# Output 1: low ratings whose description is the blank placeholder -> REJECT / 'No Description'
final_output_1 = generate_final_output_1(df_rating_filtered_rat_below_4)


In [None]:
# 5. Sentiment analysis on the descriptions of the low ratings
# NOTE(review): every low rating is passed in here, including the rows whose
# description is the blank placeholder already covered by final_output_1.
result_df = add_sentiment_column(df_rating_filtered_rat_below_4)

# Output 2: low ratings with a 'Positive' sentiment -> REJECT / 'Positive Sentiment'
final_output_2 = generate_output_2(result_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/899 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/230k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/476k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment"] = sentiments
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['recommendation'] = result_df['sentiment'].apply(lambda x: 'REJECT' if x in ['Positive'] else 'empty')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result_df['reason'] = 'Positive Sentiment'


In [None]:
# 6. Keep the rating id and consultation id of every rating whose sentiment
# is not 'Positive' (i.e. the negative and neutral ones)
rating_id_neg_neu_sentiment = filter_neg_neu_sentiment(result_df)

In [None]:
# 7. Collect the chat messages of those consultations and attach each
# consultation's stop_time to its messages (left join on consultation_id)
df_chat_filt_neg_neu = filter_chat_messages_by_consultation_id(rating_id_neg_neu_sentiment, df_chat)
to_merge_consultation = extract_consultation_stop_time(rating_id_neg_neu_sentiment, df_consultations_filtered)
df_chat_final = merge_chat_messages_with_stop_time(df_chat_filt_neg_neu, to_merge_consultation)

In [None]:
# 8. Summarise each consultation's chat: message counts per sender, average
# gaps between turns, and seconds from the last message to stop_time
df_duration_summary = calculate_conversation_duration(df_chat_final)

0it [00:00, ?it/s]

df_result shape: (0, 8)





In [None]:
# Display the duration summary built in step 8 (empty in the recorded run)
df_duration_summary

Unnamed: 0,consultation_id,client_msg_cnt,lawyer_msg_cnt,system_msg_cnt,client_read_avg,lawyer_read_avg,last_time,last_sender


In [None]:
# 9. Split the duration summary by whether the rating description mentions slowness
# Consultations whose description contains one of ['lama', 'slow', 'lebih cepat', 'lambat', 'lemot']
pattern = create_regex_pattern(['lama', 'slow', 'lebih cepat', 'lambat', 'lemot'])
cons_id_with_slow_desc = filter_cons_id_contain_slow_desc(df_rating_filtered_rat_below_4, pattern)
df_duration_summary_with_slow = filter_duration_data_by_cons_id(df_duration_summary, cons_id_with_slow_desc)

# Output 3: of those, the rows with negative or missing duration values -> INVESTIGATE / timestamp issue
cons_id_with_bug_timestamps = identify_consultations_with_issues(df_duration_summary_with_slow)
final_output_3 = gen_output_for_issues(cons_id_with_bug_timestamps)

# Output "other": consultations whose description contains none of those words -> INVESTIGATE / not time related
cons_id_without_slow_desc = filter_cons_id_without_slow_desc(df_rating_filtered_rat_below_4, pattern)
df_duration_summary_without_slow = filter_duration_data_by_cons_id(df_duration_summary, cons_id_without_slow_desc)
final_output_other = gen_output_unrelated_time(df_duration_summary_without_slow)

In [None]:
# 10. Output 4: for slow-related consultations without timestamp issues, flag
# outliers against df_outlier_thres and apply the recommendation rules.
# NOTE(review): the recorded run of this cell ended with
# "ValueError: Columns must be same length as key"; the duration summary from
# step 8 was empty in that run.
df_duration_summary_without_bug = filter_consultations_without_bug(df_duration_summary_with_slow, cons_id_with_bug_timestamps)
df_duration_summary_outlier = add_outlier_columns(df_duration_summary_without_bug)
final_output_4 = create_final_output(df_duration_summary_outlier)

ValueError: Columns must be same length as key

In [None]:
# 11. Concatenate all partial outputs into one frame sorted by consultation_id
merged_df_final = merge_all_outputs(final_output_1, final_output_2, final_output_3, final_output_4, final_output_other)