In [1]:
import os
import time
import pandas as pd
import numpy as np
from google.cloud import storage
from io import StringIO

In [2]:
# Get Credentials
relative_path = '/content/perqara-data-532572ce4996.json'
file_path = os.path.abspath(relative_path)
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = file_path
storage_client = storage.Client()

In [3]:
def read_data_from_gcs(bucket_name, folder, filename, delimiter=','):
    start_time = time.time()  # Start measuring time
    blob = storage_client.get_bucket(bucket_name).blob(f'{folder}/{filename}')
    csv_data = blob.download_as_text()
    df = pd.read_csv(StringIO(csv_data), delimiter=delimiter, low_memory=False)
    elapsed_time = time.time() - start_time  # Calculate elapsed time
    print(f"Read {filename} complete. Elapsed time: {elapsed_time:.2f} seconds")
    return df

In [4]:
### Load CSV Files
availability_instants = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/availability_instants', 'availability_instants.csv', delimiter=',')
consultations = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/consultations', 'consultations.csv', delimiter='|')
institutions = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv', 'institutions.csv', delimiter=',')
lawyer_edus = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/lawyer_edus', 'lawyer_edus.csv', delimiter=',')
lawyer_ratings = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/lawyer_ratings', 'lawyer_ratings.csv', delimiter=',')
lawyers = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/lawyers', 'lawyers.csv', delimiter='|')
lawyer_skill_prices = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv', 'lawyer_skill_prices.csv', delimiter=',')
skills = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv', 'skills.csv', delimiter=',')
users = read_data_from_gcs('perqara-dendrobium', 'raw/postgres/csv/users', 'users.csv', delimiter=',')

Read availability_instants.csv complete. Elapsed time: 0.83 seconds
Read consultations.csv complete. Elapsed time: 2.60 seconds
Read institutions.csv complete. Elapsed time: 0.30 seconds
Read lawyer_edus.csv complete. Elapsed time: 0.14 seconds
Read lawyer_ratings.csv complete. Elapsed time: 0.22 seconds
Read lawyers.csv complete. Elapsed time: 0.18 seconds
Read lawyer_skill_prices.csv complete. Elapsed time: 0.14 seconds
Read skills.csv complete. Elapsed time: 0.29 seconds
Read users.csv complete. Elapsed time: 0.57 seconds


Unnamed: 0,id,lawyer_id,institution_id,degree,created_at,updated_at,deleted_at
0,1,17,177,S1,,,
1,2,33,167,S3,,,
2,3,15,148,S1,,,
3,5,48,3,S1,2023-04-07 10:48:25,2023-04-07 17:15:30,
4,6,48,64,S2,2023-04-07 10:48:25,2023-04-10 10:07:29,
...,...,...,...,...,...,...,...
624,712,573,312,S2,2024-11-07 09:36:18,2024-11-07 09:36:18,
625,713,665,92,S1,2024-11-15 08:56:45,2024-11-15 08:56:45,
626,714,604,337,S1,2025-01-24 14:13:24,2025-01-24 14:13:24,
627,715,604,83,S2,2025-01-24 14:13:24,2025-01-24 14:13:24,


In [None]:
# Education Score: Parse degree level
def get_degree_level(degree):
    if pd.isna(degree):
        return 0
    degree = degree.lower()
    if 'dr.' in degree or 'ph.d.' in degree or 'll.m.' in degree:
        return 20  # Gold: Advanced degrees
    elif 'm.h.' in degree or 'master' in degree:
        return 15  # Silver: Master's
    elif 's.h.' in degree or 'bachelor' in degree:
        return 10  # Bronze: Bachelor's
    return 5  # Default: Below tier requirements

# Experience Score: Years and case volume
def experience_score(year_exp, num_consultations):
    if year_exp >= 8 and num_consultations >= 200:
        return 20  # Gold
    elif 4 <= year_exp <= 7 and num_consultations >= 100:
        return 15  # Silver
    elif 1 <= year_exp <= 3 and num_consultations >= 50:
        return 10  # Bronze
    return 5  # Below minimum

# Industry Expertise Score: Proxy via number of skills
def industry_expertise_score(num_skills):
    if num_skills >= 5:  # Assuming significant experience with more skills
        return 20  # Gold
    elif 3 <= num_skills <= 4:
        return 15  # Silver
    elif 1 <= num_skills <= 2:
        return 10   # Bronze
    return 5   # Minimal expertise

# Client Reviews Score: Based on avg_rating (1-5 scale)
def client_reviews_score(avg_rating):
    if avg_rating >= 4.5:
        return 20  # Gold
    elif 4.0 <= avg_rating < 4.5:
        return 15  # Silver
    elif 1.0 <= avg_rating < 4.0:
        return 10  # Bronze
    return 5  # Below average

# Availability Score: Based on total hours available
def availability_score(total_hours):
    if total_hours >= 2000:  # High availability
        return 20  # Gold
    elif 900 <= total_hours < 2000:
        return 15  # Silver
    elif 1 <= total_hours < 900:
        return 10  # Bronze
    return 5   # Low availability

In [None]:
### Data Preprocessing

# 1. Highest Degree Level
lawyer_edus['degree_score'] = lawyer_edus['degree'].apply(get_degree_level)
highest_degree = lawyer_edus.groupby('lawyer_id')['degree_score'].max().reset_index()

# 2. Number of Consultations (assuming status=600 is completed)
consultations_count = consultations[consultations.notnull().any(axis=1)].groupby('lawyer_id').size().reset_index(name='num_consultations')

# 3. Number of Unique Skills
skills_count = lawyer_skill_prices.groupby('lawyer_id')['skill_id'].nunique().reset_index(name='num_skills')

# 4. Availability Hours
availability_instants['duration'] = (pd.to_datetime(availability_instants['end_datetime']) -
                                    pd.to_datetime(availability_instants['start_datetime'])).dt.total_seconds() / 3600
availability_hours = availability_instants.groupby('lawyer_id')['duration'].sum().reset_index(name='total_hours')

In [None]:
### Merge Data into Lawyers Dataframe
lawyers = lawyers.merge(highest_degree, left_on='id', right_on='lawyer_id', how='left', suffixes=('_left', '_right'))
# Drop the columns from the right DataFrame
lawyers = lawyers.drop(columns=[col for col in lawyers.columns if col.endswith('_right')])
# Optionally, rename columns to remove the '_left' suffix
lawyers = lawyers.rename(columns={col: col.replace('_left', '') for col in lawyers.columns if col.endswith('_left')})

lawyers = lawyers.merge(consultations_count, left_on='id', right_on='lawyer_id', how='left', suffixes=('_left', '_right'))
# Drop the columns from the right DataFrame
lawyers = lawyers.drop(columns=[col for col in lawyers.columns if col.endswith('_right')])
# Optionally, rename columns to remove the '_left' suffix
lawyers = lawyers.rename(columns={col: col.replace('_left', '') for col in lawyers.columns if col.endswith('_left')})

lawyers = lawyers.merge(skills_count, left_on='id', right_on='lawyer_id', how='left', suffixes=('_left', '_right'))
# Drop the columns from the right DataFrame
lawyers = lawyers.drop(columns=[col for col in lawyers.columns if col.endswith('_right')])
# Optionally, rename columns to remove the '_left' suffix
lawyers = lawyers.rename(columns={col: col.replace('_left', '') for col in lawyers.columns if col.endswith('_left')})

lawyers = lawyers.merge(availability_hours, left_on='id', right_on='lawyer_id', how='left', suffixes=('_left', '_right'))
# Drop the columns from the right DataFrame
lawyers = lawyers.drop(columns=[col for col in lawyers.columns if col.endswith('_right')])
# Optionally, rename columns to remove the '_left' suffix
lawyers = lawyers.rename(columns={col: col.replace('_left', '') for col in lawyers.columns if col.endswith('_left')})

# Perform the left join to add the 'email' column from the 'users' table
lawyers = pd.merge(lawyers, users, left_on="user_id", right_on="id", how="left", suffixes=('_left', '_right'))
# Drop the columns from the right DataFrame
lawyers = lawyers.drop(columns=[col for col in lawyers.columns if col.endswith('_right')])
# Optionally, rename columns to remove the '_left' suffix
lawyers = lawyers.rename(columns={col: col.replace('_left', '') for col in lawyers.columns if col.endswith('_left')})

In [None]:
lawyers['total_hours'].describe()

Unnamed: 0,total_hours
count,478.0
mean,452.318446
std,999.144181
min,0.000278
25%,12.309028
50%,68.555694
75%,313.195556
max,6622.476389


In [None]:
# Handle NaN Values
lawyers['degree_score'] = lawyers['degree_score'].fillna(5)  # Minimum score
lawyers['num_consultations'] = lawyers['num_consultations'].fillna(0)
lawyers['year_exp'] = lawyers['year_exp'].fillna(0)
lawyers['avg_rating'] = lawyers['avg_rating'].fillna(3.0)  # Neutral default
lawyers['num_skills'] = lawyers['num_skills'].fillna(0)
lawyers['total_hours'] = lawyers['total_hours'].fillna(0)

### Calculate Scores
lawyers['education_score'] = lawyers['degree_score']
lawyers['experience_score'] = lawyers.apply(lambda row: experience_score(row['year_exp'], row['num_consultations']), axis=1)
lawyers['expertise_score'] = lawyers['num_skills'].apply(industry_expertise_score)
lawyers['reviews_score'] = lawyers['avg_rating'].apply(client_reviews_score)
lawyers['availability_score'] = lawyers['total_hours'].apply(availability_score)

# Total Score
lawyers['total_score'] = (lawyers['education_score'] +
                          lawyers['experience_score'] +
                          lawyers['expertise_score'] +
                          lawyers['reviews_score'] +
                          lawyers['availability_score'])

In [None]:
### Assign Tiers
def assign_tier(total_score):
    if total_score >= 75:
        return 'GOLD'
    elif total_score >= 60:
        return 'SILVER'
    elif total_score >= 30:
        return 'BRONZE'
    return 'Bronze'  # Default for scores < 30

lawyers['subscription_tier'] = lawyers['total_score'].apply(assign_tier)

### Generate Lawyer Name from Slug
def slug_to_name(slug):
    if pd.isna(slug):
        return "Unknown"
    return slug.replace('-', ' ').title()

lawyers['lawyer_name'] = lawyers['slug'].apply(slug_to_name)

### Final Output
result = lawyers[['email', 'subscription_tier']]

# Display or Save Result
result.head()
# result.to_csv('lawyer_tiers.csv', index=False)

Unnamed: 0,email,subscription_tier
0,o.bagaspriambodo@gmail.com,SILVER
1,asenatama@gmail.com,BRONZE
2,james_pangaribuan26@yahoo.com,BRONZE
3,nurul_firdausi36@yahoo.com,BRONZE
4,argo1407@gmail.com,BRONZE


In [None]:
result.loc[result['subscription_tier'] == "GOLD"]["email"]

Unnamed: 0,email
31,hasibuan.terry@gmail.com
39,radendestayomadanpartner@gmail.com
45,dwikurniawan535@gmail.com
55,ricky.rachmatyuniardi@gmail.com
73,rudiadvokatre@gmail.com
75,alpizsbadi3651@gmail.com
127,gwlaw131020@gmail.com
154,ilham.dasari.putra@gmail.com
216,lawyerjusticeindonesia@gmail.com
249,boerhanhabibi@gmail.com


In [None]:
df_grouped = result.groupby('subscription_tier')

# print the size of each group
print(df_grouped.size())

subscription_tier
BRONZE    307
GOLD       34
SILVER    235
dtype: int64


In [None]:
# If you want to save to CSV:
result.to_csv("lawyer_subscription_package.csv", index=False)