In [2]:
from rapidfuzz import fuzz, process
from google.cloud import bigquery
from datetime import datetime
import pandas as pd
import numpy as np
import re

def read_bq():
    """Fetch the whole ``criminal_data_gemini`` table from BigQuery.

    Returns:
        The result of ``to_dataframe()`` on the query job, or ``None`` when
        running the query or materialising the result raises (the error is
        printed rather than propagated).
    """
    bq_client = bigquery.Client("intern-project-415606")

    sql = f"""
        SELECT *
        FROM `intern-project-415606.Criminal_Dataset.criminal_data_gemini`
    """

    try:
        return bq_client.query(sql).to_dataframe()
    except Exception as err:
        # Best-effort read: report the failure and hand back None.
        print(f"Error: {err}")
        return None

def convert_to_standard_date(date_str):
    """Normalise a free-form date value to an ISO-like string.

    Recognised inputs, tried in this order:

    * text starting with ``yyyy-mm-dd``            -> returned unchanged
    * text starting with ``d tháng m năm yyyy``    -> ``yyyy-mm-dd``
    * ``dd/mm/yyyy``                               -> ``yyyy-mm-dd``
    * ``yyyy-mm``                                  -> ``yyyy-mm`` (no day is added)
    * ``yyyy``                                     -> ``yyyy``
    * ``no`` / ``n/a`` / ``null`` (any case)       -> ``'No information'``

    Args:
        date_str: Raw value. ``None`` yields ``None``; any other non-string
            value is converted with ``str()`` before parsing.

    Returns:
        The normalised string, ``'No information'`` for an explicit
        placeholder, or ``None`` when the value cannot be interpreted.
    """
    if date_str is None:
        return None
    if not isinstance(date_str, str):
        date_str = str(date_str)

    # Already ISO formatted (only the prefix is checked).
    if re.match(r'\d{4}-\d{2}-\d{2}', date_str):
        return date_str

    # Vietnamese long form "dd tháng mm năm yyyy".
    viet_date_match = re.match(r'(\d{1,2}) tháng (\d{1,2}) năm (\d{4})', date_str)
    if viet_date_match:
        day, month, year = viet_date_match.groups()
        return f"{year}-{month.zfill(2)}-{day.zfill(2)}"

    # strptime-based formats: (input format, output format).
    known_formats = (
        ('%d/%m/%Y', '%Y-%m-%d'),  # dd/mm/yyyy
        ('%Y-%m', '%Y-%m'),        # year and month only
        ('%Y', '%Y'),              # year only
    )
    for input_format, output_format in known_formats:
        try:
            return datetime.strptime(date_str, input_format).strftime(output_format)
        except ValueError:
            continue

    # Explicit "no data" markers. The method must be *called* and the
    # candidates must be lower-case for this comparison to ever succeed.
    if date_str.strip().lower() in ('no', 'n/a', 'null'):
        return 'No information'

    # Anything else is treated as unparseable.
    return None

# Define the mask to filter out specific strings
def filter_specific_strings(df):
    """Drop rows whose sentencing columns hold nothing but placeholder text.

    A row is removed only when every one of ``crime``, ``jail``,
    ``jail_duration``, ``fine``, ``fine_total`` and ``other_punishment``
    equals one of the placeholder strings listed below; a row with at least
    one other value in those columns is kept.

    Args:
        df: DataFrame containing the six columns named above.

    Returns:
        The rows of ``df`` that pass the filter.
    """
    checked_columns = ['crime', 'jail', 'jail_duration', 'fine', 'fine_total', 'other_punishment']
    placeholders = [
        'no', 'N/A', 'NO', 'No', 'No Action Taken', 'no information',
        'No information', 'null', 'None', 'No Crime', 'No crime',
        'No criminal charge', 'No criminal record', 'Not specified',
    ]

    is_placeholder = df[checked_columns].isin(placeholders)
    row_is_all_placeholder = is_placeholder.all(axis=1)
    return df[~row_is_all_placeholder]

# Function to convert jail duration to months
def convert_to_months(duration):
    """Convert a free-text jail term (English or Vietnamese) into months.

    The text is lower-cased, the supported Vietnamese number words are turned
    into digits and the Vietnamese units ``năm`` / ``tháng`` into
    ``year`` / ``month``. Every number is then paired positionally with the
    unit at the same index (first number with first unit, and so on) and the
    pairs are summed.

    Args:
        duration: Raw duration text, e.g. ``"2 years 6 months"``,
            ``"03 năm"`` or ``"Chín tháng"``.

    Returns:
        * ``'Life Imprisonment'`` if the text mentions ``life`` or
          ``chung thân``;
        * ``'Death Sentence'`` if it mentions ``death``;
        * ``'Not Applicable'`` if it contains ``no`` or ``yes`` (substring
          match) or if ``duration`` is not a string;
        * otherwise the total number of months as an ``int`` (``0`` when no
          number/unit pair is found).
    """
    # Conversion rates
    year_to_month = 12
    month_to_month = 1

    # Missing / non-text cells carry no sentence information.
    if not isinstance(duration, str):
        return 'Not Applicable'

    text = duration.lower()

    # Vietnamese number words -> digits, so the numeric extraction below can
    # see them. Compound words come first so "mười hai" is not read as "hai".
    # Zero-padded numbers such as "03" need no special handling: int("03") == 3.
    number_words = (
        ('mười hai', '12'),
        ('chín', '9'),
        ('bảy', '7'),
        ('sáu', '6'),
        ('hai', '2'),
        ('một', '1'),
    )
    for word, digits in number_words:
        # Whole-word match only, to avoid rewriting fragments of other words.
        text = re.sub(rf'\b{word}\b', digits, text)

    # Vietnamese units -> English units.
    text = text.replace('năm', 'year').replace('tháng', 'month')

    # Special cases that are not expressible as a month count.
    if 'life' in text or 'chung thân' in text:
        return 'Life Imprisonment'
    if 'death' in text:
        return 'Death Sentence'
    if 'no' in text or 'yes' in text:
        return 'Not Applicable'

    # Extract numbers and units, then pair them by position.
    numbers = re.findall(r'\d+', text)
    units = re.findall(r'year|month', text)

    total_months = 0
    for number, unit in zip(numbers, units):
        rate = year_to_month if unit == 'year' else month_to_month
        total_months += int(number) * rate

    return total_months

# Function to standardize monetary amounts
def standardize_amount(amount):
    """Convert a free-text monetary amount into a whole number.

    Handled shapes, in order:

    1. ``"<n> VND/month for <m> months"`` -> ``n * m``
    2. a range ``"<low> - <high>"`` / ``"<low> to <high>"`` /
       ``"<low> đến <high>"`` (an optional ``đồng``/``VND``/``VNĐ`` may follow
       ``<low>``) -> rounded midpoint of the two bounds
    3. anything else -> all digits in the text concatenated

    ``.`` and ``,`` are always treated as thousands separators and dropped;
    no fractional part is ever produced.

    Args:
        amount: Raw amount text. Non-string values are rounded when
            possible (e.g. an ``int``/``float`` cell) and otherwise give 0.

    Returns:
        The amount as an integer, or ``0`` when no number can be extracted.
    """
    def _digits_to_int(text):
        # Keep digits only; an empty or unparseable result counts as 0.
        try:
            return int(re.sub(r'\D', '', text))
        except ValueError:
            return 0

    if not isinstance(amount, str):
        try:
            return round(amount)
        except (TypeError, ValueError, OverflowError):
            # None, NaN, infinities and other non-numeric objects.
            return 0

    # Monthly payments: multiply the instalment by the number of months.
    monthly_payment_match = re.search(
        r'(\d+([.,]\d+)*)\s*VND/month\s*for\s*(\d+)\s*months', amount, re.IGNORECASE)
    if monthly_payment_match:
        months = int(monthly_payment_match.group(3))
        return _digits_to_int(monthly_payment_match.group(1)) * months

    # Ranges (e.g. "10.000.000-15.000.000"): take the midpoint. The separator
    # is an explicit alternation; a bare space between digit groups must NOT
    # count as a range, otherwise "10 000 000" is read as "10 to 000".
    range_match = re.search(
        r'(\d[\d.,]*)\s*(?:đồng|vnd|vnđ)?\s*(?:-|–|to|đến)\s*(\d[\d.,]*)',
        amount, re.IGNORECASE)
    if range_match:
        low_amount = _digits_to_int(range_match.group(1))
        high_amount = _digits_to_int(range_match.group(2))
        return round((low_amount + high_amount) / 2)

    # Plain amount. int() replaces the previous eval(): same result for
    # ordinary digit strings, no code execution on external data, and
    # leading zeros ("0500") no longer collapse to 0.
    return _digits_to_int(amount)

# Main function to process the DataFrame
def process_dataframe(df):
    """Standardise the raw table and drop rows without sentencing data.

    Builds a new DataFrame in which:

    * ``birthdate`` is replaced by ``convert_to_standard_date`` applied to it;
    * ``standardized_jail_duration`` is ``convert_to_months`` applied to
      ``jail_duration``;
    * ``standardized_fine_total`` is ``standardize_amount`` applied to
      ``fine_total``;

    and then passes it through ``filter_specific_strings``.

    Args:
        df: DataFrame with at least ``birthdate``, ``jail_duration`` and
            ``fine_total`` plus the columns ``filter_specific_strings`` reads.
            It is not modified.

    Returns:
        The standardised, filtered DataFrame.
    """
    # assign() returns a new frame, so the caller's DataFrame is left intact
    # (the previous df.loc[:, ...] assignments wrote into the argument).
    standardized = df.assign(
        birthdate=df['birthdate'].apply(convert_to_standard_date),
        standardized_jail_duration=df['jail_duration'].apply(convert_to_months),
        standardized_fine_total=df['fine_total'].apply(standardize_amount),
    )
    return filter_specific_strings(standardized)

def calculate_similarity(df, person):
    """Return the five rows of ``df`` most similar to ``person``.

    The score of a row is ``0.7 * fuzz.partial_ratio(name)`` plus
    ``0.3 * fuzz.ratio(birthdate)``, where birthdates are compared as strings.

    Args:
        df: DataFrame with ``name`` and ``birthdate`` columns. It is not
            modified.
        person: Mapping with ``name`` and ``birthdate`` keys.

    Returns:
        A new DataFrame holding up to five rows of ``df`` with the highest
        score, including the score in a ``total_similarity`` column.

    Raises:
        KeyError: if ``person`` lacks ``name`` or ``birthdate``.
    """
    name_weight = 0.7
    birthdate_weight = 0.3

    target_name = person['name']
    target_birthdate = str(person['birthdate'])

    name_scores = df['name'].apply(
        lambda candidate: fuzz.partial_ratio(target_name, candidate))
    birthdate_scores = df['birthdate'].apply(
        lambda candidate: fuzz.ratio(target_birthdate, str(candidate)))

    # astype(float) keeps the column numeric even for an empty frame, where
    # apply() yields an object-dtype Series that nlargest() would reject.
    total_similarity = (
        name_scores * name_weight + birthdate_scores * birthdate_weight
    ).astype(float)

    # assign() attaches the score to a copy instead of writing a new column
    # into the caller's (possibly sliced) DataFrame.
    scored = df.assign(total_similarity=total_similarity)

    # Retrieve the top 5 most similar rows
    return scored.nlargest(5, 'total_similarity')


In [3]:

import json
def process_criminal_data(request):
    """HTTP Cloud Function for processing criminal data.

    Expects a JSON body of the form ``{"person": {...}}``; the ``person``
    value is handed to ``calculate_similarity``.

    Args:
        request (flask.Request): The request object.

    Returns:
        On success, a pretty-printed JSON string (indent 4) with the matching
        rows as a list of records.
        On failure, a ``(body, status, headers)`` tuple whose body is a JSON
        string ``{"error": ...}``: status 400 when the request has no
        ``person`` entry, status 500 when ``read_bq`` returned ``None``.
    """
    json_headers = {'Content-Type': 'application/json'}

    request_json = request.get_json(silent=True)

    # Only a JSON object can carry the 'person' key; anything else is a
    # malformed request. json.dumps is used because jsonify is not imported.
    if not (isinstance(request_json, dict) and 'person' in request_json):
        return json.dumps({"error": "Missing 'person' data in request"}), 400, json_headers

    person = request_json['person']

    df = read_bq()
    if df is None:
        # read_bq reports query failures by returning None.
        return json.dumps({"error": "Failed to load criminal data"}), 500, json_headers

    df = process_dataframe(df)
    top_5_rows = calculate_similarity(df, person)

    # DataFrame -> JSON records -> Python objects -> pretty-printed JSON.
    result = top_5_rows.to_json(orient='records')
    return json.dumps(json.loads(result), indent=4)


In [4]:
# Precedence check: `in` binds tighter than `and`, so this is evaluated as
# 'a' and ('b' in 'ab'); it does not test whether 'a' occurs in 'ab'.
'a' and 'b' in 'ab'

True