# __init__

In [20]:
import os
import json
import time
import numpy as np
import pandas as pd
import ast
import re
from datetime import datetime
from word2number import w2n
from tqdm import tqdm

# Load data

In [21]:
# ------ read data from the llm output: Extraing informa news articles and storing in in string.
df_llm1 = pd.read_csv('data_extracted_llm_openai.csv')

# ------ read data from the llm output: classification on the first first llm pass.
df_llm2 = pd.read_csv('reasontimevehicle_classification.csv')

# ------ Filter out the results that we identified as issues at the initial clustering stage
df_clustered_list = pd.read_csv('id.csv')

# ------ read in the external auxilary information
df_external_details = pd.read_csv('transformed.csv')
# use only the columns needed
df_external_details = df_external_details[['id','first_line', 'place', 'link', 'news_date','latitude',
       'longitude', 'state', 'week_avg_weather', 'precipitation_3days']]



In [22]:
df_external_details

Unnamed: 0,id,first_line,place,link,news_date,latitude,longitude,state,week_avg_weather,precipitation_3days
0,107938160,Three people lost their lives on Friday in a c...,Sultanpur,https://timesofindia.indiatimes.com/city/allah...,23/02/24,26.334588,81.996789,Uttar Pradesh,18.672459,0.200000
1,107821301,Three people died in a car accident in Rajasth...,Jaipur,https://timesofindia.indiatimes.com/city/jaipu...,19/02/24,26.915458,75.818982,Rajasthan,18.577219,0.000000
2,107790964,Two women killed and five others were seriousl...,Raichur,https://timesofindia.indiatimes.com/city/benga...,18/02/24,16.083333,77.166667,Karnataka,29.642168,0.000000
3,107737101,Two students were injured after a speeding car...,New Delhi,https://timesofindia.indiatimes.com/city/delhi...,16/02/24,28.613895,77.209006,Delhi,18.050730,0.000000
4,107897709,"A newlywed software engineer , his father-in-l...",Hyderabad,https://timesofindia.indiatimes.com/city/hyder...,22/02/24,17.360589,78.474061,Telangana,26.445761,0.000000
...,...,...,...,...,...,...,...,...,...,...
6745,70438045,Three including two members of Jasdan Diamond ...,Rajkot,https://timesofindia.indiatimes.com/city/rajko...,30/07/19,22.305326,70.802838,Gujarat,27.055136,73.299995
6746,69529927,"Five members of a family, including a 14-year-...",Bengaluru,https://timesofindia.indiatimes.com/city/benga...,28/05/19,12.976794,77.590082,Karnataka,25.609011,5.900000
6747,69168530,Three members of a family were killed after th...,Ambala/Parwanoo,https://timesofindia.indiatimes.com/city/gurga...,04/05/19,,,,,
6748,69652686,"Three men, including the manager of a national...",Madurai,https://timesofindia.indiatimes.com/city/chenn...,04/06/19,9.926115,78.114098,Tamil Nadu,30.240919,30.000000


In [23]:
# Check
df_external_details.head(2)

Unnamed: 0,id,first_line,place,link,news_date,latitude,longitude,state,week_avg_weather,precipitation_3days
0,107938160,Three people lost their lives on Friday in a c...,Sultanpur,https://timesofindia.indiatimes.com/city/allah...,23/02/24,26.334588,81.996789,Uttar Pradesh,18.672459,0.2
1,107821301,Three people died in a car accident in Rajasth...,Jaipur,https://timesofindia.indiatimes.com/city/jaipu...,19/02/24,26.915458,75.818982,Rajasthan,18.577219,0.0


In [24]:
# Merge the dataframes 

df_llm1  = df_llm1[['id','content', 'JSON_String', 'JSON_Dict', 'fatalities',
                   'injured','victim_gender','names_ages']]

df_llm2 = df_llm2[['ID', 'time_of_day', 'reason', 'vehicles']]

merged_df = df_llm1.merge(df_llm2, left_on='id', right_on='ID').drop(columns=['ID'])

merged_clust_df = df_clustered_list.merge(merged_df, on = 'id', how = 'left')

merged_clust_df = merged_clust_df.merge(df_external_details, on = 'id', how = 'left')

df = merged_clust_df

In [25]:
# Define a dictionary for renaming columns so it confirms to next script.

new_column_names = {
    'victim_gender': 'gender',
    'names_ages': 'age',
    'vehicles': 'vehicle_type',
    'time_of_day': 'time'
}

# Rename columns
df.rename(columns=new_column_names, inplace=True)

In [26]:
# validate the reaname is completed , else this will throw an error

df = df[['id','first_line','place', 'link', 'content', 'news_date', 'latitude', 'longitude', 'state', 'week_avg_weather', 'precipitation_3days', 'age', 'vehicle_type', 'reason', 'fatalities', 'injured', 'gender', 'time']]

In [27]:
df.head()

Unnamed: 0,id,first_line,place,link,content,news_date,latitude,longitude,state,week_avg_weather,precipitation_3days,age,vehicle_type,reason,fatalities,injured,gender,time
0,107938160,Three people lost their lives on Friday in a c...,Sultanpur,https://timesofindia.indiatimes.com/city/allah...,SULTANPUR : Three people lost their lives on F...,23/02/24,26.334588,81.996789,Uttar Pradesh,18.672459,0.2,"[('Chinta Devi', 51), ('Ram Chandra Gupta', 55...",['Car'],Negligence,3,1,"['Male', 'Female']",
1,107821301,Three people died in a car accident in Rajasth...,Jaipur,https://timesofindia.indiatimes.com/city/jaipu...,JAIPUR: Three people died in a car accident in...,19/02/24,26.915458,75.818982,Rajasthan,18.577219,0.0,"[('Ravinder Kumar', 'na'), ('Subhash Kumar', '...","['Car', 'Commercial Vehicle']",Collision,3,0,['Male'],night
2,107790964,Two women killed and five others were seriousl...,Raichur,https://timesofindia.indiatimes.com/city/benga...,RAICHUR: Two women killed and five others were...,18/02/24,16.083333,77.166667,Karnataka,29.642168,0.0,[],"['Car', 'Commercial Vehicle']",External influences,2,5,['Female'],morning
3,107737101,Two students were injured after a speeding car...,New Delhi,https://timesofindia.indiatimes.com/city/delhi...,New Delhi: Two students were injured after a s...,16/02/24,28.613895,77.209006,Delhi,18.05073,0.0,"[('na', 'early 20s'), ('na', 'early 20s')]",['Car'],Speed,0,2,"['Male', 'Female']",morning
4,107897709,"A newlywed software engineer , his father-in-l...",Hyderabad,https://timesofindia.indiatimes.com/city/hyder...,"Hyderabad: A newlywed software engineer , his ...",22/02/24,17.360589,78.474061,Telangana,26.445761,0.0,"[('Pavan Sai', 27), ('Namburi Anusha', 26), ('...",['Car'],External influences,3,1,"['Male', 'Female']",night


# Functions

In [28]:
# Function to handle number of injured and fatalities when there's only a single entry
def text_to_single_number(text):
    try:
        # Parse the text to extract numbers
        numbers = [w2n.word_to_num(item.strip()) if '.' not in item else float(item.strip()) 
                   for item in text.strip('[]').split(',') if item.strip()]
        # Return the number if there's exactly one, or np.nan otherwise
        return numbers[0] if len(numbers) == 1 else np.nan
    except ValueError:
        return np.nan

In [29]:
#Function to handle reason of accident
reason_mappings = {
    'Speed': ['speeding', 'speed', 'racing', 'joy'],
    'Driving Under Influence': ['drunk', 'drunken', 'alcohol', 'drink', 'drug'],
    'Collision': ['hit-and-run', 'collided', 'collision', 'mowing', 'mow', 'rammed','hit','accident'],
    'Visibility': ['fog', 'fogged', 'smog', 'light','rain'],
    'Negligence': ['negligence','skid','wrong side','negligent', 'rash', 'reckless', 'dozed','abrupt','control','rear-ended','sleep','crossing','jaywalking'],
    'External influences': ['tree','cattle','divider', 'rock', 'stones', 'brick','electric', 'electricity', 'dementia', 'tyre', 'overloaded']
}
# Function to handle reason of accident with hierarchy
def categorize_accident_reason(reason_str):
    if pd.isnull(reason_str):
        return np.nan

    reason_str = reason_str.lower()
    categories = set()
    
    # Extract reason terms from the string, handling lists
    reason_list = re.findall(r'[\w-]+', reason_str.replace('[', '').replace(']', ''))
    
    # Reason hierarchy dictionary (lower number means higher priority)
    reason_hierarchy = {
        'Driving Under Influence': 1,
        'Speed': 2,
        'Negligence': 3,
        'Collision': 4,
        'Visibility': 5,
        'External influences': 6
    }
    
    # Identify categories for each reason term
    min_priority = float('inf')
    selected_category = np.nan
    
    for reason in reason_list:
        for category, keywords in reason_mappings.items():
            if any(keyword in reason for keyword in keywords):
                current_priority = reason_hierarchy.get(category, float('inf'))
                # Check if the current category has higher priority
                if current_priority < min_priority:
                    min_priority = current_priority
                    selected_category = category
    return selected_category


# # Reapply the function to the 'REASON' column
# predictions['REASON'] = predictions['REASON'].apply(categorize_accident_reason)

In [30]:
#Function to handle gender
male_keywords = {'man', 'men', 'boy', 'boys', 'son', 'sons', 'he', 'his', 'him', 'brother', 'brothers', 'father', 'grandfather', 'grandson', 'husband', 'businessman', 'businessmen', 'policeman', 'policemen', 'cameraman', 'male', 'nephew', 'uncle'}
female_keywords = {'woman', 'women', 'girl', 'girls', 'daughter', 'daughters', 'she', 'her', 'sister', 'sisters', 'mother', 'grandmother', 'granddaughter', 'wife', 'businesswoman', 'policewoman', 'female', 'niece', 'aunt', 'lady'}
def classify_gender(gender_str):
    gender_str = re.sub(r'[\[\]\']', '', gender_str)
    
    
    if pd.isnull(gender_str):
        return np.nan

    tokens = set(re.split(r'[,\s\[\]]+', gender_str.lower()))
    identified_genders = []
    if any(token in male_keywords for token in tokens):
        identified_genders.append('Male')
    if any(token in female_keywords for token in tokens):
        identified_genders.append('Female')
    return identified_genders[0] if len(identified_genders) == 1 else identified_genders if identified_genders else np.nan

In [39]:
 def extract_numbers(input_string):
    numbers = re.findall(r'\d+', input_string)
    numbers = [int(num) if int(num) != 0 else 1 for num in numbers]
    numbers = [num for num in numbers if num <= 100]
    if len(numbers) == 0:
        return np.nan
    else:
        return numbers

In [32]:
import re

def extract_time(input_data):
    # Regular expression pattern to match morning, afternoon, evening, or night
    time_pattern = r'(morning|afternoon|evening|night)'
    
    # If input_data is a list
    if isinstance(input_data, list):
        # Iterate through the list
        for item in input_data:
            # If the item is not 'NA', try to find a match
            if item != 'NA':
                match = re.search(time_pattern, item)
                # If a match is found, return it
                if match:
                    return match.group()
    # If input_data is a string
    elif isinstance(input_data, str):
        # Find all matches in the input string
        matches = re.findall(time_pattern, input_data)
        # If matches are found, return the first match, else return None
        if matches:
            return matches[0]
    
    # If no match is found, return None
    return None

In [33]:
def extract_unique_vehicles(input_string):
    allowed_words = ['Two-Wheeler', 'Special Vehicle', 'Commercial Vehicle', 'Car']
    # Remove brackets and single quotes, split the string into words
    words = input_string.strip("[]").replace("'", "").split(", ")
    # Extract unique words
    unique_words = list(set(words))
    # Filter out words not in the allowed list
    filtered_words = [word for word in unique_words if word in allowed_words]
    
    if len(filtered_words) == 0:
        return np.nan
    else:
        return filtered_words

In [34]:
def extract_unique_reasons(input_string):
    allowed_words = ['Negligence', 'Collision', 'External influences', 'Speed', 'Driving Under Influence']
    # Remove brackets and single quotes, split the string into words
    words = input_string.strip("[]").replace("'", "").split(", ")
    # Extract unique words
    unique_words = list(set(words))
    # Filter out words not in the allowed list
    filtered_words = [word for word in unique_words if word in allowed_words]
    
    if len(filtered_words) == 0:
        return np.nan
    else:
        return filtered_words

# Preprocess

In [35]:
df['fatalities'] = df['fatalities'].astype(str).apply(text_to_single_number)
df['injured'] = df['injured'].astype(str).apply(text_to_single_number)
df['gender'] = df['gender'].astype(str).apply(classify_gender)
df['age'] = df['age'].astype(str).apply(extract_numbers)
df['time'] = df['time'].astype(str).apply(extract_time)
df['vehicle_type'] = df['vehicle_type'].astype(str).apply(extract_unique_vehicles)
df['reason'] = df['reason'].astype(str).apply(categorize_accident_reason)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['fatalities'] = df['fatalities'].astype(str).apply(text_to_single_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['injured'] = df['injured'].astype(str).apply(text_to_single_number)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['gender'] = df['gender'].astype(str).apply(classify_ge

# Save to CSV

In [36]:
df.head(3)

Unnamed: 0,id,first_line,place,link,content,news_date,latitude,longitude,state,week_avg_weather,precipitation_3days,age,vehicle_type,reason,fatalities,injured,gender,time
0,107938160,Three people lost their lives on Friday in a c...,Sultanpur,https://timesofindia.indiatimes.com/city/allah...,SULTANPUR : Three people lost their lives on F...,23/02/24,26.334588,81.996789,Uttar Pradesh,18.672459,0.2,"[51, 55, 52, 30]",[Car],Negligence,3.0,1.0,"[Male, Female]",
1,107821301,Three people died in a car accident in Rajasth...,Jaipur,https://timesofindia.indiatimes.com/city/jaipu...,JAIPUR: Three people died in a car accident in...,19/02/24,26.915458,75.818982,Rajasthan,18.577219,0.0,,"[Commercial Vehicle, Car]",Collision,3.0,0.0,Male,night
2,107790964,Two women killed and five others were seriousl...,Raichur,https://timesofindia.indiatimes.com/city/benga...,RAICHUR: Two women killed and five others were...,18/02/24,16.083333,77.166667,Karnataka,29.642168,0.0,,"[Commercial Vehicle, Car]",,2.0,5.0,Female,morning


In [38]:
#  the file data_extracted_transformed_openai.csv will hold the final dataset

df.to_csv('data_extracted_transformed_openai.csv')