## Column Transformations

In [1]:
pip install word2number

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datetime import datetime
import pandas as pd
import numpy as np
from word2number import w2n
import re

In [3]:
predictions=pd.read_csv("predictions.csv")

In [4]:
#Function to handle Age 
def extract_numeric_age(age_str):
    # Mapping of non-numeric age descriptors to a representative age
    age_descriptors = {'baby': 1, 'toddler': 2, 'children': 10, 'elderly': 70, 'middle-aged': 45,'teenager':13}
    numeric_ages = []    
    for part in age_str.lower().split(','):
        # Descriptor or month handling
        descriptor_handled = False
        for descriptor, age in age_descriptors.items():
            if descriptor in part:
                numeric_ages.append(age)
                descriptor_handled = True
        # Handle 'month-old' for fractional years if not already handled by descriptor
        if 'month-old' in part and not descriptor_handled:
            month_age = re.findall(r'\d+', part)
            if month_age:
                numeric_ages.append(round(int(month_age[0]) / 12.0,1))
            continue       
        # Extract numbers if part was not handled by descriptors or month-old logic
        if not descriptor_handled:
            try:
                age = w2n.word_to_num(part)
                numeric_ages.append(age)
            except ValueError:
                # Use regex to find numbers if word_to_num fails
                found_ages = re.findall(r'\d+', part)
                numeric_ages.extend([int(age) for age in found_ages])
    # Return appropriately based on the results
    return numeric_ages[0] if len(numeric_ages) == 1 else (numeric_ages if numeric_ages else 'na')

In [5]:
#Function to handle vehicle type
vehicle_mappings = {
    'Car': [ 'car', 'cars', 'suv', 'sedan', 'hatchback', 'toyota innova', 'hyundai creta','bmw', 'mercedes-benz', 'honda city', 'toyota qualis', 'swift dzire', 'jaguar car',
        'pcr van', 'mercedes', 'honda civic', 'innova car', 'maruti baleno', 'hyundai verna','fortuner', 'xuv', 'porsche', 'audi car', 'volkswagen polo', 'beetle car', 'baleno',
        'suzuki ertiga', 'renault car', 'ford', 'wagonr', 'tata nexon', 'bentley car','mercedes car', 'volvo car', 'range rover', 'bikes', 'mercedes-benz', 'swift car','biker', 
        'volkswagen', 'toyota sedan', 'innova-lorry', 'e-bus', 'honda', 'auto','luxury car', 'sports car', 'i10 car', 'toyota etios', 'ford mustang', 'ferrari car','eeco', 
        'hyundai grand', 'ford fiesta', 'jaguar', 'hyundai i10', 'honda civic','renault duster', 'toyota fortuner', 'mercedes benz', 'swift dzire'],
    'Two-Wheeler': ['bike', 'scooter', 'motorbike', 'bicycle', 'two-wheeler', 'scooty', 'biker', 'cyclist','motorcycle', 'bikes', 'minibus', 'minitruck', 'pickup', 'cycle', 'moped', 'bicycles','cyclists'],
    'Commercial Vehicle': ['truck', 'van', 'lorry', 'bus', 'goods carrier', 'tractor-trailer', 'mini-truck','pickup truck', 'container truck', 'dumper', 'tanker', 'canter', 'tempo', 'auto-rickshaw',
        'trolley', 'cargo truck', 'mini-bus', 'omnibus', 'cab', 'oil tanker', 'tanker truck','trailer truck', 'container-truck', 'pickup van', 'mini truck', 'tempo cruiser',
        'pickup', 'luxury bus', 'minibus', 'three wheeler', 'auto', 'rickshaw', 'e-rickshaw','autorickshaw', 'jeep', 'van', 'suv-borne', 'pickup truck', 'minivan', 'trailer',
        'bus-trailer', 'monobus', 'luxury bus', 'ambulance', 'police', 'land cruiser', 'defender','land rover', 'ambulance', 'fire truck', 'police van'],
    'Special Vehicle': ['tractor', 'jcb', 'road roller', 'earth mover', 'multi-utility vehicle', 'muv','multi-purpose vehicle', 'mpv', 'utility vehicle', 'erv', 'electric pole', 'earthmover',
        'crane', 'forklift', 'bulldozer', 'excavator', 'backhoe', 'dumper', 'loader']}
def categorize_vehicle_from_dict(vehicle_str):
    vehicle_str = vehicle_str.lower()
    categories = set()   
    # Extract vehicle terms from the string, handles lists and single entries
    vehicle_list = re.findall(r'[\w\s-]+', vehicle_str.replace('[', '').replace(']', ''))   
    # Identify categories for each vehicle term
    for v in vehicle_list:
        for category, keywords in vehicle_mappings.items():
            if any(keyword.lower() in v for keyword in keywords):
                categories.add(category)              
    return list(categories) if len(categories) > 1 else (categories.pop() if categories else 'na')


In [6]:
#Function to handle reason of accident
reason_mappings = {
    'Speed': ['speeding', 'speed', 'racing', 'joy'],
    'Alcohol': ['drunk', 'drunken', 'alcohol', 'drink'],
    'Collision': ['hit-and-run', 'collided', 'collision', 'mowing', 'mow', 'rammed'],
    'Visibility': ['fog', 'fogged', 'smog', 'light'],
    'Negligence': ['negligent', 'rash', 'reckless', 'dozed'],
    'Obstacle': ['tree', 'divider', 'rock', 'stones', 'brick'],
    'External influences': ['electric', 'electricity', 'drug', 'dementia', 'tyre', 'overloaded']
}
def categorize_accident_reason(reason_str):
    reason_str = reason_str.lower()
    categories = set()

    # Extract reason terms from the string, handling lists
    reason_list = re.findall(r'[\w-]+', reason_str.replace('[', '').replace(']', ''))
    
    # Identify categories for each reason term
    for reason in reason_list:
        for category, keywords in reason_mappings.items():
            if any(keyword in reason for keyword in keywords):
                categories.add(category)                
    return list(categories) if len(categories) > 1 else (categories.pop() if categories else 'na')


In [7]:
#Function to handle no of injured and fatalities
def text_to_single_number(text):
    try:
        return [w2n.word_to_num(item.strip()) for item in text.strip('[]').split(',')] if ',' in text or '[' in text else w2n.word_to_num(text)
    except ValueError:
        return 'na'

In [8]:
#Function to handle gender
male_keywords = {'man', 'men', 'boy', 'boys', 'son', 'sons', 'he', 'his', 'him', 'brother', 'brothers', 'father', 'grandfather', 'grandson', 'husband', 'businessman', 'businessmen', 'policeman', 'policemen', 'cameraman', 'male', 'nephew', 'uncle'}
female_keywords = {'woman', 'women', 'girl', 'girls', 'daughter', 'daughters', 'she', 'her', 'sister', 'sisters', 'mother', 'grandmother', 'granddaughter', 'wife', 'businesswoman', 'policewoman', 'female', 'niece', 'aunt', 'lady'}
def classify_gender(gender_str):
    tokens = set(re.split(r'[,\s\[\]]+', gender_str.lower()))
    identified_genders = []
    if any(token in male_keywords for token in tokens):
        identified_genders.append('Male')
    if any(token in female_keywords for token in tokens):
        identified_genders.append('Female')
    return identified_genders[0] if len(identified_genders) == 1 else identified_genders if identified_genders else 'na'

In [9]:
def categorize_time(time_str):
    # Keywords for categorizing times
    keywords = {
        'Morning': ['morning', 'am', 'dawn', 'predawn', 'early'],
        'Afternoon': ['afternoon', 'noon'],  # 'noon' generally considered part of afternoon
        'Evening': ['evening', 'late', 'dusk', 'sunset'],
        'Night': ['night', 'midnight', 'wee', 'late night']
    }
    # Split on commas, spaces, and brackets
    times = re.split(r'[\s,\[\]]+', time_str.lower())
    categories = set()
    for period, words in keywords.items():
        if any(word in time for time in times for word in words):
            categories.add(period)
    return categories.pop() if len(categories) == 1 else list(categories) if categories else 'na'

In [10]:
#Applying the funcitons to each columns
predictions['AGE'] = predictions['AGE'].apply(extract_numeric_age)

In [11]:
predictions['VEHICLE TYPE'] = predictions['VEHICLE TYPE'].apply(categorize_vehicle_from_dict)

In [12]:
predictions['REASON'] = predictions['REASON'].apply(categorize_accident_reason)

In [13]:
predictions['FATALITIES'] = predictions['FATALITIES'].apply(lambda x: text_to_single_number(x) if pd.notnull(x) else 'na')
predictions['INJURED'] = predictions['INJURED'].apply(lambda x: text_to_single_number(x) if pd.notnull(x) else 'na')

In [14]:
predictions['GENDER'] = predictions['GENDER'].apply(classify_gender)

In [15]:
predictions['TIME'] = predictions['TIME'].apply(categorize_time)

In [16]:
predictions.head(50)

Unnamed: 0.1,Unnamed: 0,ID,CONTENT,FIRST_LINE,AGE,VEHICLE TYPE,REASON,FATALITIES,INJURED,GENDER,TIME
0,0,93122413,NEW DELHI: Three women from a family were kill...,Three women from a family were killed while tw...,1,"[Car, Commercial Vehicle]",Speed,3,2,Female,na
1,1,93121445,NEW DELHI: Three women from a family were kill...,Three women from a family were killed while tw...,1,"[Car, Commercial Vehicle]",Speed,3,2,Female,na
2,2,102292814,SURAT: Even as the public anger against speed ...,Even as the public anger against speed demons ...,na,"[Car, Two-Wheeler]","[Speed, Alcohol]",na,5,Male,Night
3,3,93097664,CHENNAI: A speeding car mowed down two people ...,A speeding car mowed down two people boarding ...,na,"[Car, Commercial Vehicle]",Speed,na,2,na,na
4,4,99974120,Jaipur: A day after four persons were killed i...,A day after four persons were killed in a car ...,na,Car,na,4,na,na,na
5,5,94160142,PANAJI: Porvorim police on Monday registered a...,Porvorim police on Monday registered a case of...,na,Car,Negligence,na,na,na,na
6,6,92630463,UDUPI: One person died and another person is r...,One person died and another person is reported...,na,Car,na,1,na,na,Night
7,7,96099409,BELAGAVI: Two cars collided with each other le...,Two cars collided with each other leading to t...,na,Car,na,na,na,"[Male, Female]",Night
8,8,101003294,MUMBAI: The D B Marg police on Wednesday booke...,The D B Marg police on Wednesday booked a 17-y...,"[17, 13]","[Car, Two-Wheeler]",na,2,na,Male,Morning
9,9,101427314,Pune: A five-year-old girl riding pillion with...,A five-year-old girl riding pillion with her 3...,"[32, 5]","[Car, Two-Wheeler]",na,na,na,"[Male, Female]",na


In [17]:
print('*'*20)
print('Percentage Missing')
print('*'*20)
for column in ['AGE', 'VEHICLE TYPE',
       'REASON', 'FATALITIES', 'INJURED', 'GENDER', 'TIME']:
    print('Column ('+ column + ('-') * (20 - len(column)) + '>' + str(predictions[predictions[column] == 'na'].shape[0]*100/predictions.shape[0]))

********************
Percentage Missing
********************
Column (AGE----------------->72.78127183787561
Column (VEHICLE TYPE-------->24.24877707896576
Column (REASON-------------->77.30607966457023
Column (FATALITIES---------->49.807826694619145
Column (INJURED------------->68.16911250873515
Column (GENDER-------------->65.07686932215235
Column (TIME---------------->56.586303284416495
