## Column Transformations

In [100]:
pip install word2number

Note: you may need to restart the kernel to use updated packages.


In [101]:
from datetime import datetime
import pandas as pd
import numpy as np
from word2number import w2n
import re
import numpy as np

In [102]:
predictions=pd.read_csv("predicted.csv")

In [103]:
predictions.head()

Unnamed: 0,ID,CONTENT,FIRST_LINE,AGE,VEHICLE TYPE,REASON,FATALITIES,INJURED,GENDER,TIME
0,107938160,SULTANPUR : Three people lost their lives on F...,Three people lost their lives on Friday in a c...,,car,,Three,,,
1,107821301,JAIPUR: Three people died in a car accident in...,Three people died in a car accident in Rajasth...,,,,Three,,,
2,107790964,RAICHUR: Two women killed and five others were...,Two women killed and five others were seriousl...,,car-bus,collision,Two,five,,morning
3,107737101,New Delhi: Two students were injured after a s...,Two students were injured after a speeding car...,,,speeding,,Two,,
4,107897709,"Hyderabad: A newlywed software engineer , his ...","A newlywed software engineer , his father-in-l...",,,veered,,,father-in-law,wee


In [104]:
def extract_numeric_age(age_str):
    if pd.isnull(age_str):
        return np.nan

    # Mapping of non-numeric age descriptors to a representative age
    age_descriptors = {'baby': 1, 'toddler': 3, 'children': 10, 'elderly': 70, 'middle-aged': 45, 'teenager': 13}
    numeric_ages = []    
    for part in age_str.lower().split(','):
        # Descriptor or month handling
        descriptor_handled = False
        for descriptor, age in age_descriptors.items():
            if descriptor in part:
                numeric_ages.append(age)
                descriptor_handled = True
        # Handle 'month-old' for fractional years if not already handled by descriptor
        if 'month-old' in part and not descriptor_handled:
            month_age = re.findall(r'\d+', part)
            if month_age:
                numeric_ages.append(round(int(month_age[0]) / 12.0,1))
            continue       
        # Extract numbers if part was not handled by descriptors or month-old logic
        if not descriptor_handled:
            try:
                age = w2n.word_to_num(part)
                numeric_ages.append(age)
            except ValueError:
                # Use regex to find numbers if word_to_num fails
                found_ages = re.findall(r'\d+', part)
                numeric_ages.extend([int(age) for age in found_ages])
    # Return appropriately based on the results
    return numeric_ages[0] if len(numeric_ages) == 1 else (numeric_ages if numeric_ages else np.nan)

In [42]:
#Function to handle vehicle type
vehicle_mappings = {
    'Car': [ 'car', 'cars', 'suv', 'sedan', 'hatchback', 'toyota innova', 'hyundai creta','bmw', 'mercedes-benz', 'honda city', 'toyota qualis', 'swift dzire', 'jaguar car',
        'pcr van', 'mercedes', 'honda civic', 'innova car', 'maruti baleno', 'hyundai verna','fortuner', 'xuv', 'porsche', 'audi car', 'volkswagen polo', 'beetle car', 'baleno',
        'suzuki ertiga', 'renault car', 'ford', 'wagonr', 'tata nexon', 'bentley car','mercedes car', 'volvo car', 'range rover', 'bikes', 'mercedes-benz', 'swift car','biker', 
        'volkswagen', 'toyota sedan', 'innova-lorry', 'e-bus', 'honda', 'auto','luxury car', 'sports car', 'i10 car', 'toyota etios', 'ford mustang', 'ferrari car','eeco', 
        'hyundai grand', 'ford fiesta', 'jaguar', 'hyundai i10', 'honda civic','renault duster', 'toyota fortuner', 'mercedes benz', 'swift dzire'],
    'Two-Wheeler': ['bike', 'scooter', 'motorbike', 'bicycle', 'two-wheeler', 'scooty', 'biker', 'cyclist','motorcycle', 'bikes', 'minibus', 'minitruck', 'pickup', 'cycle', 'moped', 'bicycles','cyclists'],
    'Commercial Vehicle': ['truck', 'van', 'lorry', 'bus', 'goods carrier', 'tractor-trailer', 'mini-truck','pickup truck', 'container truck', 'dumper', 'tanker', 'canter', 'tempo', 'auto-rickshaw',
        'trolley', 'cargo truck', 'mini-bus', 'omnibus', 'cab', 'oil tanker', 'tanker truck','trailer truck', 'container-truck', 'pickup van', 'mini truck', 'tempo cruiser',
        'pickup', 'luxury bus', 'minibus', 'three wheeler', 'auto', 'rickshaw', 'e-rickshaw','autorickshaw', 'jeep', 'van', 'suv-borne', 'pickup truck', 'minivan', 'trailer',
        'bus-trailer', 'monobus', 'luxury bus', 'ambulance', 'police', 'land cruiser', 'defender','land rover', 'ambulance', 'fire truck', 'police van'],
    'Special Vehicle': ['tractor', 'jcb', 'road roller', 'earth mover', 'multi-utility vehicle', 'muv','multi-purpose vehicle', 'mpv', 'utility vehicle', 'erv', 'electric pole', 'earthmover',
        'crane', 'forklift', 'bulldozer', 'excavator', 'backhoe', 'dumper', 'loader']}
def categorize_vehicle_from_dict(vehicle_str):
    if pd.isnull(vehicle_str):
        return np.nan

    vehicle_str = vehicle_str.lower()
    categories = set()   
    # Extract vehicle terms from the string, handles lists and single entries
    vehicle_list = re.findall(r'[\w\s-]+', vehicle_str.replace('[', '').replace(']', ''))   
    # Identify categories for each vehicle term
    for v in vehicle_list:
        for category, keywords in vehicle_mappings.items():
            if any(keyword.lower() in v for keyword in keywords):
                categories.add(category)              
    return list(categories) if len(categories) > 1 else (categories.pop() if categories else np.nan)


In [29]:
#Function to handle reason of accident
reason_mappings = {
    'Speed': ['speeding', 'speed', 'racing', 'joy'],
    'Driving Under Influence': ['drunk', 'drunken', 'alcohol', 'drink', 'drug'],
    'Collision': ['hit-and-run', 'collided', 'collision', 'mowing', 'mow', 'rammed'],
    'Visibility': ['fog', 'fogged', 'smog', 'light'],
    'Negligence': ['negligent', 'rash', 'reckless', 'dozed'],
    'External influences': ['tree', 'divider', 'rock', 'stones', 'brick','electric', 'electricity', 'dementia', 'tyre', 'overloaded']
}
# Function to handle reason of accident with hierarchy
def categorize_accident_reason(reason_str):
    if pd.isnull(reason_str):
        return np.nan

    reason_str = reason_str.lower()
    categories = set()
    
    # Extract reason terms from the string, handling lists
    reason_list = re.findall(r'[\w-]+', reason_str.replace('[', '').replace(']', ''))
    
    # Reason hierarchy dictionary (lower number means higher priority)
    reason_hierarchy = {
        'Driving Under Influence': 1,
        'Speed': 2,
        'Negligence': 3,
        'Collision': 4,
        'Visibility': 5,
        'External influences': 6
    }
    
    # Identify categories for each reason term
    min_priority = float('inf')
    selected_category = np.nan
    
    for reason in reason_list:
        for category, keywords in reason_mappings.items():
            if any(keyword in reason for keyword in keywords):
                current_priority = reason_hierarchy.get(category, float('inf'))
                # Check if the current category has higher priority
                if current_priority < min_priority:
                    min_priority = current_priority
                    selected_category = category
    return selected_category


# Reapply the function to the 'REASON' column
predictions['REASON'] = predictions['REASON'].apply(categorize_accident_reason)


In [30]:
# Function to handle number of injured and fatalities when there's only a single entry
def text_to_single_number(text):
    try:
        # Parse the text to extract numbers
        numbers = [w2n.word_to_num(item.strip()) for item in text.strip('[]').split(',') if item.strip()]
        # Return the number if there's exactly one, or np.nan otherwise
        return numbers[0] if len(numbers) == 1 else np.nan
    except ValueError:
        return np.nan

In [49]:
#Function to handle gender
male_keywords = {'man', 'men', 'boy', 'boys', 'son', 'sons', 'he', 'his', 'him', 'brother', 'brothers', 'father', 'grandfather', 'grandson', 'husband', 'businessman', 'businessmen', 'policeman', 'policemen', 'cameraman', 'male', 'nephew', 'uncle'}
female_keywords = {'woman', 'women', 'girl', 'girls', 'daughter', 'daughters', 'she', 'her', 'sister', 'sisters', 'mother', 'grandmother', 'granddaughter', 'wife', 'businesswoman', 'policewoman', 'female', 'niece', 'aunt', 'lady'}
def classify_gender(gender_str):
    gender_str = re.sub(r'[\[\]\']', '', gender_str)
    
    
    if pd.isnull(gender_str):
        return np.nan

    tokens = set(re.split(r'[,\s\[\]]+', gender_str.lower()))
    identified_genders = []
    if any(token in male_keywords for token in tokens):
        identified_genders.append('Male')
    if any(token in female_keywords for token in tokens):
        identified_genders.append('Female')
    return identified_genders[0] if len(identified_genders) == 1 else identified_genders if identified_genders else np.nan

In [34]:
def categorize_time(time_str):
    if pd.isnull(time_str):
        return np.nan

    # Keywords for categorizing times
    keywords = {
        'Morning': ['morning', 'am', 'dawn', 'predawn', 'early'],
        'Afternoon': ['afternoon', 'noon'],  # 'noon' generally considered part of afternoon
        'Evening': ['evening', 'late', 'dusk', 'sunset'],
        'Night': ['night', 'pm', 'midnight', 'wee', 'late night']
    }
    # Split on commas, spaces, and brackets
    times = re.split(r'[\s,\[\]]+', time_str.lower())
    categories = set()
    for period, words in keywords.items():
        if any(word in time for time in times for word in words):
            categories.add(period)
    return categories.pop() if len(categories) == 1 else list(categories) if categories else np.nan

In [40]:
#Applying the funcitons to each columns
predictions['AGE'] = predictions['AGE'].apply(extract_numeric_age)

In [43]:
predictions['VEHICLE TYPE'] = predictions['VEHICLE TYPE'].apply(categorize_vehicle_from_dict)

In [44]:
predictions['REASON'] = predictions['REASON'].apply(categorize_accident_reason)

In [45]:
predictions['FATALITIES'] = predictions['FATALITIES'].apply(lambda x: text_to_single_number(x) if pd.notnull(x) else np.nan)
predictions['INJURED'] = predictions['INJURED'].apply(lambda x: text_to_single_number(x) if pd.notnull(x) else np.nan)

In [50]:
predictions['GENDER'] = predictions['GENDER'].apply(classify_gender)

In [51]:
predictions['TIME'] = predictions['TIME'].apply(categorize_time)

In [52]:
predictions.head(50)

Unnamed: 0.1,Unnamed: 0,ID,CONTENT,FIRST_LINE,AGE,VEHICLE TYPE,REASON,FATALITIES,INJURED,GENDER,TIME
0,0,107938160,SULTANPUR : Three people lost their lives on F...,Three people lost their lives on Friday in a c...,,Car,,3.0,,,
1,1,107821301,JAIPUR: Three people died in a car accident in...,Three people died in a car accident in Rajasth...,,,,3.0,,,
2,2,107790964,RAICHUR: Two women killed and five others were...,Two women killed and five others were seriousl...,,"[Car, Commercial Vehicle]",Collision,2.0,5.0,,Morning
3,3,107737101,New Delhi: Two students were injured after a s...,Two students were injured after a speeding car...,,,Speed,,2.0,,
4,4,107897709,"Hyderabad: A newlywed software engineer , his ...","A newlywed software engineer , his father-in-l...",,,,,,,Night
5,5,107619624,MATHURA: Five people have lost their lives in ...,Five people have lost their lives in a road ac...,,,,5.0,,,
6,6,107806086,Gurgaon: A woman died and her husband suffered...,A woman died and her husband suffered injuries...,,Car,,,,Female,Morning
7,7,107443549,Trichy : A 15-year-old boy was killed after he...,A 15-year-old boy was killed after he was hit ...,15,,,,,,Evening
8,8,107737116,New Delhi: An unidentified man who was injured...,An unidentified man who was injured in an acci...,,,,,,Male,
9,9,107554146,HISAR: Three men were killed and three others ...,Three men were killed and three others were in...,,,,3.0,3.0,,


In [56]:
print('*'*20)
print('Percentage Missing')
print('*'*20)
for column in ['AGE', 'VEHICLE TYPE', 'REASON', 'FATALITIES', 'INJURED', 'GENDER', 'TIME']:
    print('Column ('+ column + ('-') * (20 - len(column)) + '>' + str(predictions[pd.isnull(predictions[column])].shape[0]*100/predictions.shape[0]))

********************
Percentage Missing
********************
Column (AGE----------------->73.06666666666666
Column (VEHICLE TYPE-------->26.266666666666666
Column (REASON-------------->78.75555555555556
Column (FATALITIES---------->51.733333333333334
Column (INJURED------------->70.04444444444445
Column (GENDER-------------->66.07407407407408
Column (TIME---------------->56.08888888888889


In [74]:
predictions.columns = ['Unnamed: 0', 'id', 'content', 'first_line', 'age', 'vehicle_type',
       'reason', 'fatalities', 'injured', 'gender', 'time']

In [75]:
predictions.to_csv("predictions_transform.csv",index = False)

# Concatinating Geo-Spatial Data

In [89]:
geo_spacial_df = pd.read_csv('../03_entity/output/entity_extracted.csv')

In [95]:
geo_spacial_df.columns = ['Unnamed: 0', 'id', 'place', 'link', 'content', 'news_date',
       'first_line', 'latitude', 'longitude', 'state', 'week_avg_weather',
       'precipitation_3days']

In [96]:
predictions.columns

Index(['Unnamed: 0', 'id', 'content', 'first_line', 'age', 'vehicle_type',
       'reason', 'fatalities', 'injured', 'gender', 'time'],
      dtype='object')

In [97]:
selected_columns = predictions[['id', 'age', 'vehicle_type', 'reason', 'fatalities', 'injured', 'gender', 'time']]

# Merge the selected columns with 'geo_spacial_df' on the 'id' column
transformed_df = geo_spacial_df.merge(selected_columns, on='id', how='inner')


In [98]:
transformed_df.to_csv('transformed.csv', index = False)

In [99]:
transformed_df

Unnamed: 0.1,Unnamed: 0,id,place,link,content,news_date,first_line,latitude,longitude,state,week_avg_weather,precipitation_3days,age,vehicle_type,reason,fatalities,injured,gender,time
0,0,107938160,Sultanpur,https://timesofindia.indiatimes.com/city/allah...,SULTANPUR : Three people lost their lives on F...,23/02/24,Three people lost their lives on Friday in a c...,26.334588,81.996789,Uttar Pradesh,18.672459,0.200000,,Car,,3.0,,,
1,1,107821301,Jaipur,https://timesofindia.indiatimes.com/city/jaipu...,JAIPUR: Three people died in a car accident in...,19/02/24,Three people died in a car accident in Rajasth...,26.915458,75.818982,Rajasthan,18.577219,0.000000,,,,3.0,,,
2,2,107790964,Raichur,https://timesofindia.indiatimes.com/city/benga...,RAICHUR: Two women killed and five others were...,18/02/24,Two women killed and five others were seriousl...,16.083333,77.166667,Karnataka,29.642168,0.000000,,"[Car, Commercial Vehicle]",Collision,2.0,5.0,,Morning
3,3,107737101,New Delhi,https://timesofindia.indiatimes.com/city/delhi...,New Delhi: Two students were injured after a s...,16/02/24,Two students were injured after a speeding car...,28.613895,77.209006,Delhi,18.050730,0.000000,,,Speed,,2.0,,
4,4,107897709,Hyderabad,https://timesofindia.indiatimes.com/city/hyder...,"Hyderabad: A newlywed software engineer , his ...",22/02/24,"A newlywed software engineer , his father-in-l...",17.360589,78.474061,Telangana,26.445761,0.000000,,,,,,,Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6745,6745,70438045,Rajkot,https://timesofindia.indiatimes.com/city/rajko...,RAJKOT: Three including two members of Jasdan ...,30/07/19,Three including two members of Jasdan Diamond ...,22.305326,70.802838,Gujarat,27.055136,73.299995,,Car,,3.0,,,
6746,6746,69529927,Bengaluru,https://timesofindia.indiatimes.com/city/benga...,"BENGALURU: Five members of a family, including...",28/05/19,"Five members of a family, including a 14-year-...",12.976794,77.590082,Karnataka,25.609011,5.900000,14,"[Car, Commercial Vehicle]",,5.0,,Male,Night
6747,6747,69168530,Ambala/Parwanoo,https://timesofindia.indiatimes.com/city/gurga...,AMBALA/PARWANOO: Three members of a family wer...,04/05/19,Three members of a family were killed after th...,,,,,,,Car,,3.0,,,Night
6748,6748,69652686,Madurai,https://timesofindia.indiatimes.com/city/chenn...,"MADURAI: Three men, including the manager of a...",04/06/19,"Three men, including the manager of a national...",9.926115,78.114098,Tamil Nadu,30.240919,30.000000,,"[Car, Commercial Vehicle]",Collision,3.0,,Male,
