# __ init __

In [1]:
import pandas as pd
import pickle
import regex as re
import numpy as np


In [2]:
import pickle
def load_data_from_pickle(filename):
    """
    Load data from a pickle file.
    
    Parameters:
        filename (str): The filename of the pickle file.
        
    Returns:
        The loaded data.
    """
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    print(f"Data loaded from {filename} successfully.")
    return data

In [3]:
train_data = load_data_from_pickle("input/train_data.pickle")

Data loaded from input/train_data.pickle successfully.


In [4]:
test_data = load_data_from_pickle("input/test_data.pickle")

Data loaded from input/test_data.pickle successfully.


# Predictions (Live data)

## Load Model

In [6]:
import spacy
test_nlp = spacy.load('./output/xlnet_800/model-best')

In [7]:
test_nlp.get_pipe('ner').labels

('AGE', 'FATALITIES', 'GENDER', 'INJURED', 'REASON', 'TIME', 'VEHICLE TYPE')

## Extract Live data and exclude test data

In [5]:
# Read the live data that needs to be classified
df_livedata = pd.read_csv('input/filtered_dataset_oneliner.csv')

## Individual sentence predictions

In [15]:
text = test_nlp(df_livedata.First_Line[2000])
doc = test_nlp(df_livedata.First_Line[2000])

In [18]:
light_colors = {
    "AGE": "#FFDDC1",          # Light orange
    "VEHICLE TYPE": "#D0E1F9", # Light blue
    "REASON": "#FFB6C1",       # Light peach
    "FATALITIES": "#C8E6C9",   # Light green
    "INJURED": "#FFE0B2",      # Light amber
    "GENDER": "#F0F4C3",       # Light yellow
    "TIME": "#E1BEE7"          # Light purple
}

In [19]:
spacy.displacy.render(doc, style="ent", jupyter=True , options={"colors": light_colors})

In [20]:
predicted_entities = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

In [21]:
for ent in doc.ents:
    print(f'Entity Label ({ent.label_}) {"-"*(20 - len(ent.label_))} {ent}')

Entity Label (AGE) ----------------- 30-year-old
Entity Label (GENDER) -------------- man
Entity Label (REASON) -------------- speeding
Entity Label (VEHICLE TYPE) -------- car
Entity Label (VEHICLE TYPE) -------- car
Entity Label (TIME) ---------------- night


## Predictions for all live data

In [18]:
from tqdm import tqdm  # Import tqdm for progress visualization
import pandas as pd

def transform_set_to_dict(input_set):
    # Convert set of tuples to dictionary
    transformed_dict = {}
    for key, value in input_set:
        transformed_dict.setdefault(key, []).append(value)
    # Flatten dictionary if value lists have length 1
    final_dict = {key: value[0] if len(value) == 1 else value for key, value in transformed_dict.items()}
    return final_dict

# Assuming df_livedata_ is already defined and populated
df_livedata_ = df_livedata

# Iterate over DataFrame rows and collect results
data = {
    'ID': [],
    'CONTENT': [],
    'FIRST_LINE': [],
    'PREDICTED_ENTITIES_DICT': []
}

for index, row in tqdm(df_livedata_.iterrows(), total=len(df_livedata_)):
    text = row['First_Line']
    doc = test_nlp(text)
    
    # Extract named entities and convert to dictionary
    predicted_entities = {(ent.label_, ent) for ent in doc.ents}  # set
    predicted_entities_dict = transform_set_to_dict(predicted_entities)  # dict
    
    # Append data to the dictionary
    data['ID'].append(row['id'])
    data['CONTENT'].append(row['content'])
    data['FIRST_LINE'].append(row['First_Line'])
    data['PREDICTED_ENTITIES_DICT'].append(predicted_entities_dict)

# Create DataFrame
df = pd.DataFrame(data)

# Define keys and initialize lists
keys = ["AGE", "VEHICLE TYPE", "REASON", "FATALITIES", "INJURED", "GENDER", "TIME"]
empty_value = np.nan
empty_data = {key: [] for key in keys}

# Iterate through each dictionary in df.PREDICTED_ENTITIES_DICT
for dictionary in df.PREDICTED_ENTITIES_DICT:
    for key in keys:
        # Append the value to the corresponding list, or 'na' if key doesn't exist
        empty_data[key].append(dictionary.get(key, empty_value))

# Create DataFrame from empty_data
df_empty = pd.DataFrame(empty_data)

# Merge DataFrames on index
df = pd.concat([df, df_empty], axis=1)

# Optionally, drop PREDICTED_ENTITIES_DICT column
df.drop(columns=['PREDICTED_ENTITIES_DICT'], inplace=True)


100%|██████████| 6750/6750 [05:59<00:00, 18.76it/s]


In [19]:
df.head()

Unnamed: 0,ID,CONTENT,FIRST_LINE,AGE,VEHICLE TYPE,REASON,FATALITIES,INJURED,GENDER,TIME
0,107938160,SULTANPUR : Three people lost their lives on F...,Three people lost their lives on Friday in a c...,,(car),,(Three),,,
1,107821301,JAIPUR: Three people died in a car accident in...,Three people died in a car accident in Rajasth...,,,,(Three),,,
2,107790964,RAICHUR: Two women killed and five others were...,Two women killed and five others were seriousl...,,"(car, -, bus)",(collision),(Two),(five),,(morning)
3,107737101,New Delhi: Two students were injured after a s...,Two students were injured after a speeding car...,,,(speeding),,(Two),,
4,107897709,"Hyderabad: A newlywed software engineer , his ...","A newlywed software engineer , his father-in-l...",,,(veered),,,"(father, -, in, -, law)",(wee)


## Save to csv

In [21]:
df.to_csv('output/predicted.csv',index = False)