# __ init __

In [10]:
import pandas as pd
import pickle
import regex as re

In [5]:
import pickle
def load_data_from_pickle(filename):
    """
    Load data from a pickle file.
    
    Parameters:
        filename (str): The filename of the pickle file.
        
    Returns:
        The loaded data.
    """
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    print(f"Data loaded from {filename} successfully.")
    return data

In [6]:
train_data = load_data_from_pickle("train_data.pickle")

Data loaded from train_data.pickle successfully.


In [21]:
test_data = load_data_from_pickle("test_data.pickle")

Data loaded from test_data.pickle successfully.


# Predictions (Live data)

## Load Model

In [1]:
import spacy
test_nlp = spacy.load('./output/spacy/model-best')

In [2]:
test_nlp.get_pipe('ner').labels

('AGE', 'FATALITIES', 'GENDER', 'INJURED', 'REASON', 'TIME', 'VEHICLE TYPE')

## Extract Live data and exclude test data

In [7]:
# Read the live data that needs to be classified
df_livedata = pd.read_csv('raw_firstline.csv')

In [8]:
def extract_article_id(url):
    match = re.search(r'\/(\d+)\.cms', url)
    if match:
        return int(match.group(1))
    else:
        return None

In [11]:
df_livedata['id'] = df_livedata['Link'].apply(extract_article_id)

In [12]:
# File path where the annotated IDs are stored
file_path = "id_list_annotated.txt"

# Reading the contents of the text file into a list
with open(file_path, 'r') as file:
    id_list_read = file.readlines()

# Removing newline characters from the elements of the list
id_list_read = [item.strip() for item in id_list_read]

# Converting the IDs from strings to integers
id_list_read = [int(x) for x in id_list_read]

# Creating a DataFrame from the list of IDs
id_df = pd.DataFrame(id_list_read, columns=['id'])


In [14]:
#Filter out the ids that have been used for training
df_livedata = df_livedata[~df_livedata['id'].isin(id_list_read)]

df_livedata = df_livedata[['id','content','First_Line']]  # Only filter out the id and the first line

## Individual sentence predictions

In [15]:
text = test_nlp(df_livedata.First_Line[2000])
doc = test_nlp(df_livedata.First_Line[2000])

In [18]:
light_colors = {
    "AGE": "#FFDDC1",          # Light orange
    "VEHICLE TYPE": "#D0E1F9", # Light blue
    "REASON": "#FFB6C1",       # Light peach
    "FATALITIES": "#C8E6C9",   # Light green
    "INJURED": "#FFE0B2",      # Light amber
    "GENDER": "#F0F4C3",       # Light yellow
    "TIME": "#E1BEE7"          # Light purple
}

In [19]:
spacy.displacy.render(doc, style="ent", jupyter=True , options={"colors": light_colors})

In [20]:
predicted_entities = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

In [21]:
for ent in doc.ents:
    print(f'Entity Label ({ent.label_}) {"-"*(20 - len(ent.label_))} {ent}')

Entity Label (AGE) ----------------- 30-year-old
Entity Label (GENDER) -------------- man
Entity Label (REASON) -------------- speeding
Entity Label (VEHICLE TYPE) -------- car
Entity Label (VEHICLE TYPE) -------- car
Entity Label (TIME) ---------------- night


## Predictions for all live data

In [22]:
from tqdm import tqdm  # Import tqdm for progress visualization
import pandas as pd

def transform_set_to_dict(input_set):
    # Convert set of tuples to dictionary
    transformed_dict = {}
    for key, value in input_set:
        transformed_dict.setdefault(key, []).append(value)
    # Flatten dictionary if value lists have length 1
    final_dict = {key: value[0] if len(value) == 1 else value for key, value in transformed_dict.items()}
    return final_dict

# Assuming df_livedata_ is already defined and populated
df_livedata_ = df_livedata[1000:1100]

# Iterate over DataFrame rows and collect results
data = {
    'ID': [],
    'CONTENT': [],
    'FIRST_LINE': [],
    'PREDICTED_ENTITIES_DICT': []
}

for index, row in tqdm(df_livedata_.iterrows(), total=len(df_livedata_)):
    text = row['First_Line']
    doc = test_nlp(text)
    
    # Extract named entities and convert to dictionary
    predicted_entities = {(ent.label_, ent) for ent in doc.ents}  # set
    predicted_entities_dict = transform_set_to_dict(predicted_entities)  # dict
    
    # Append data to the dictionary
    data['ID'].append(row['id'])
    data['CONTENT'].append(row['content'])
    data['FIRST_LINE'].append(row['First_Line'])
    data['PREDICTED_ENTITIES_DICT'].append(predicted_entities_dict)

# Create DataFrame
df = pd.DataFrame(data)

# Define keys and initialize lists
keys = ["AGE", "VEHICLE TYPE", "REASON", "FATALITIES", "INJURED", "GENDER", "TIME"]
empty_value = 'na'
empty_data = {key: [] for key in keys}

# Iterate through each dictionary in df.PREDICTED_ENTITIES_DICT
for dictionary in df.PREDICTED_ENTITIES_DICT:
    for key in keys:
        # Append the value to the corresponding list, or 'na' if key doesn't exist
        empty_data[key].append(dictionary.get(key, empty_value))

# Create DataFrame from empty_data
df_empty = pd.DataFrame(empty_data)

# Merge DataFrames on index
df = pd.concat([df, df_empty], axis=1)

# Optionally, drop PREDICTED_ENTITIES_DICT column
df.drop(columns=['PREDICTED_ENTITIES_DICT'], inplace=True)


  0%|                                                   | 0/100 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|█████████████████████████████████████████| 100/100 [00:05<00:00, 16.82it/s]


In [23]:
df.head()

Unnamed: 0,ID,CONTENT,FIRST_LINE,AGE,VEHICLE TYPE,REASON,FATALITIES,INJURED,GENDER,TIME
0,83674834,New Delhi: A 30-year-old man was killed after ...,A 30-year-old man was killed after a speeding ...,"(30, -, year, -, old)","[(car), (car)]",(speeding),na,na,(man),(night)
1,96313089,"Bhopal: A 61-year-old man, who was injured in ...","A 61-year-old man, who was injured in a car-SU...","(61, -, year, -, old)","[(SUV), (car)]",na,na,na,(man),na
2,79997626,KOTA : Five people were killed and seven injur...,Five people were killed and seven injured afte...,na,(car),na,(Five),(seven),na,na
3,94161346,Panaji: Porvorim police on Monday registered a...,Porvorim police on Monday registered a case of...,na,(car),"[(negligent), (rash)]",na,na,na,na
4,79880005,PALANPUR: Three teachers including a woman wer...,Three teachers including a woman were killed w...,na,(car),na,(Three),na,(woman),(morning)


## Save to csv

In [None]:
df.to_csv('predicted.csv')