# __ init __

In [24]:
import pandas as pd
import pickle
import regex as re

In [25]:
import pickle
def load_data_from_pickle(filename):
    """
    Load data from a pickle file.
    
    Parameters:
        filename (str): The filename of the pickle file.
        
    Returns:
        The loaded data.
    """
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    print(f"Data loaded from {filename} successfully.")
    return data

In [26]:
train_data = load_data_from_pickle("train_data.pickle")

Data loaded from train_data.pickle successfully.


In [27]:
test_data = load_data_from_pickle("test_data.pickle")

Data loaded from test_data.pickle successfully.


# Model Build

## Transformer

In [18]:
# This code initializes the configuration for spaCy using the 'base_config.cfg' file and fills it with specific train and test file paths,
# creating a customized configuration file named 'config.cfg'. This step is crucial for setting up the spaCy pipeline with the necessary 
# parameters and data paths for training and testing models.

!python -m spacy init fill-config base_config.cfg config.cfg #open and include train and test file paths

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


If your training data size is 775, and your batch size is 128, you'll have approximately 6 batches per epoch (775 / 128 ≈ 6.05). To ensure you cover the entire dataset for 4 epochs, you'd use the following calculation:

max_steps = batches_per_epoch × max_epochs

max_steps = 6 × 4 = 24

In [19]:
# This code utilizes spaCy's debugging tool to examine the data specified in the 'config.cfg' configuration file.
# By running the command 'python -m spacy debug data config.cfg', it enables detailed inspection of the data sources,
# annotations, and other 

!python -m spacy debug data config.cfg

[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: transformer, ner
775 training docs
186 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;3m⚠ Low number of examples to train a new pipeline (775)[0m
[1m
[38;5;4mℹ 24759 total word(s) in the data (2703 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 7 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m


In [21]:
# config.cfg ## replace with a unique config file id
!python -m spacy train config.cfg --output ./output/spacy/ --paths.train ./train.spacy --paths.dev ./test.spacy

[38;5;4mℹ Saving to output directory: output/spacy[0m
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  0       0        1609.65    821.52    0.12    0.08    0.30    0.00
 12     200       74842.33  62863.44   83.57   82.05   85.14    0.84
 24     400       18082.66   9739.88   82.68   80.23   85.29    0.83
[38;5;2m✔ Saved pipeline to output directory[0m
output/spacy/model-last


# Predictions (Test Data)

## Load Model

In [67]:
import spacy
test_nlp = spacy.load('./output/spacy/model-best')

In [42]:
test_nlp.get_pipe('ner').labels

('AGE', 'FATALITIES', 'GENDER', 'INJURED', 'REASON', 'TIME', 'VEHICLE TYPE')

## Individual Predictions

In [93]:
entity_names = ["AGE", "VEHICLE TYPE", "REASON", "FATALITIES", "INJURED", "GENDER", "TIME"]

# Define colors for each entity
colors = {entity: "#{}".format(hash(entity) & 0x00FFFFF) for entity in entity_names}


light_colors = {
    "AGE": "#FFDDC1",          # Light orange
    "VEHICLE TYPE": "#D0E1F9", # Light blue
    "REASON": "#FFB6C1",       # Light peach
    "FATALITIES": "#C8E6C9",   # Light green
    "INJURED": "#FFE0B2",      # Light amber
    "GENDER": "#F0F4C3",       # Light yellow
    "TIME": "#E1BEE7"          # Light purple
}

In [94]:
test = 'A man in his 40s was injured and his mother-in-law seriously injured when their tractor dashed against a tree in this district, police said on Thursday'

In [95]:
doc = test_nlp(test)

In [96]:
spacy.displacy.render(doc, style="ent", jupyter=True, options={"colors": light_colors})

In [97]:
test = 'A 40-year-old drunk man was killed and his wife seriously injured when their car dashed against a tree in this district, police said on Thursday'

In [98]:
doc = test_nlp(test)

In [99]:
spacy.displacy.render(doc, style="ent", jupyter=True , options={"colors": light_colors})

## Performance Metrics Evaluvation (Test)

In [229]:
model_name = '' # Add the model name here
description = '''model_name_config_file_$date$time'''

In [106]:
import pandas as pd

# Initialize lists to store row-wise scores
data_rows = []

# Iterate through test data
for data in test_data:
    true_positives, false_positives, false_negatives = 0, 0, 0  # Reset counts for each data row
    if data is None:
        print("Skipping None value in test data.")
        continue

    text, annotations = data
    doc = test_nlp(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    true_entities = [(text[start:end], label) for start, end, label in annotations.get('entities', [])]

    # Calculate true positives, false positives, and false negatives for each row
    for entity in predicted_entities:
        if entity in true_entities:
            true_positives += 1
        else:
            false_positives += 1

    for entity in true_entities:
        if entity not in predicted_entities:
            false_negatives += 1

    # Calculate precision, recall, and F1-score for the current row
    row_accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Append row-wise scores to the data_rows list
    data_rows.append({
        'Text': text,
        'Predicted Entities': predicted_entities,
        'Annotated Entities': true_entities,
        'Accuracy': row_accuracy,
        'Recall': recall,
        'F1-Score': f1
    })

# Create a DataFrame from the data_rows list
df_predicted = pd.DataFrame(data_rows)

# Display the DataFrame
df_predicted.head()


Unnamed: 0,Text,Predicted Entities,Annotated Entities,Accuracy,Recall,F1-Score
0,A Faridabad resident was killed and a youngste...,"[(truck, VEHICLE TYPE), (fog, REASON), (car, V...","[(truck, VEHICLE TYPE), (driven rashly, REASON...",0.6,0.6,0.75
1,A south Mumbai based businessman who was retur...,"[(businessman, GENDER)]","[(businessman, GENDER)]",1.0,1.0,1.0
2,A 40-year-old man was killed and his wife seri...,"[(40-year-old, AGE), (man, GENDER), (wife, GEN...","[(40-year-old, AGE), (man, GENDER), (wife, GEN...",1.0,1.0,1.0
3,Five friends celebrating the granting of a vis...,"[(Five, INJURED), (car, VEHICLE TYPE), (tree, ...","[(Five, INJURED), (car, VEHICLE TYPE), (tree, ...",1.0,1.0,1.0
4,"In a hit-and-run case, a couple and their two ...","[(hit-and-run, REASON), (daughters, GENDER), (...","[(hit-and-run, REASON), (five, FATALITIES), (S...",0.8,1.0,0.888889


## Weighted avg and convert to df above

In [102]:
from collections import defaultdict
import pandas as pd

# Initialize variables to store true positives, false positives, and false negatives for each entity type
tp = defaultdict(int)
fp = defaultdict(int)
fn = defaultdict(int)
te = defaultdict(int)

# Iterate over the test data
for data in test_data:
    if data is None:
        continue
    
    text, annotations = data
    doc = test_nlp(text)
    predicted_entities = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
    true_entities = {(start, end, label) for start, end, label in annotations.get('entities', [])}
    
    # Compute true positives, false positives, false negatives, and true entity count
    for ent in predicted_entities:
        if ent in true_entities:
            tp[ent[2]] += 1
        else:
            fp[ent[2]] += 1
    for ent in true_entities:
        if ent not in predicted_entities:
            fn[ent[2]] += 1
        te[ent[2]] += 1  # Increment true entity count for every true entity

# Compute precision, recall, and F1-score for each entity type
precision = {}
recall = {}
f1 = {}
for label in test_nlp.get_pipe('ner').labels:
    precision[label] = tp[label] / (tp[label] + fp[label]) if tp[label] + fp[label] > 0 else 0
    recall[label] = tp[label] / (tp[label] + fn[label]) if tp[label] + fn[label] > 0 else 0
    f1[label] = 2 * (precision[label] * recall[label]) / (precision[label] + recall[label]) if precision[label] + recall[label] > 0 else 0

# Prepare data for DataFrame
data_for_df = []
for label in test_nlp.get_pipe('ner').labels:
    data_for_df.append([label, te[label], round(precision[label], 3), round(recall[label], 3), round(f1[label], 3)])

# Convert to DataFrame
df = pd.DataFrame(data_for_df, columns=['Entity Type', 'Count', 'Precision', 'Recall', 'F1-score'])

# Calculate weighted averages
weighted_precision = (df['Precision'] * df['Count']).sum() / df['Count'].sum()
weighted_recall = (df['Recall'] * df['Count']).sum() / df['Count'].sum()
weighted_f1 = (df['F1-score'] * df['Count']).sum() / df['Count'].sum()

# Add weighted averages to the DataFrame
weighted_averages = pd.DataFrame([['Weighted Average', '', round(weighted_precision, 3), round(weighted_recall, 3), round(weighted_f1, 3)]],
                                 columns=['Entity Type', 'Count', 'Precision', 'Recall', 'F1-score'])
df = pd.concat([df, weighted_averages], ignore_index=True)

df['Model'] = model_name
df.head()


Unnamed: 0,Entity Type,Count,Precision,Recall,F1-score,Model
0,AGE,71,0.889,0.901,0.895,
1,FATALITIES,81,0.916,0.938,0.927,
2,GENDER,67,0.691,0.97,0.807,
3,INJURED,60,0.824,0.933,0.875,
4,REASON,89,0.66,0.348,0.456,


In [103]:
## This is used to log
with open(f'macro_metrics_{description}.txt', 'w') as f:
    f.write(captured_output.stdout)

NameError: name 'captured_output' is not defined

## Itemized results

In [None]:
# %%time
# %%capture captured_output

# Initialize lists to store row-wise scores
precision_list, recall_list, f1_list = [], [], []
true_positives, false_positives, false_negatives = 0, 0, 0

# Iterate through test data
for data in test_data:
    true_positives, false_positives, false_negatives = 0, 0, 0  # Reset counts for each data row
    if data is None:
        print("Skipping None value in test data.")
        continue

    text, annotations = data
    doc = test_nlp(text)
    predicted_entities = [(ent.text, ent.label_) for ent in doc.ents]
    true_entities = [(text[start:end], label) for start, end, label in annotations.get('entities', [])]

    # Calculate true positives, false positives, and false negatives for each row
    for entity in predicted_entities:
        if entity in true_entities:
            true_positives += 1
        else:
            false_positives += 1

    for entity in true_entities:
        if entity not in predicted_entities:
            false_negatives += 1

    # Calculate precision, recall, and F1-score for the current row
    row_accuracy = true_positives / (true_positives + false_positives + false_negatives) if (true_positives + false_positives + false_negatives) > 0 else 0
    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Append row-wise scores to lists
    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

    # Print the results for each row
    print("Text:", text)
    print("Predicted entities:", predicted_entities)
    print("Annotated entities:", true_entities)
    print("Accuracy:", row_accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1)
    print("---------------")

# Calculate accuracy, average precision, recall, and F1-score
accuracy = true_positives / (true_positives + false_positives + false_negatives)
avg_precision = sum(precision_list) / len(precision_list)
avg_recall = sum(recall_list) / len(recall_list)
avg_f1 = sum(f1_list) / len(f1_list)

print("Accuracy:", accuracy)
print("Average Precision:", avg_precision)
print("Average Recall:", avg_recall)
print("Average F1-score:", avg_f1)

Text: A Faridabad resident was killed and a youngster was injured after a truck, allegedly being driven rashly despite dense fog, rammed their car on the Gurugram-Faridabad road in the early hours of Monday
Predicted entities: [('truck', 'VEHICLE TYPE'), ('fog', 'REASON'), ('car', 'VEHICLE TYPE')]
Annotated entities: [('truck', 'VEHICLE TYPE'), ('driven rashly', 'REASON'), ('fog', 'REASON'), ('car', 'VEHICLE TYPE'), ('early', 'TIME')]
Accuracy: 0.6
Precision: 1.0
Recall: 0.6
F1-score: 0.7499999999999999
---------------
Text: A south Mumbai based businessman who was returning from a friend's house in Navi Mumbai after an Eid celebration died, while his relative was seriously injured
Predicted entities: [('businessman', 'GENDER')]
Annotated entities: [('businessman', 'GENDER')]
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
---------------
Text: A 40-year-old man was killed and his wife seriously injured when their car dashed against a tree in this district, police said on Thursd

In [28]:
import spacy
test_nlp = spacy.load('./output/spacy/model-best')

In [29]:
test_nlp.get_pipe('ner').labels

('AGE', 'FATALITIES', 'GENDER', 'INJURED', 'REASON', 'TIME', 'VEHICLE TYPE')

In [30]:
# Read the live data that needs to be classified
df_livedata = pd.read_csv('raw_firstline.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'raw_firstline.csv'

In [None]:
def extract_article_id(url):
    match = re.search(r'\/(\d+)\.cms', url)
    if match:
        return int(match.group(1))
    else:
        return None

In [None]:
df_livedata['id'] = df_livedata['Link'].apply(extract_article_id)

In [None]:
# File path where the annotated IDs are stored
file_path = "id_list_annotated.txt"

# Reading the contents of the text file into a list
with open(file_path, 'r') as file:
    id_list_read = file.readlines()

# Removing newline characters from the elements of the list
id_list_read = [item.strip() for item in id_list_read]

# Converting the IDs from strings to integers
id_list_read = [int(x) for x in id_list_read]

# Creating a DataFrame from the list of IDs
id_df = pd.DataFrame(id_list_read, columns=['id'])


In [None]:
#Filter out the ids that have been used for training
df_livedata = df_livedata[~df_livedata['id'].isin(id_list_read)]

df_livedata = df_livedata[['id','content','First_Line']]  # Only filter out the id and the first line