## Initialize

In [7]:
import spacy
import pandas as pd
import json
import spacy_transformers
import pickle

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# !python -m spacy download en_core_web_lg

In [2]:
def save_data_to_pickle(data, filename):
    """
    Save data to a pickle file.
    
    Parameters:
        data: The data to be saved.
        filename (str): The filename of the pickle file.
    """
    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data saved to {filename} successfully.")

def load_data_from_pickle(filename):
    """
    Load data from a pickle file.
    
    Parameters:
        filename (str): The filename of the pickle file.
        
    Returns:
        The loaded data.
    """
    with open(filename, 'rb') as f:
        data = pickle.load(f)
    print(f"Data loaded from {filename} successfully.")
    return data

## Import model

In [16]:
# Reading the filtered news dataset from a CSV file into a DataFrame, setting the index to None
dataframe = pd.read_csv('input/entity_data.csv', index_col=None)

# Selecting specific columns from the filtered dataset
dataframe = dataframe[['id', 'place','Link', 'content', 'News_date', 'First_Line','latitude','longitude','state','week_avg_weather','precipitation_3days']]

dataframe = dataframe.head(1000)

## Annotated_data to spaCy input

In [17]:
import json
# Specify the file path
file_path = "input/annotations.json"

# Open the file and load its contents as JSON
with open(file_path, 'r') as file:
    data = json.load(file)
annotations = data.get("annotations", [])

In [18]:
dataframe['annotations'] = annotations

In [19]:
dataframe = dataframe[dataframe['annotations'] != 'None']

In [20]:
dataframe.isna().sum()

id                      0
place                   0
Link                    0
content                 0
News_date               0
First_Line              0
latitude               20
longitude              20
state                  20
week_avg_weather       20
precipitation_3days    20
annotations            39
dtype: int64

In [21]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   1000 non-null   int64  
 1   place                1000 non-null   object 
 2   Link                 1000 non-null   object 
 3   content              1000 non-null   object 
 4   News_date            1000 non-null   object 
 5   First_Line           1000 non-null   object 
 6   latitude             980 non-null    float64
 7   longitude            980 non-null    float64
 8   state                980 non-null    object 
 9   week_avg_weather     980 non-null    float64
 10  precipitation_3days  980 non-null    float64
 11  annotations          961 non-null    object 
dtypes: float64(4), int64(1), object(7)
memory usage: 101.6+ KB


## Converting annotations into spaCy format

In [12]:
train_data = dataframe['annotations'][:800]
test_data = dataframe['annotations'][800:]

In [13]:
N = 1
train_data = pd.concat([train_data] * N, ignore_index=True)


In [14]:
entity_names = ["AGE", "VEHICLE TYPE", "REASON", "FATALITIES", "INJURED", "GENDER", "TIME"]

train_data = train_data.tolist()
test_data = test_data.tolist()

if train_data is not None:  # Check if 'train_data' is not None
    train_data = [tuple(i) for i in train_data if i is not None]  # Convert 'train_data' to list of tuples, excluding any None values
    # Further processing if needed
else:
    print("Annotations data is not available or is None ")
    
if test_data is not None:  # Check if 'test_data' is not None
    test_data = [tuple(i) for i in test_data if i is not None]  # Convert 'test_data' to list of tuples, excluding any None values
    # Further processing if needed
else:
    print("Annotations data is not available or is None.")

In [15]:
# Loop through each item in the train_data list
for i in train_data:
    # Check if the 'entities' list in the current item is empty
    if i[1]['entities'] == []:
        # If it's empty, populate it with default tuples (0, 0, name) 
        # where 'name' is iterated over all elements in 'entity_names'
        i[1]['entities'] = [(0, 0, name) for name in entity_names]
    else:
        # If 'entities' list is not empty, convert each element to a tuple
        i[1]['entities'] = [tuple(entity) for entity in i[1]['entities']]

# Loop through each item in the test_data list
for i in test_data:
    # Check if the 'entities' list in the current item is empty
    if i[1]['entities'] == []:
        # If it's empty, populate it with default tuples (0, 0, name) 
        # where 'name' is iterated over all elements in 'entity_names'
        i[1]['entities'] = [(0, 0, name) for name in entity_names]
    else:
        # If 'entities' list is not empty, convert each element to a tuple
        i[1]['entities'] = [tuple(entity) for entity in i[1]['entities']]


In [16]:
print(f"Length of train_data: {len(train_data)}\nLength of test_data: {len(test_data)}")

Length of train_data: 775
Length of test_data: 186


In [None]:
train_data

## Train Data to Doc bin

In [33]:
from spacy.tokens import DocBin
from tqdm import tqdm
nlp = spacy.blank("en") # load a new spacy model

In [34]:
len(train_data)

775

In [35]:
db = DocBin()

for data in tqdm(train_data, desc="Processing train_data"):
    if data is None:
        print("Encountered a null value in train_data. Skipping...")
        continue
    
    text, annot = data
    doc = nlp.make_doc(text)
    ents = []

    if annot is not None and "entities" in annot:  # Check if annot is not None and contains "entities" key
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # print("Skipping entity")
                pass
            else:
                ents.append(span)
    doc.ents = ents
    db.add(doc)

Processing train_data: 100%|████████████████| 775/775 [00:00<00:00, 6732.79it/s]


In [36]:
db.to_disk("train.spacy") # save the docbin object

## Train Data to Doc bin

In [38]:
db_test = DocBin()

for data in test_data:
    if data is None:
        print("Encountered a null value in train_data. Skipping...")
        continue
    
    text, annot = data
    doc = nlp.make_doc(text)
    ents = []

    if annot is not None and "entities" in annot:  # Check if annot is not None and contains "entities" key
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # print("Skipping entity")
                pass
            else:
                ents.append(span)
    doc.ents = ents
    db_test.add(doc)

In [39]:
db_test.to_disk("test.spacy") # save the docbin object

## Save to a pickle file

In [41]:
# Save train_data to a pickle file
save_data_to_pickle(train_data, "train_data.pickle")
save_data_to_pickle(test_data, "test_data.pickle")

Data saved to train_data.pickle successfully.
Data saved to test_data.pickle successfully.
