In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from langdetect import detect
import re
import spacy
import yake
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator



In [2]:
# Load data
df = pd.read_csv(r"C:\Users\ncalvaresi\Documents\Safety_data_full.txt", delimiter='\t', encoding='latin1', on_bad_lines='skip')

# Rename columns
df.rename(columns={df.columns[0]: 'State', df.columns[6]: 'Observation', df.columns[7]: 'FollowupAction'}, inplace=True)

# Replace and drop NAs
df['FollowupAction'].fillna("None", inplace=True)
df.dropna(subset=['Observation'], inplace=True)

df = df[['Observation', 'Severity', 'Category']]

new_entries = [
    {'Observation': 'Spill', 'Severity': 'UNSAFE', 'Category': 'Slips/Trip Hazards'},
    {'Observation': 'Leak', 'Severity': 'UNSAFE', 'Category': 'Slips/Trip Hazards'},
    {'Observation': 'Fire', 'Severity': 'UNSAFE', 'Category': 'Fire Prevention'},
    {'Observation': 'Puddle of water', 'Severity': 'UNSAFE', 'Category': 'Slips/Trip Hazards'},
    {'Observation': 'Gap in the railing', 'Severity': 'UNSAFE', 'Category': 'Fall Protection'},
    {'Observation': 'He was wearing his hard hat incorrectly', 'Severity': 'UNSAFE', 'Category': 'PPE'}
]

# Convert the list of dictionaries to a DataFrame
new_entries_df = pd.DataFrame(new_entries)

# Add the new entries to the original DataFrame using pd.concat
df = pd.concat([df, new_entries_df], ignore_index=True)

# Filter for 'UNSAFE' and 'SAFE'
filtered_df = df[df['Severity'].isin(['UNSAFE', 'SAFE'])]

# Drop rows where the Category is NaN
filtered_df.dropna(subset=['Category'], inplace=True)

# Encode labels
label_encoder = LabelEncoder()
filtered_df['sentiment'] = label_encoder.fit_transform(filtered_df['Severity'])

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FollowupAction'].fillna("None", inplace=True)


In [3]:
def filter_text(text):
    if text is None or str(text).lower() in ['none', 'n/a', 'na', 'null']:
        return False
    if len(text) == 1 and text.isalpha(): 
        return False
    if re.match(r'^[\W\d]*$', text): 
        return False
    try:
        lang = detect(text)
        if lang != 'en':  
            return False
    except:
        return False
    return True


In [4]:
new_filtered_df = filtered_df[filtered_df['Observation'].apply(filter_text)]

new_df = new_filtered_df[['Observation', 'sentiment', 'Category']]


In [5]:
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(df['Observation']), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

label_encoder_category = LabelEncoder()
df['category'] = label_encoder_category.fit_transform(df['Category'])

label_encoder_severity = LabelEncoder()
df['severity'] = label_encoder_severity.fit_transform(df['Severity'])


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

def encode(text):
    return [vocab[token] for token in tokenizer(text)]

train_df['encoded'] = train_df['Observation'].apply(encode)
test_df['encoded'] = test_df['Observation'].apply(encode)

MAX_SEQUENCE_LENGTH = 100

def pad_sequence(seq, max_len):
    if len(seq) > max_len:
        return seq[:max_len]
    else:
        return seq + [0] * (max_len - len(seq))

train_df['padded'] = train_df['encoded'].apply(lambda x: pad_sequence(x, MAX_SEQUENCE_LENGTH))
test_df['padded'] = test_df['encoded'].apply(lambda x: pad_sequence(x, MAX_SEQUENCE_LENGTH))

train_categories = torch.tensor(train_df['category'].tolist())
test_categories = torch.tensor(test_df['category'].tolist())

train_severities = torch.tensor(train_df['severity'].tolist())
test_severities = torch.tensor(test_df['severity'].tolist())


In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, categories, severities):
        self.texts = texts
        self.categories = categories
        self.severities = severities
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        category = self.categories[idx]
        severity = self.severities[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(category, dtype=torch.long), torch.tensor(severity, dtype=torch.long)

train_dataset = TextDataset(train_df['padded'].tolist(), train_categories, train_severities)
test_dataset = TextDataset(test_df['padded'].tolist(), test_categories, test_severities)

train_dataloader = DataLoader(train_dataset, batch_size=15, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=15, shuffle=False)


In [9]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attention_weights = nn.Parameter(torch.Tensor(hidden_size, hidden_size))
        self.context_vector = nn.Parameter(torch.Tensor(hidden_size, 1))
        nn.init.xavier_uniform_(self.attention_weights)
        nn.init.xavier_uniform_(self.context_vector)
    
    def forward(self, hidden_states):
        scores = torch.tanh(torch.matmul(hidden_states, self.attention_weights))
        scores = torch.matmul(scores, self.context_vector).squeeze(-1)
        attention_weights = torch.nn.functional.softmax(scores, dim=1)
        weighted_sum = torch.sum(hidden_states * attention_weights.unsqueeze(-1), dim=1)
        return weighted_sum

class EnhancedTextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes_category, num_classes_severity):
        super(EnhancedTextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, 128, batch_first=True, num_layers=2, bidirectional=True)
        self.attention = Attention(128 * 2)  
        self.fc_category = nn.Linear(128 * 2, num_classes_category)
        self.fc_severity = nn.Linear(128 * 2, num_classes_severity)
        self.dropout = nn.Dropout(0.5)  
    
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.attention(x)
        x = self.dropout(x)
        output_category = self.fc_category(x)
        output_severity = self.fc_severity(x)
        return output_category, output_severity

In [10]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

In [11]:
def extract_keywords_percentages(observation, dataset, predicted_label, other_label):
    lemmatized_observation = clean_text(observation)
    
    kw_extractor = yake.KeywordExtractor()
    language = "en"
    max_ngram_size = 3
    deduplication_threshold = 0.8
    numOfKeywords = 10
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(lemmatized_observation)
    
    keywords_perc = {}

    for keyword, _ in keywords:
        keyword_obs_df = dataset[dataset['Observation'].str.contains(keyword, case=False, na=False)]
        total_keyword_obs = keyword_obs_df.shape[0]
        
        label_keyword_obs = keyword_obs_df[keyword_obs_df['sentiment'] == predicted_label].shape[0]
        other_label_keyword_obs = keyword_obs_df[keyword_obs_df['sentiment'] == other_label].shape[0]
        
        if total_keyword_obs > 0:
            perc = label_keyword_obs / total_keyword_obs
            perc_other = other_label_keyword_obs / total_keyword_obs
        else:
            perc = 0
            perc_other = 0
        
        keywords_perc[keyword] = (perc, perc_other)
    
    return keywords_perc

In [12]:
def predict_and_analyze_observation(observation, model, tokenizer, vocab, max_length, device, label_encoder_category, label_encoder_severity, dataset):
    def pad_sequence(seq, max_len):
        if len(seq) > max_len:
            return seq[:max_len]
        else:
            return seq + [0] * (max_len - len(seq))
    
    tokens = tokenizer(observation)
    encoded = [vocab[token] for token in tokens]
    padded = pad_sequence(encoded, max_length)
    input_tensor = torch.tensor(padded, dtype=torch.long).unsqueeze(0).to(device)
    
    model.eval()
    with torch.no_grad():
        output_category, output_severity = model(input_tensor)
        probabilities_category = F.softmax(output_category, dim=1)
        probabilities_severity = F.softmax(output_severity, dim=1)
        confidence_category, predicted_label_category = torch.max(probabilities_category, dim=1)
        confidence_severity, predicted_label_severity = torch.max(probabilities_severity, dim=1)
    
    predicted_category = label_encoder_category.inverse_transform([predicted_label_category.item()])[0]
    predicted_severity = label_encoder_severity.inverse_transform([predicted_label_severity.item()])[0]
    
    confidence_score_category = confidence_category.item()
    confidence_score_severity = confidence_severity.item()
    
    predicted_label_severity = label_encoder_severity.transform([predicted_severity])[0]
    other_label_severity = 1 - predicted_label_severity
    
    keywords_percentages = extract_keywords_percentages(observation, dataset, predicted_label_severity, other_label_severity)
    
    print(f"Predicted Category: {predicted_category}, Confidence Score: {confidence_score_category:.4%}")
    print(f"Predicted Severity: {predicted_severity}, Confidence Score: {confidence_score_severity:.4%}")
    print("")
    for keyword, (perc, perc_other) in keywords_percentages.items():
        print(f"Keyword: {keyword}, Percentage of {keyword} in {predicted_severity}: {perc:.4%} and Percentage of {keyword} in the other severity: {perc_other:.4%}")


In [13]:
model = r'C:\Users\ncalvaresi\Documents\my_model.pth'

In [14]:
# Load model
MAX_SEQUENCE_LENGTH = 100
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 100
NUM_CLASSES_CATEGORY = len(label_encoder_category.classes_)
NUM_CLASSES_SEVERITY = len(label_encoder_severity.classes_)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Recreate the model architecture
loaded_model = EnhancedTextClassificationModel(VOCAB_SIZE, EMBED_SIZE, NUM_CLASSES_CATEGORY, NUM_CLASSES_SEVERITY)

loaded_model.load_state_dict(torch.load(model))

loaded_model.to(device)

# Initialize spacy model
nlp = spacy.load('en_core_web_sm')

# Example usage
example_text = "There was a small leak in the storage room that could damage everything."
predict_and_analyze_observation(example_text, loaded_model, tokenizer, vocab, MAX_SEQUENCE_LENGTH, device, label_encoder_category, label_encoder_severity, new_filtered_df)

Predicted Category: Materials/Tools, Confidence Score: 54.2326%
Predicted Severity: SAFE, Confidence Score: 89.0431%

Keyword: small leak, Percentage of small leak in SAFE: 42.8571% and Percentage of small leak in the other severity: 57.1429%
Keyword: storage room, Percentage of storage room in SAFE: 25.0000% and Percentage of storage room in the other severity: 75.0000%
Keyword: small, Percentage of small in SAFE: 52.0710% and Percentage of small in the other severity: 47.9290%
Keyword: leak, Percentage of leak in SAFE: 36.2949% and Percentage of leak in the other severity: 63.7051%
Keyword: storage, Percentage of storage in SAFE: 44.8454% and Percentage of storage in the other severity: 55.1546%
Keyword: room, Percentage of room in SAFE: 49.0476% and Percentage of room in the other severity: 50.9524%
Keyword: damage, Percentage of damage in SAFE: 42.2925% and Percentage of damage in the other severity: 57.7075%


: 

In [18]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip
