In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
# Load the dataset
df = pd.read_csv("training_data.csv")

df = df.rename(columns={'is LCI inventory table?': 'label'})

# Convert labels to binary.
# Series.map turns every value that is not a key of the mapping into NaN
# (different casing, stray whitespace, blank cells, ...). Check for such values
# up front so they fail here with a clear message instead of flowing into the
# stratified split and the metrics as NaN labels.
label_map = {'Yes': 1, 'No': 0}
unexpected_labels = df.loc[~df['label'].isin(list(label_map)), 'label']
if not unexpected_labels.empty:
    raise ValueError(
        f"Unexpected values in 'label' column (expected one of {list(label_map)}): "
        f"{unexpected_labels.unique().tolist()}"
    )
df['label'] = df['label'].map(label_map)

# Split the dataset for 80% train and 20% test.
# stratify keeps the Yes/No proportion the same in both parts; the fixed
# random_state makes the split reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Define LCI table title keywords
keywords = [
    "inventory", "input", "output", "flow", "consumption", "emission",
    "energy", "material", "resource", "process data", "raw material"
]

# Single whole-word alternation over all keywords, compiled once instead of
# building one pattern per keyword on every call. re.escape keeps a keyword
# that contains regex metacharacters from changing the pattern's meaning.
# NOTE: re-run this cell after editing `keywords` so the pattern is rebuilt.
_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(kw) for kw in keywords) + r")\b"
)

def rule_based_classification(title):
    """Classify a table title as an LCI inventory table using keyword-based rules.

    The title is lower-cased and searched for any entry of ``keywords`` as a
    whole word or phrase (``\\b`` boundaries), so "Energy use" matches "energy"
    but "Inputs" does not match "input".

    Args:
        title: Table title. Non-string values (e.g. the NaN pandas produces for
            an empty cell, or None) are treated as "no keyword found".

    Returns:
        1 if at least one keyword occurs in the title, otherwise 0.
    """
    # A missing title cannot contain a keyword; the guard also avoids calling
    # .lower() on a float NaN, which would raise AttributeError.
    if not isinstance(title, str):
        return 0
    return int(_KEYWORD_PATTERN.search(title.lower()) is not None)

# Apply to test set.
# assign() returns a new frame with the extra column; writing the column
# directly into the frame returned by train_test_split can trigger pandas'
# SettingWithCopyWarning.
test_df = test_df.assign(
    predicted=test_df['Table_title'].apply(rule_based_classification)
)

# Evaluate.
# labels=[0, 1] pins the class order (0 = Non-LCI, 1 = LCI) so the matrix is
# always 2x2 and the row/column names and target_names below line up with the
# classes even if one class is absent from the predictions.
class_labels = [0, 1]

cm = confusion_matrix(test_df['label'], test_df['predicted'], labels=class_labels)
cm_df = pd.DataFrame(cm, index=['True Non-LCI', 'True LCI'], columns=['Pred Non-LCI', 'Pred LCI'])

report = classification_report(
    test_df['label'], test_df['predicted'],
    labels=class_labels,
    target_names=['Non-LCI', 'LCI'],
    digits=3  # keep 3 decimal places
)

print(cm_df)
print(report)

              Pred Non-LCI  Pred LCI
True Non-LCI            15         2
True LCI                 5        10
              precision    recall  f1-score   support

     Non-LCI      0.750     0.882     0.811        17
         LCI      0.833     0.667     0.741        15

    accuracy                          0.781        32
   macro avg      0.792     0.775     0.776        32
weighted avg      0.789     0.781     0.778        32

