# Separate Model Training and Evaluation Notebook

## Load Data

In [1]:
import pandas as pd

train_path = '/Users/bilalhussain/Downloads/train_split.csv'
val_path = '/Users/bilalhussain/Downloads/val_split.csv'
test_path = '/Users/bilalhussain/Downloads/test_df.csv'


text_col = "text_with_markers"
target_col = "label_id"

train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

train_df.head()

Unnamed: 0,id,text,text_with_markers,relation_label,relation_type,entity1_text,entity2_text,label_id,num_tokens
0,1939,Even travel agents are turning to telepresence...,Even [E1]travel agents[/E1] are turning to [E2...,"Instrument-Agency(e2,e1)",Instrument-Agency,travel agents,telepresence,11,10
1,6614,The health medical insurance coverage for alle...,The health medical insurance coverage for alle...,"Entity-Origin(e1,e2)",Entity-Origin,drugs,blood,8,23
2,1094,Vietnam's response on the toll caused by the e...,Vietnam's response on the [E1]toll[/E1] caused...,"Cause-Effect(e2,e1)",Cause-Effect,toll,earthquake,1,15
3,2128,My mother bakes the puddings in a lidded dish ...,My [E1]mother[/E1] bakes the puddings in a lid...,"Instrument-Agency(e2,e1)",Instrument-Agency,mother,dish,11,18
4,3305,"A sailing boat has the large mainsail, a small...",A sailing [E1]boat[/E1] has the large [E2]main...,"Component-Whole(e2,e1)",Component-Whole,boat,mainsail,3,24


## Vectorizer

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df[text_col])
y_train = train_df[target_col]

X_val = vectorizer.transform(val_df[text_col])
y_val = val_df[target_col]

X_test = vectorizer.transform(test_df[text_col])
y_test = test_df[target_col]

## SGD Logistic

In [3]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='log_loss', random_state=42)

model.fit(X_train, y_train)
preds = model.predict(X_val)

from sklearn.metrics import f1_score, confusion_matrix, classification_report

macro_f1 = f1_score(y_val, preds, average='macro')

labels = sorted(set(y_val))
labels_wo_other = [l for l in labels if l != "Other"]
macro_f1_wo_other = f1_score(y_val, preds, labels=labels_wo_other, average='macro')

conf_mat = confusion_matrix(y_val, preds)
report = classification_report(y_val, preds)

print("Macro F1:", macro_f1)
print("Macro F1 (no 'Other'):", macro_f1_wo_other)
print("Confusion Matrix:\n", conf_mat)
print("Report:\n", report)

Macro F1: 0.3828539633513239
Macro F1 (no 'Other'): 0.3828539633513239
Confusion Matrix:
 [[ 18  16   1   0   0   0   0   0   0   0   2   0   1   0   0  14   0   0]
 [  0  66   0   0   1   0   1  12   0   0   0   0   1   0   0  17   0   0]
 [  0   0  29   2   3   0   0   2   0   0   3   0   2   0   0  20   0   0]
 [  0   2   8  16   1   0   3   5   0   0   5   0   7   2   0  31   0   0]
 [  0   0   2   1  32   1   1   3   0   0   0   0   1   0   0   8   0   0]
 [  0   0   2   1   3  12   0   3   0   0   0   0   3   0   0   8   0   0]
 [  0   0   2   0   4   0 111   0   0   0   0   0   0   0   0   9   0   1]
 [  0   3   1   0   3   0   1  64   0   0   0   0   2   0   0  11   0   0]
 [  0   0   1   0   0   0   0   1   2   0   0   0   0   0   0  18   0   0]
 [  0   0   1   0   0   0   0   0   0   0   1   0   0   0   0  11   0   1]
 [  0   0   1   0   0   0   1   2   0   0  22   0   2   0   0  32   1   1]
 [  0   0   2   0   0   0   0   1   0   0   0   0   0   0   0   7   0   0]
 [  0   3 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Interpretation & Explanation (Very Short)

### Interpretation
- I check feature weights, the confusion matrix, and the classification report.
- LIME/SHAP help explain individual predictions.

### Qualitative
- Learns keyword patterns, good on frequent classes.
- Weak on rare classes and confuses similar relations.
- Does not understand syntax or deeper meaning.

### Quantitative
- Macro F1 ≈ 0.38, accuracy ≈ 0.48.
- Some classes strong, many very weak.

### Pros
- Fast, simple, and interpretable.

### Cons
- Poor on rare classes, confuses similar labels.
- No understanding of context or structure.


## SGD SVM

In [4]:
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='hinge', random_state=42)

model.fit(X_train, y_train)
preds = model.predict(X_val)

from sklearn.metrics import f1_score, confusion_matrix, classification_report

macro_f1 = f1_score(y_val, preds, average='macro')

labels = sorted(set(y_val))
labels_wo_other = [l for l in labels if l != "Other"]
macro_f1_wo_other = f1_score(y_val, preds, labels=labels_wo_other, average='macro')

conf_mat = confusion_matrix(y_val, preds)
report = classification_report(y_val, preds)

print("Macro F1:", macro_f1)
print("Macro F1 (no 'Other'):", macro_f1_wo_other)
print("Confusion Matrix:\n", conf_mat)
print("Report:\n", report)

Macro F1: 0.5354563271158719
Macro F1 (no 'Other'): 0.5354563271158719
Confusion Matrix:
 [[ 28  13   1   1   0   1   0   1   0   0   3   0   1   0   1   2   0   0]
 [  2  75   0   2   1   1   0   8   0   0   0   0   1   0   0   7   1   0]
 [  2   0  36   2   4   0   2   1   0   1   3   0   1   0   1   7   0   1]
 [  2   4  13  28   2   1   1   3   1   0   7   1   5   2   0   8   1   1]
 [  0   1   1   1  39   1   1   2   1   0   0   0   0   0   0   2   0   0]
 [  0   0   2   1   3  21   0   1   0   0   0   0   2   0   0   2   0   0]
 [  0   1   1   0   3   1 111   1   0   0   2   0   1   0   0   4   0   2]
 [  0   6   1   0   3   0   1  63   2   0   0   0   3   0   0   4   1   1]
 [  0   0   1   1   2   0   0   1  15   0   0   0   0   0   0   2   0   0]
 [  0   0   1   0   0   1   0   0   0   7   2   0   0   0   0   2   1   0]
 [  2   1   4   0   0   1   3   3   0   1  33   0   2   1   0   8   1   2]
 [  0   0   1   0   0   0   0   1   0   0   0   3   1   1   0   3   0   0]
 [  0   2 

## Interpretation & Explanation (Very Short)

### Interpretation
- I check feature weights, confusion matrix, and the classification report.
- LIME/SHAP help explain individual predictions.

### Qualitative Results
- Model learns keyword patterns.
- Good on frequent classes, weak on rare ones.
- Confuses similar relations.
- Does not understand deeper syntax or context.

### Quantitative Results
- Macro F1 ≈ 0.54, accuracy ≈ 0.55.
- Some classes strong, others still weak.

### Pros
- Fast, simple, interpretable, good keyword learner.

### Cons
- Poor on rare classes.
- Confuses similar labels.
- No understanding of word order or deeper meaning.


## MultinomialNB

In [5]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()

model.fit(X_train, y_train)
preds = model.predict(X_val)

from sklearn.metrics import f1_score, confusion_matrix, classification_report

macro_f1 = f1_score(y_val, preds, average='macro')

labels = sorted(set(y_val))
labels_wo_other = [l for l in labels if l != "Other"]
macro_f1_wo_other = f1_score(y_val, preds, labels=labels_wo_other, average='macro')

conf_mat = confusion_matrix(y_val, preds)
report = classification_report(y_val, preds)

print("Macro F1:", macro_f1)
print("Macro F1 (no 'Other'):", macro_f1_wo_other)
print("Confusion Matrix:\n", conf_mat)
print("Report:\n", report)

Macro F1: 0.06506861058894936
Macro F1 (no 'Other'): 0.06506861058894936
Confusion Matrix:
 [[  0   7   0   0   0   0   0   0   0   0   0   0   0   0   0  45   0   0]
 [  0  19   0   0   0   0   0   0   0   0   0   0   0   0   0  79   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  61   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0   0   0   0   0  79   0   0]
 [  0   0   0   0   0   0   1   0   0   0   0   0   0   0   0  48   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  32   0   0]
 [  0   0   0   0   0   0  51   0   0   0   0   0   0   0   0  76   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  85   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  22   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  14   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  62   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  10   0   0]
 [  0   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## MultinomialNB Interpretation (Short)

### Qualitative Explanation
- The model collapses almost everything into one major class.
- It only predicts class 16 correctly because that class dominates.
- It cannot separate classes with similar wording.
- It fails completely on rare classes (almost all recall = 0).

### Quantitative Explanation
- Macro F1 ≈ **0.06**, meaning almost all classes perform near zero.
- Accuracy ≈ **0.23**, but this is misleading since it predicts mostly one class.
- Most classes show precision = 0 and recall = 0.
- The confusion matrix shows nearly all rows mapping to class 16.

### Pros
- Very fast and simple.
- Works well when classes are clearly separated and vocabulary-based.
- Good for binary or low-class problems.

### Cons
- Performs extremely poorly on multi-class relational tasks.
- Predicts majority class only.
- Cannot capture relational meaning or context.
- Completely fails on rare classes.


## Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=200, random_state=42)

model.fit(X_train, y_train)
preds = model.predict(X_val)

from sklearn.metrics import f1_score, confusion_matrix, classification_report

macro_f1 = f1_score(y_val, preds, average='macro')

labels = sorted(set(y_val))
labels_wo_other = [l for l in labels if l != "Other"]
macro_f1_wo_other = f1_score(y_val, preds, labels=labels_wo_other, average='macro')

conf_mat = confusion_matrix(y_val, preds)
report = classification_report(y_val, preds)

print("Macro F1:", macro_f1)
print("Macro F1 (no 'Other'):", macro_f1_wo_other)
print("Confusion Matrix:\n", conf_mat)
print("Report:\n", report)

Macro F1: 0.3413111505645179
Macro F1 (no 'Other'): 0.3413111505645179
Confusion Matrix:
 [[ 28   7   0   0   0   0   2   0   0   0   1   0   0   0   0  14   0   0]
 [  2  56   0   0   2   0   2  20   0   0   0   0   0   0   0  16   0   0]
 [  0   0  16   0   4   0   4   3   0   0   4   0   3   2   0  25   0   0]
 [  1   0   4   4   1   1   1   8   0   0   6   0   8   1   0  45   0   0]
 [  0   0   1   0  32   1   2   3   0   0   0   0   1   0   0   9   0   0]
 [  0   0   2   0   2  21   0   1   0   0   0   0   0   0   0   6   0   0]
 [  0   0   1   0   3   1 118   0   0   0   0   0   2   0   0   2   0   0]
 [  0   2   0   0   1   0   0  70   0   0   0   0   1   0   0  11   0   0]
 [  0   0   2   0   0   0   0   1   0   0   0   0   4   0   0  15   0   0]
 [  0   0   0   0   0   0   1   0   0   1   1   0   0   0   0  10   0   1]
 [  0   0   2   0   2   0   2   3   0   0  16   0   5   0   0  31   1   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   9   0   0]
 [  0   6 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## Random Forest (Short Interpretation)

### Qualitative Explanation
- The model learns some patterns but struggles with many classes.
- It performs well on large classes (like class 6 and 8).
- Rare classes and classes with similar wording are often confused.
- Random Forest is not well-suited for sparse text features like TF-IDF.

### Quantitative Explanation
- Macro F1 ≈ **0.34**, which is lower than SGD hinge (0.54).
- Accuracy ≈ **0.44**, affected by imbalance.
- Some classes have good recall, but several have near-zero performance.
- Confusion matrix shows many scattered predictions.

### Pros
- Handles noise better than Naive Bayes.
- Can learn non-linear patterns.
- Works fine for structured data.

### Cons
- Performs poorly on high-dimensional text data.
- Weak on rare classes.
- Not interpretable for text.
- Much worse than SGD for multi-class relation extraction.


## Select Best Model & Evaluate on Test Set

In [7]:
# NOTE: After running above cells manually choose best model and evaluate here.

In [8]:
## Deep Learning Model (Keras LSTM)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 20000
max_len = 200
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_df[text_col])
X_train_seq = tokenizer.texts_to_sequences(train_df[text_col])
X_test_seq = tokenizer.texts_to_sequences(test_df[text_col])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

model_dl = tf.keras.Sequential([
    # tf.keras.layers.Embedding(max_words, 128, input_length=max_len),
    tf.keras.layers.Embedding(max_words, 128),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model_dl.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_dl.fit(X_train_pad, y_train, epochs=3, batch_size=64, validation_split=0.1)

test_loss, test_acc = model_dl.evaluate(X_test_pad, y_test)
print('DL Test Accuracy:', test_acc)


Epoch 1/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 85ms/step - accuracy: 0.0824 - loss: -88.2343 - val_accuracy: 0.0824 - val_loss: -128.9538
Epoch 2/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 85ms/step - accuracy: 0.0825 - loss: -167.3848 - val_accuracy: 0.0824 - val_loss: -183.5011
Epoch 3/3
[1m96/96[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 85ms/step - accuracy: 0.0825 - loss: -223.6037 - val_accuracy: 0.0824 - val_loss: -234.8213
[1m85/85[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0714 - loss: -249.2389
DL Test Accuracy: 0.07140228152275085


## Deep Learning Model (Short Explanation)

### What happened
- The LSTM model did not learn anything (accuracy ≈ 0.07).
- The loss became negative, which means the model was using the wrong setup.
- The model treats the task as binary classification, but the dataset is multi-class.
- Because of this mismatch, the network collapses and predicts one class only.

### Why it failed
- Using `Dense(1, activation='sigmoid')` is only for binary tasks.
- Your labels have many classes, so the model cannot learn the correct outputs.
- `binary_crossentropy` is also incorrect for multi-class relational data.

### How to fix it
- Use `Dense(num_classes, activation='softmax')`.
- Use `loss='sparse_categorical_crossentropy'`.
- Encode labels as integers 0...N-1.

### Short summary
The LSTM failed because it used a binary setup on a multi-class problem.  
To work, it must be changed to a proper multi-class architecture.
