In [4]:
import json
import numpy as np
import nltk
import re
import string
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from nltk.tokenize import word_tokenize
from collections import Counter
from scipy.sparse import hstack


# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the 'punkt_tab' resource

def load_data(file_path):
    """ Load JSON data from a file. """
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

def preprocess_text(text):
    """ Preprocess text by lowercasing, removing numbers/punctuation, and filtering stopwords. """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(text)  # Tokenize words
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return " ".join(words)

def extract_features_and_labels(data):
    """ Extract features (text, POS, dependency relations, entity types) and labels. """
    texts, pos_tags, dep_rels, entity_types, labels = [], [], [], [], []
    
    for item in data:
        sentence = " ".join(item["token"])
        texts.append(preprocess_text(sentence))
        pos_tags.append(" ".join(item["stanford_pos"]))
        dep_rels.append(" ".join(item["stanford_deprel"]))
        entity_types.append(item["subj_type"] + " " + item["obj_type"])
        labels.append(item["relation"])
    
    return texts, pos_tags, dep_rels, entity_types, labels

# Load dataset
train_data = load_data("train.json")
test_data = load_data("test.json")

# Extract features and labels
X_train_text, X_train_pos, X_train_dep, X_train_ent, y_train = extract_features_and_labels(train_data)
X_test_text, X_test_pos, X_test_dep, X_test_ent, y_test = extract_features_and_labels(test_data)

# TF-IDF vectorization
vectorizer_text = TfidfVectorizer()
vectorizer_pos = TfidfVectorizer()
vectorizer_dep = TfidfVectorizer()
vectorizer_ent = TfidfVectorizer()

X_train_tfidf = vectorizer_text.fit_transform(X_train_text)
X_test_tfidf = vectorizer_text.transform(X_test_text)
X_train_pos_tfidf = vectorizer_pos.fit_transform(X_train_pos)
X_test_pos_tfidf = vectorizer_pos.transform(X_test_pos)
X_train_dep_tfidf = vectorizer_dep.fit_transform(X_train_dep)
X_test_dep_tfidf = vectorizer_dep.transform(X_test_dep)
X_train_ent_tfidf = vectorizer_ent.fit_transform(X_train_ent)
X_test_ent_tfidf = vectorizer_ent.transform(X_test_ent)

# Combine all feature vectors
X_train_combined = hstack([X_train_tfidf, X_train_pos_tfidf, X_train_dep_tfidf, X_train_ent_tfidf])
X_test_combined = hstack([X_test_tfidf, X_test_pos_tfidf, X_test_dep_tfidf, X_test_ent_tfidf])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\melod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\melod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Save vectorizers
with open('svm_vectorizer_text.pickle', 'wb') as file:
    pickle.dump(vectorizer_text, file)

with open('svm_vectorizer_pos.pickle', 'wb') as file:
    pickle.dump(vectorizer_pos, file)

with open('svm_vectorizer_dep.pickle', 'wb') as file:
    pickle.dump(vectorizer_dep, file)

with open('svm_vectorizer_ent.pickle', 'wb') as file:
    pickle.dump(vectorizer_ent, file)

In [5]:
# Convert labels to binary (relation vs. no_relation)
binary_labels_train = np.array(["relation" if lbl != "no_relation" else "no_relation" for lbl in y_train])
binary_labels_test = np.array(["relation" if lbl != "no_relation" else "no_relation" for lbl in y_test])

# Balance training data using SMOTE
print("Original train distribution:", Counter(binary_labels_train))
smote = SMOTE(sampling_strategy='not majority', random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_combined, binary_labels_train)
print("Balanced train distribution:", Counter(y_train_balanced))

# Balance test data using RandomUnderSampler
print("Original test distribution:", Counter(binary_labels_test))
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_test_balanced, y_test_balanced = rus.fit_resample(X_test_combined, binary_labels_test)
print("Balanced test distribution:", Counter(y_test_balanced))

# Train binary SVM classifier
svm_binary = SVC(kernel='rbf', class_weight='balanced', C=1.0, probability=True)
svm_binary.fit(X_train_balanced, y_train_balanced)

# Predict on test set
y_pred_binary = svm_binary.predict(X_test_balanced)
print("Binary Classification Report:")
print(classification_report(y_test_balanced, y_pred_binary))

# Save model
with open('svm_binary_model.pickle', 'wb') as file:
    pickle.dump(svm_binary, file)

Original train distribution: Counter({'no_relation': 55112, 'relation': 13012})
Balanced train distribution: Counter({'relation': 55112, 'no_relation': 55112})
Original test distribution: Counter({'no_relation': 12184, 'relation': 3325})
Balanced test distribution: Counter({'no_relation': 3325, 'relation': 3325})
Binary Classification Report:
              precision    recall  f1-score   support

 no_relation       0.57      0.94      0.71      3325
    relation       0.84      0.30      0.44      3325

    accuracy                           0.62      6650
   macro avg       0.71      0.62      0.58      6650
weighted avg       0.71      0.62      0.58      6650



In [6]:
# Extract only "relation" instances for multi-class classification
relation_indices = [i for i, label in enumerate(y_train) if label != "no_relation"]
X_train_relation = X_train_combined[relation_indices]
y_train_relation = np.array(y_train)[relation_indices]

# Train multi-class SVM WITHOUT SMOTE (no oversampling)
svm_multi = SVC(kernel='rbf', class_weight='balanced', C=1.0, probability=True)
svm_multi.fit(X_train_relation, y_train_relation)

# Extract only "relation" instances from test data
relation_indices_test = [i for i, label in enumerate(y_test) if label != "no_relation"]
X_test_relation = X_test_combined[relation_indices_test]
y_test_relation = np.array(y_test)[relation_indices_test]

# Predict multi-class labels
y_pred_multi = svm_multi.predict(X_test_relation)
print("Multi-class Classification Report:")
print(classification_report(y_test_relation, y_pred_multi))

# Save model
with open('svm_multi_model.pickle', 'wb') as file:
    pickle.dump(svm_multi, file)

Multi-class Classification Report:
                                     precision    recall  f1-score   support

                org:alternate_names       0.86      0.79      0.82       213
           org:city_of_headquarters       0.99      0.94      0.96        82
        org:country_of_headquarters       0.91      0.97      0.94       108
                      org:dissolved       0.50      0.50      0.50         2
                        org:founded       0.97      0.97      0.97        37
                     org:founded_by       0.55      0.09      0.15        68
                      org:member_of       0.12      0.17      0.14        18
                        org:members       0.00      0.00      0.00        31
    org:number_of_employees/members       1.00      1.00      1.00        19
                        org:parents       0.35      0.37      0.36        62
org:political/religious_affiliation       1.00      1.00      1.00        10
                   org:shareholders     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
def predict_relation(user_text, entity1, entity2):
    """ Predict the relation between two entities in a user-provided sentence. """
    processed_input = preprocess_text(user_text)
    input_vector_text = vectorizer_text.transform([processed_input])
    input_vector_pos = vectorizer_pos.transform(["unknown"])
    input_vector_dep = vectorizer_dep.transform(["unknown"])
    input_vector_ent = vectorizer_ent.transform(["unknown unknown"])
    
    input_vector_combined = np.hstack([input_vector_text.toarray(), input_vector_pos.toarray(), input_vector_dep.toarray(), input_vector_ent.toarray()])
    binary_prediction = svm_binary.predict(input_vector_combined)[0]
    
    if binary_prediction == "no_relation":
        return "No relation detected"
    
    multi_prediction = svm_multi.predict(input_vector_combined)[0]
    return multi_prediction

# Example prediction
user_sentence = "Barack Obama was born in Hawaii."
entity1 = "Barack Obama"
entity2 = "Hawaii"
predicted_relation = predict_relation(user_sentence, entity1, entity2)
print("Predicted Relation:", predicted_relation)

Predicted Relation: per:countries_of_residence


In [8]:
# Example User Input
user_sentence = input("Please input a sentence:")
entity1 = input("Please input the first entity:")
entity2 = input("Please input the second entity:")
predicted_relation = predict_relation(user_sentence, entity1, entity2)
print("Predicted Relation:", predicted_relation)

Please input a sentence: David was born in Italy.
Please input the first entity: David
Please input the second entity: Italy
Predicted Relation: per:countries_of_residence
