In [1]:
import json
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sys
sys.path.insert(0, '../utils')
import data_utils

sys.path.insert(0, '../models')
import log_reg_word_embeddings

In [3]:
df_train = data_utils.load_data_set('../data/train/train.csv')
df_train_x = df_train[['head.word', 'tail.word', 'sentence']]
df_train_y = df_train['relation']

In [4]:
model = log_reg_word_embeddings.LogRegWordEmbeddings()

In [8]:
model.fit(df_train_x, df_train_y)

Tokenizing sentences...
0    [she, also, oversaw, refinancing, state, super...
1    [more, 2, 5, million, cubic, yards, contaminat...
2    [the, onondaga, nation, 1, 500, members, feder...
3    [he, born, istanbul, raised, eastern, city, ad...
4    [by, end, recent, tour, rollins, met, soldiers...
Name: sentence, dtype: object
Averaging word embeddings...
0    [-0.024940285714285717, -0.014328357142857137,...
1    [0.022366650000000002, 0.060093550000000016, -...
2    [-0.02160072727272727, 0.10642886363636364, -0...
3    [0.012861874999999998, 0.0178651875, -0.102017...
4    [0.07083508108108107, 0.03429608108108108, -0....
Name: sentence, dtype: object
Shape of transformed input: (522517, 50)
Fitting label encoder...
['/broadcast/content/location' '/broadcast/producer/location'
 '/business/business_location/parent_company' '/business/company/advisors'
 '/business/company/founders' '/business/company/industry'
 '/business/company/locations' '/business/company/major_shareholders'
 '/bu

In [12]:
model.predict(df_train_x.head())

Tokenizing sentences...
0    [she, also, oversaw, refinancing, state, super...
1    [more, 2, 5, million, cubic, yards, contaminat...
2    [the, onondaga, nation, 1, 500, members, feder...
3    [he, born, istanbul, raised, eastern, city, ad...
4    [by, end, recent, tour, rollins, met, soldiers...
Name: sentence, dtype: object
Averaging word embeddings...
0    [-0.024940285714285717, -0.014328357142857137,...
1    [0.022366650000000002, 0.060093550000000016, -...
2    [-0.02160072727272727, 0.10642886363636364, -0...
3    [0.012861874999999998, 0.0178651875, -0.102017...
4    [0.07083508108108107, 0.03429608108108108, -0....
Name: sentence, dtype: object
Shape of transformed input: (5, 50)


array(['NA', 'NA', 'NA', 'NA', 'NA'], dtype=object)

In [17]:
import pickle

save_dir = '../trained_models/log_reg_word_embeddings.pkl'
with open(save_dir, 'wb') as save_file:
    pickle.dump(model.model, save_file)

In [19]:
with open(save_dir, 'rb') as saved_file:
    loaded_model = pickle.load(saved_file)

In [22]:
loaded_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=4, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [23]:
df_test = data_utils.load_data_set('../data/test/test.csv')
df_test_x = df_train[['head.word', 'tail.word', 'sentence']]
df_test_y = df_train['relation']

In [24]:
predictions = model.predict(df_test_x)

Tokenizing sentences...
0    [she, also, oversaw, refinancing, state, super...
1    [more, 2, 5, million, cubic, yards, contaminat...
2    [the, onondaga, nation, 1, 500, members, feder...
3    [he, born, istanbul, raised, eastern, city, ad...
4    [by, end, recent, tour, rollins, met, soldiers...
Name: sentence, dtype: object
Averaging word embeddings...
0    [-0.024940285714285717, -0.014328357142857137,...
1    [0.022366650000000002, 0.060093550000000016, -...
2    [-0.02160072727272727, 0.10642886363636364, -0...
3    [0.012861874999999998, 0.0178651875, -0.102017...
4    [0.07083508108108107, 0.03429608108108108, -0....
Name: sentence, dtype: object
Shape of transformed input: (522517, 50)


In [54]:
from collections import Counter
Counter(predictions)

predicted_labels = predictions
gold_labels = df_test_y

Counter({'NA': 499204,
         '/location/location/contains': 15861,
         '/location/neighborhood/neighborhood_of': 3674,
         '/business/person/company': 497,
         '/people/person/nationality': 1602,
         '/location/country/capital': 523,
         '/location/country/administrative_divisions': 710,
         '/people/person/place_lived': 439,
         '/people/deceased_person/place_of_death': 1,
         '/business/company/founders': 6})

In [49]:
# Evaluation

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

def compute_score(predicted_labels, gold_labels, average='weighted'):
    accuracy = accuracy_score(gold_labels, predicted_labels)
    precision = precision_score(gold_labels, predicted_labels, average=average)
    recall = recall_score(gold_labels, predicted_labels, average=average)
    f1 = f1_score(gold_labels, predicted_labels, average=average)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

compute_score(predicted_labels, gold_labels)

{'accuracy': 0.7346880579961992,
 'precision': 0.6301679116706429,
 'recall': 0.7346880579961992,
 'f1': 0.6516748864717016}