In [2]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# logisticRegression.ipynb
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [4]:
import os
import pickle

import numpy as np
import scipy.io as sio
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [43]:
# Read the two MATLAB files used by this notebook: one holding the token
# feature matrices, the other holding the label arrays and label names.
features = sio.loadmat(file_name='data/features.mat')
labels = sio.loadmat(file_name='data/labels.mat')

In [44]:
# Class names, taken in the order they are stored under the 'labels' key.
label_names = [entry[0] for entry in labels['labels'][0]]

# For each split, pair the token feature matrix with its label array
# flattened to one dimension via ravel().
x_train, y_train = features['trainTokenFeatures'], labels['trainLabels'].ravel()
x_val, y_val = features['validTokenFeatures'], labels['validLabels'].ravel()
x_test, y_test = features['testTokenFeatures'], labels['testLabels'].ravel()

In [45]:
# Show the label vocabulary followed by the feature/label array shapes of
# every split, framed by two rules of asterisks.
print("*" * 100)
print("Label Names   :", label_names)
for x_caption, x_split, y_caption, y_split in (
    ("x_train shape :", x_train, "y_train shape :", y_train),
    ("x_val shape   :", x_val, "y_val shape   :", y_val),
    ("x_test shape  :", x_test, "y_test shape  :", y_test),
):
    print()  # blank line separating the splits
    print(x_caption, x_split.shape)
    print(y_caption, y_split.shape)
print("*" * 100)

****************************************************************************************************
Label Names   : ['B-DATE', 'B-LOCATION', 'B-MONEY', 'B-ORGANIZATION', 'B-PERCENT', 'B-PERSON', 'B-TIME', 'I-DATE', 'I-LOCATION', 'I-MONEY', 'I-ORGANIZATION', 'I-PERCENT', 'I-PERSON', 'I-TIME', 'O']

x_train shape : (380181, 768)
y_train shape : (380181,)

x_val shape   : (45415, 768)
y_val shape   : (45415,)

x_test shape  : (45187, 768)
y_test shape  : (45187,)
****************************************************************************************************


In [26]:
# Unpack (n_samples, n_features) for every split.
# The second dimension is bound to a real name instead of the throwaway `_`:
# at notebook top level `_` is IPython's "last output" variable, and assigning
# to it stops IPython from refreshing `_`, `__` and `___`.
n_train, feat_dim = x_train.shape
n_val, val_feat_dim = x_val.shape
n_test, test_feat_dim = x_test.shape

# The classifier is fitted on x_train, so the other splits must live in the
# same feature space; fail here with a clear message rather than at predict().
assert val_feat_dim == feat_dim and test_feat_dim == feat_dim, (
    "train/val/test feature dimensions differ: %d / %d / %d"
    % (feat_dim, val_feat_dim, test_feat_dim)
)

In [27]:
# Summarise the number of samples per split and the feature dimensionality,
# framed by two rules of asterisks.
print("*" * 100)
for line_template, quantity in (
    ("Number of Train samples  : %d", n_train),
    ("Number of Val samples    : %d", n_val),
    ("Number of Test samples   : %d", n_test),
    ("Feature dimension        : %d", feat_dim),
):
    print(line_template % quantity)
print("*" * 100)

****************************************************************************************************
Number of Train samples  : 380181
Number of Val samples    : 45415
Number of Test samples   : 45187
Feature dimension        : 768
****************************************************************************************************


In [None]:
# Fit a logistic-regression classifier on the training tokens.
# random_state is fixed so repeated runs use the same seed.
# NOTE(review): every other hyperparameter is left at the library default,
# which may differ between scikit-learn versions -- confirm the version used.
lr = LogisticRegression(random_state=123, solver='lbfgs')
lr.fit(X=x_train, y=y_train)

In [29]:
# SAVE LOGISTIC REGRESSION MODEL
with open('model/logistic_regression.pickle', 'wb') as outfile:
    pickle.dump(lr, outfile, pickle.HIGHEST_PROTOCOL)
    print("Logistic Regression model is saved.")

Logistic Regression model is saved.


In [30]:
# LOAD LOGISTIC REGRESSION MODEL
with open('model/logistic_regression.pickle', 'rb') as infile:
    lr = pickle.load(infile)

In [33]:
# Build the label order used in the reports below.
# Sorting on everything after the first character groups the tags by entity
# type; the sort is stable, so tags with equal keys keep their original
# relative order.
sorted_label_names = list(label_names)
sorted_label_names.sort(key=lambda tag: tag[1:])

# Drop the 'O' tag so the reports cover only the remaining labels.
sorted_label_names.remove('O')

# Position of each remaining tag in the original label_names list.
sorted_label_indices = list(map(label_names.index, sorted_label_names))

In [34]:
# Predict on the training split and print per-label precision/recall/F1,
# restricted to the labels listed in sorted_label_indices.
pred_train = lr.predict(x_train)
print("### TRAIN CLASSIFICATION REPORT ###\n")
print(
    classification_report(
        y_true=y_train,
        y_pred=pred_train,
        labels=sorted_label_indices,
        target_names=sorted_label_names,
    )
)

### TRAIN CLASSIFICATION REPORT ###

                precision    recall  f1-score   support

        B-DATE       0.88      0.51      0.65      2831
        I-DATE       0.67      0.46      0.55      1289
    B-LOCATION       0.76      0.66      0.71      8459
    I-LOCATION       0.80      0.23      0.36      1526
       B-MONEY       0.48      0.04      0.08       521
       I-MONEY       0.84      0.59      0.70       960
B-ORGANIZATION       0.81      0.47      0.60      8033
I-ORGANIZATION       0.74      0.26      0.39      5468
     B-PERCENT       0.97      1.00      0.99       574
     I-PERCENT       0.78      0.43      0.55       581
      B-PERSON       0.59      0.38      0.46     12727
      I-PERSON       0.48      0.12      0.20      6271
        B-TIME       0.99      0.86      0.92       169
        I-TIME       0.00      0.00      0.00        23

     micro avg       0.71      0.41      0.52     49432
     macro avg       0.70      0.43      0.51     49432
  weighte

  'precision', 'predicted', average, warn_for)


In [36]:
# Predict on the validation split and print per-label precision/recall/F1,
# restricted to the labels listed in sorted_label_indices.
pred_val = lr.predict(x_val)
print("### VAL CLASSIFICATION REPORT ###\n")
print(
    classification_report(
        y_true=y_val,
        y_pred=pred_val,
        labels=sorted_label_indices,
        target_names=sorted_label_names,
    )
)

### VAL CLASSIFICATION REPORT ###

                precision    recall  f1-score   support

        B-DATE       0.84      0.55      0.66       272
        I-DATE       0.67      0.41      0.51       112
    B-LOCATION       0.74      0.69      0.71       950
    I-LOCATION       0.52      0.20      0.29       172
       B-MONEY       0.12      0.01      0.02        73
       I-MONEY       0.92      0.60      0.73       146
B-ORGANIZATION       0.81      0.49      0.61      1001
I-ORGANIZATION       0.65      0.18      0.28       690
     B-PERCENT       1.00      1.00      1.00        43
     I-PERCENT       0.78      0.42      0.55        43
      B-PERSON       0.65      0.42      0.51      1749
      I-PERSON       0.40      0.11      0.17       738
        B-TIME       1.00      0.50      0.67         6
        I-TIME       0.00      0.00      0.00         3

     micro avg       0.70      0.41      0.52      5998
     macro avg       0.65      0.40      0.48      5998
  weighted 

  'precision', 'predicted', average, warn_for)


In [46]:
# Predict on the test split and print per-label precision/recall/F1,
# restricted to the labels listed in sorted_label_indices.
pred_test = lr.predict(x_test)
print("### TEST CLASSIFICATION REPORT ###\n")
print(
    classification_report(
        y_true=y_test,
        y_pred=pred_test,
        labels=sorted_label_indices,
        target_names=sorted_label_names,
    )
)

### TEST CLASSIFICATION REPORT ###

                precision    recall  f1-score   support

        B-DATE       0.87      0.51      0.64       364
        I-DATE       0.57      0.30      0.39       181
    B-LOCATION       0.77      0.64      0.70      1091
    I-LOCATION       0.76      0.20      0.32       240
       B-MONEY       0.33      0.02      0.03       113
       I-MONEY       0.96      0.51      0.67       235
B-ORGANIZATION       0.78      0.38      0.51       862
I-ORGANIZATION       0.79      0.22      0.34       826
     B-PERCENT       1.00      0.95      0.98       107
     I-PERCENT       0.77      0.31      0.45       108
      B-PERSON       0.58      0.37      0.45      1594
      I-PERSON       0.45      0.09      0.15       785
        B-TIME       0.95      0.91      0.93        23
        I-TIME       0.00      0.00      0.00         2

     micro avg       0.72      0.37      0.49      6531
     macro avg       0.68      0.39      0.47      6531
  weighted

  'precision', 'predicted', average, warn_for)
