In [1]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# logisticRegression.ipynb
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from seqeval.metrics import classification_report
import scipy.io as sio
import numpy as np
import pickle

In [3]:
def readFile(filepath, encoding='utf-8'):
    """Read a tab-separated token file and group its rows into sentences.

    Every line must hold exactly four tab-separated fields. A row whose first
    field is ``<S>`` is a sentence boundary: the rows collected so far are
    stored as one sentence and the boundary row itself is discarded.

    Parameters
    ----------
    filepath : str
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to UTF-8 so the result
        does not depend on the platform's default encoding.

    Returns
    -------
    list[list[str]]
        One inner list per sentence; each element is the whitespace-stripped
        raw line (all four fields, still tab-separated).

    Raises
    ------
    ValueError
        If a line does not contain exactly four tab-separated fields.
    """
    text     = []
    sentence = []

    with open(filepath, 'r', encoding=encoding) as infile:
        for lineno, line in enumerate(infile, start=1):
            row    = line.strip()
            fields = row.split('\t')

            if len(fields) != 4:
                raise ValueError(
                    "%s:%d: expected 4 tab-separated fields, got %d"
                    % (filepath, lineno, len(fields))
                )

            if fields[0] == '<S>':
                text.append(sentence)
                sentence = []
                continue

            sentence.append(row)

    # Rows after the last '<S>' marker would otherwise be lost when the file
    # does not end with a boundary row.
    if sentence:
        text.append(sentence)

    return text

In [4]:
# Parse the three dataset splits (in train, valid, test order) into lists of
# sentences, each sentence being a list of raw tab-separated token rows.
trainText, validText, testText = map(
    readFile,
    ('data/train.txt', 'data/valid.txt', 'data/test.txt'),
)

In [5]:
# Load the contents of data/features.mat (MATLAB format) through scipy.io;
# the per-split entries are pulled out of `features` by key in the next cell.
features  = sio.loadmat('data/features.mat')

In [8]:
# Pull the per-split token feature arrays out of the loaded .mat contents.
trainFeatures, validFeatures, testFeatures = (
    features[key]
    for key in ('trainTokenFeatures', 'validTokenFeatures', 'testTokenFeatures')
)

In [9]:
# The gold tag is the last tab-separated field of every token row; keep the
# sentence nesting so the result has one tag list per sentence.
trainLabels, validLabels, testLabels = (
    [[row.rsplit('\t', 1)[-1] for row in sentence] for sentence in split_text]
    for split_text in (trainText, validText, testText)
)

In [10]:
# Label inventory: the sorted distinct tags seen in the training split.
# A tag's position in this list is the integer class id used from here on.
train_tags    = [tag for sent_tags in trainLabels for tag in sent_tags]
unique_labels = list(np.unique(train_tags))
print(unique_labels)

['B-DATE', 'B-LOCATION', 'B-MONEY', 'B-ORGANIZATION', 'B-PERCENT', 'B-PERSON', 'B-TIME', 'I-DATE', 'I-LOCATION', 'I-MONEY', 'I-ORGANIZATION', 'I-PERCENT', 'I-PERSON', 'I-TIME', 'O']


In [11]:
# Convert every tag string to its integer class id (its position in
# `unique_labels`). The *Labels lists already contain bare tags, so no further
# splitting is needed, and a dict lookup replaces a linear list scan per token.
# A tag missing from `unique_labels` (i.e. never seen in training) raises KeyError.
label_to_idx = {str(label): idx for idx, label in enumerate(unique_labels)}

trainLabelsIdx = [[label_to_idx[label] for label in sentence] for sentence in trainLabels]
validLabelsIdx = [[label_to_idx[label] for label in sentence] for sentence in validLabels]
testLabelsIdx  = [[label_to_idx[label] for label in sentence] for sentence in testLabels]

In [12]:
# Model inputs are the feature arrays as loaded.
x_train, x_valid, x_test = trainFeatures, validFeatures, testFeatures

# Targets: flatten the per-sentence class ids into one 1-D array per split,
# preserving sentence order and token order.
y_train, y_valid, y_test = (
    np.asarray([class_id for sentence in split_ids for class_id in sentence])
    for split_ids in (trainLabelsIdx, validLabelsIdx, testLabelsIdx)
)

In [13]:
# Number of rows (first dimension) of each split's feature array.
n_train, n_valid, n_test = (
    split_features.shape[0] for split_features in (x_train, x_valid, x_test)
)

In [14]:
# Merge train and valid into a single set (train rows first, then valid rows);
# the targets are joined in the same order so rows and labels stay aligned.
x_trainval = np.vstack((x_train, x_valid))
y_trainval = np.concatenate((y_train, y_valid))

In [None]:
# Hyper-parameter grid: 7 log-spaced values of C from 1e-3 to 1e3, with the
# l2 (ridge) penalty only (l1 would be lasso).
grid = {
    "C": np.logspace(-3, 3, 7),
    "penalty": ['l2'],
}
lr    = LogisticRegression(solver='lbfgs', random_state=123, verbose=True)
# 3-fold cross-validated search, run on the merged train+valid set.
lr_cv = GridSearchCV(estimator=lr, param_grid=grid, cv=3)
lr_cv.fit(x_trainval, y_trainval)

In [117]:
# Report the winning grid point and the score the search recorded for it.
best_params = lr_cv.best_params_
best_score  = lr_cv.best_score_
print("tuned hpyerparameters :(best parameters) ", best_params)
print("accuracy :", best_score)

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
accuracy : 0.9105607195556349


In [None]:
# Fit the final model on train+valid with the values reported by the grid
# search (C=1.0, l2 penalty); same solver and seed as in the search.
lr = LogisticRegression(
    C=1.0,
    penalty='l2',
    solver='lbfgs',
    random_state=123,
)
lr.fit(x_trainval, y_trainval)

In [119]:
# SAVE LOGISTIC REGRESSION MODEL
# The 'model/' directory has to exist already; open() does not create it.
with open('model/logistic_regression.pickle', mode='wb') as model_file:
    pickle.dump(lr, model_file, protocol=pickle.HIGHEST_PROTOCOL)
    print("Logistic Regression model is saved.")

Logistic Regression model is saved.


In [6]:
# LOAD LOGISTIC REGRESSION MODEL
# Rebinds `lr` to the estimator stored in the pickle file. Unpickling can
# execute arbitrary code, so only load a file produced by this notebook.
with open('model/logistic_regression.pickle', mode='rb') as model_file:
    lr = pickle.load(model_file)

In [None]:
trainPredIdx = lr.predict(x_train)

# The predictions form one flat sequence of class ids (one per row of x_train).
# Regroup them into per-sentence tag lists with the same nesting as
# trainLabels, which is the format handed to classification_report below.
# NOTE: the model is fitted on train+valid in this notebook, so these are
# in-sample scores.
trainPredLabel = []
offset         = 0

for gold_sentence in trainLabels:
    n_tokens = len(gold_sentence)
    trainPredLabel.append(
        [unique_labels[trainPredIdx[offset + k]] for k in range(n_tokens)]
    )
    offset += n_tokens

print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainLabels, trainPredLabel))

In [None]:
validPredIdx = lr.predict(x_valid)

# The predictions form one flat sequence of class ids (one per row of x_valid).
# Regroup them into per-sentence tag lists with the same nesting as
# validLabels, which is the format handed to classification_report below.
# NOTE: the model is fitted on train+valid in this notebook, so the valid
# split is not held out for these scores.
validPredLabel = []
offset         = 0

for gold_sentence in validLabels:
    n_tokens = len(gold_sentence)
    validPredLabel.append(
        [unique_labels[validPredIdx[offset + k]] for k in range(n_tokens)]
    )
    offset += n_tokens

print("### VALID CLASSIFICATION REPORT ###\n")
print(classification_report(validLabels, validPredLabel))

In [15]:
testPredIdx = lr.predict(x_test)

# The predictions form one flat sequence of class ids (one per row of x_test).
# Regroup them into per-sentence tag lists with the same nesting as
# testLabels, which is the format handed to classification_report below.
testPredLabel = []
offset        = 0

for gold_sentence in testLabels:
    n_tokens = len(gold_sentence)
    testPredLabel.append(
        [unique_labels[testPredIdx[offset + k]] for k in range(n_tokens)]
    )
    offset += n_tokens

print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testLabels, testPredLabel))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.35      0.25      0.29      1594
ORGANIZATION       0.47      0.36      0.41       862
    LOCATION       0.70      0.60      0.64      1091
        DATE       0.57      0.51      0.54       364
       MONEY       0.02      0.02      0.02       113
     PERCENT       0.28      0.31      0.30       107
        TIME       0.91      0.87      0.89        23

   micro avg       0.48      0.38      0.43      4154
   macro avg       0.48      0.38      0.43      4154

