In [77]:
# -*- coding: utf-8 -*-
# --------------------------------------------------
#
# logisticRegression.ipynb
#
# Written by cetinsamet -*- cetin.samet@metu.edu.tr
# April, 2019
# --------------------------------------------------

In [97]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from seqeval.metrics import classification_report
import scipy.io as sio
import numpy as np
import pickle

In [79]:
def readFile(filepath):
    
        text     = []
        sentence = []

        with open(filepath, 'r') as infile:
            for line in infile:
                word, _, _, _ = line.strip().split('\t')

                if word == '<S>':
                    text.append(sentence)
                    sentence = []
                    continue

                sentence.append(line.strip())

        return text

In [80]:
trainText = readFile('data/train.txt')
validText = readFile('data/valid.txt')
testText  = readFile('data/test.txt') 

In [81]:
features  = sio.loadmat('data/features.mat')
labels    = sio.loadmat('data/labels.mat')

In [82]:
trainFeatures = features['trainTokenFeatures']
validFeatures = features['validTokenFeatures']
testFeatures  = features['testTokenFeatures']

In [83]:
trainLabels = [[token.split('\t')[-1] for token in sentence] for sentence in trainText]
validLabels = [[token.split('\t')[-1] for token in sentence] for sentence in validText]
testLabels  = [[token.split('\t')[-1] for token in sentence] for sentence in testText]

In [84]:
unique_labels = list(np.unique([label for sentence in trainLabels for label in sentence]))
print(unique_labels)

['B-DATE', 'B-LOCATION', 'B-MONEY', 'B-ORGANIZATION', 'B-PERCENT', 'B-PERSON', 'B-TIME', 'I-DATE', 'I-LOCATION', 'I-MONEY', 'I-ORGANIZATION', 'I-PERCENT', 'I-PERSON', 'I-TIME', 'O']


In [85]:
trainLabelsIdx = [[unique_labels.index(token.split('\t')[-1]) for token in sentence] for sentence in trainLabels]
validLabelsIdx = [[unique_labels.index(token.split('\t')[-1]) for token in sentence] for sentence in validLabels]
testLabelsIdx  = [[unique_labels.index(token.split('\t')[-1]) for token in sentence] for sentence in testLabels]

In [86]:
x_train = trainFeatures
y_train = np.asarray([label for sent in trainLabelsIdx for label in sent])

x_valid = validFeatures
y_valid = np.asarray([label for sent in validLabelsIdx for label in sent])

x_test  = testFeatures
y_test  = np.asarray([label for sent in testLabelsIdx for label in sent])

In [87]:
n_train = x_train.shape[0]
n_valid = x_valid.shape[0]
n_test  = x_test.shape[0]

In [107]:
x_trainval = np.vstack([x_train, x_valid])
y_trainval = np.vstack([np.expand_dims(y_train, 1), np.expand_dims(y_valid, 1)]).ravel()

In [116]:
grid  = {"C":np.logspace(-3, 3, 7), "penalty":['l2']} # l1 lasso l2 ridge
lr    = LogisticRegression(solver='lbfgs', random_state=123, verbose=True)
lr_cv = GridSearchCV(lr, grid, cv = 3)
lr_cv.fit(x_trainval, y_trainval)

[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 15.5min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 14.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.2min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.6min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.5min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.4min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.0min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.0min finished


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.0min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.9min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  9.1min finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  8.8min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 13.6min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',
          tol=0.0001, verbose=True, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]), 'penalty': ['l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [117]:
print("tuned hpyerparameters :(best parameters) ",lr_cv.best_params_)
print("accuracy :",lr_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 1.0, 'penalty': 'l2'}
accuracy : 0.9105607195556349


In [118]:
lr = LogisticRegression(solver='lbfgs', C=1.0, penalty='l2', random_state=123)
lr.fit(x_trainval, y_trainval)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [119]:
# SAVE LOGISTIC REGRESSION MODEL
with open('model/logistic_regression.pickle', 'wb') as outfile:
    pickle.dump(lr, outfile, pickle.HIGHEST_PROTOCOL)
    print("Logistic Regression model is saved.")

Logistic Regression model is saved.


In [120]:
# LOAD LOGISTIC REGRESSION MODEL
with open('model/logistic_regression.pickle', 'rb') as infile:
    lr = pickle.load(infile)

In [121]:
trainPredIdx = lr.predict(x_train)

trainPredLabel = []
sent           = []
ct             = 0

for sentence in trainLabels:
    for token in sentence:
        sent.append(unique_labels[trainPredIdx[ct]])
        ct += 1
    trainPredLabel.append(sent)
    sent = []
print("### TRAIN CLASSIFICATION REPORT ###\n")
print(classification_report(trainPredLabel, trainLabels))

### TRAIN CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.25      0.35      0.29      7872
ORGANIZATION       0.44      0.55      0.49      5701
    LOCATION       0.61      0.69      0.65      6834
       MONEY       0.10      0.08      0.09       665
        DATE       0.54      0.58      0.56      2362
     PERCENT       0.43      0.37      0.39       606
        TIME       0.86      0.99      0.92       136

   micro avg       0.42      0.51      0.46     24176
   macro avg       0.43      0.51      0.47     24176



In [122]:
validPredIdx = lr.predict(x_valid)

validPredLabel = []
sent           = []
ct             = 0

for sentence in validLabels:
    for token in sentence:
        sent.append(unique_labels[validPredIdx[ct]])
        ct += 1
    validPredLabel.append(sent)
    sent = []
    
print("### VALID CLASSIFICATION REPORT ###\n")
print(classification_report(validPredLabel, validLabels))

### VALID CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

      PERSON       0.31      0.40      0.35      2603
ORGANIZATION       0.40      0.52      0.45      1505
    LOCATION       0.63      0.66      0.65      1590
       MONEY       0.02      0.01      0.02       153
        TIME       0.74      0.93      0.82        15
        DATE       0.57      0.63      0.60       492
     PERCENT       0.48      0.41      0.44       111

   micro avg       0.42      0.50      0.46      6469
   macro avg       0.43      0.50      0.46      6469



In [123]:
testPredIdx = lr.predict(x_test)

testPredLabel = []
sent          = []
ct            = 0

for sentence in testLabels:
    for token in sentence:
        sent.append(unique_labels[testPredIdx[ct]])
        ct += 1
    testPredLabel.append(sent)
    sent = []
    
print("### TEST CLASSIFICATION REPORT ###\n")
print(classification_report(testPredLabel, testLabels))

### TEST CLASSIFICATION REPORT ###

              precision    recall  f1-score   support

        DATE       0.51      0.57      0.54       326
     PERCENT       0.31      0.28      0.30       116
ORGANIZATION       0.36      0.47      0.41       657
      PERSON       0.25      0.35      0.29      1126
    LOCATION       0.60      0.70      0.64       928
       MONEY       0.02      0.02      0.02       132
        TIME       0.87      0.91      0.89        22

   micro avg       0.38      0.48      0.43      3307
   macro avg       0.39      0.48      0.43      3307

