In [1]:
import time
import pickle
import seaborn as sns
import numpy as np

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Wall-clock start of the notebook run; the final cell subtracts this from
# time.time() to report the total execution time.
t = time.time()

In [2]:
def read_pickle(filename: str):
    '''Load and return the object stored in a pickle file.

    Input : filename - path of the pickle file to read
    Output : the unpickled Python object

    The file is opened in a context manager so the handle is always closed,
    even when unpickling raises.
    '''
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file - only call this on files from a trusted source.
    with open(filename, "rb") as handle:
        return pickle.load(handle)

In [3]:
# Load the pickled train and test records, train first.
train_path = 'train_non_cont_word_emb.pickle'
test_path = 'test_non_cont_word_emb.pickle'
train_emb = read_pickle(train_path)
test_emb = read_pickle(test_path)

I found that some arrays are occasionally missing from the file. Here is an example:

In [4]:
# Display one raw training record (index 32) to inspect its structure.
train_emb[32]

('14562',
 'D198286',
 array([-3.99169922e-02, -4.21346016e-02, -6.51041651e-03,  6.29882812e-02,
        -1.32486984e-01, -2.06217453e-01, -1.26342773e-02, -2.92317718e-01,
         7.02339783e-02,  7.21842423e-02, -1.97265625e-01, -1.31835938e-02,
        -5.77799492e-02,  1.66015625e-02, -2.65625000e-01,  1.46484375e-02,
         1.26139326e-02,  4.39860038e-02,  5.01302071e-02, -1.31510422e-01,
         3.92303467e-02, -5.38736992e-02, -4.19921875e-02,  1.00585938e-01,
         2.06054688e-01, -1.55273438e-01,  1.57877598e-02,  6.27746582e-02,
        -7.32421875e-02, -3.13313790e-02,  2.99479160e-02,  4.32128906e-02,
         2.44140625e-02, -1.21663414e-01, -4.33349609e-02,  7.22249374e-02,
         2.88899746e-02, -7.84505233e-02,  1.56290695e-01,  2.29492188e-02,
        -3.49934888e-03,  4.71496582e-02,  2.19665527e-01,  4.10156250e-02,
        -5.15950508e-02, -1.07421875e-01,  9.22851562e-02, -9.11458302e-03,
        -5.62744141e-02,  4.27246094e-03,  7.51953125e-02,  1.0091

In [5]:
# Number of records in the train and test sets, in that order.
tuple(map(len, (train_emb, test_emb)))

(734026, 32516)

In [6]:
# Length of one vector from the first train record and one from the first test record.
# NOTE(review): this reads position 2 for train but position 3 for test;
# get_vectors concatenates positions 2 and 3 - confirm the asymmetry is intentional.
(len(train_emb[0][2]), len(test_emb[0][3]))

(300, 300)

In [7]:
def get_vectors(batch):
    '''Build the feature matrix X and the label vector y from a batch of records.

    Records are read positionally: positions 2 and 3 hold the two vectors that
    are concatenated into one feature row, and position 4 holds the label
    container. A record whose vectors cannot be concatenated (for example
    because one of them is missing) is skipped entirely, so X and y always
    stay aligned row for row.

    Input : batch - sequence of indexable records
    Output : Arrays of vectors for respectively X and y;
             y is 0 when list(record[4]) == [False] and 1 otherwise
    '''
    X = []
    y = []
    for record in batch:
        try:
            features = np.concatenate((record[2], record[3]))
        except (IndexError, TypeError, ValueError):
            # Missing or malformed vector: drop the whole record. Only the
            # errors indexing/concatenation can raise are caught, so that
            # KeyboardInterrupt and unrelated bugs are no longer swallowed.
            continue
        X.append(features)
        # The label is only read for records that are kept, which keeps X and
        # y aligned without a second pass over a list of bad indexes.
        y.append(0 if list(record[4]) == [False] else 1)
    return np.array(X), np.array(y)

In [8]:
# Vectorise the training records; records that get_vectors cannot
# concatenate are dropped, so the row count can be lower than len(train_emb).
X_train, y_train = get_vectors(train_emb)
X_train.shape

(731894, 600)

In [9]:
# Sanity check: the number of labels should equal the number of rows in X_train.
y_train.size

731894

In [10]:
# Vectorise the test records the same way as the training records.
X_test, y_test = get_vectors(test_emb)
X_test.shape

(32514, 600)

In [11]:
# Sanity check: the number of labels should equal the number of rows in X_test.
y_test.size

32514

In [12]:
# L1-regularised logistic regression fitted on the vectors built by get_vectors.
# random_state is fixed because the liblinear solver shuffles the data
# internally; without a seed the fitted model (and the score below) can vary
# from run to run.
logreg = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=42)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# F1 of the positive class on the held-out test set.
print("F1-score:",metrics.f1_score(y_test, y_pred))

F1-score: 0.5512208504801098


In [14]:
# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises, so a failed dump no longer leaks the open handle.
with open('LR_emb_fit.pickle', 'wb') as outfile:
    pickle.dump(logreg, outfile)

In [13]:
# Report the wall-clock time elapsed since `t` was recorded, as HH:MM:SS.
elapsed_seconds = time.time() - t
elapsed_hms = time.strftime('%H:%M:%S', time.gmtime(elapsed_seconds))
print(f"Execution time : {elapsed_hms}")

Execution time : 00:27:14
