In [1]:
import time
import pickle
import numpy as np

from sklearn import metrics
from sklearn.linear_model import LogisticRegression

# Wall-clock start time; the last cell subtracts it from time.time() to
# report how long the notebook took to execute.
t = time.time()

## Creating the logistic regression (LR) pickle

In [2]:
def read_pickle(filename: str):
    '''Load and return the object stored in a pickle file.

    Input :
    filename : path of the pickle file to read
    Output : the unpickled object

    NOTE: pickle.load can execute arbitrary code while unpickling, so this
    must only be used on files coming from a trusted source.'''
    # The context manager closes the file handle even when unpickling fails,
    # instead of leaving it open until garbage collection.
    with open(filename, "rb") as pickle_file:
        return pickle.load(pickle_file)

In [3]:
# Both train_tf_idf.pickle and test_tf_idf.pickle are expected to be present
# in the current working directory.
train_tf_idf, test_tf_idf = (
    read_pickle(pickle_name)
    for pickle_name in ('train_tf_idf.pickle', 'test_tf_idf.pickle')
)

In [4]:
def get_X_vectors(batch):
    '''Build the feature array of a batch.

    Input : batch, an indexable collection of records
    Output : array with one row per record, each row being the record's
    elements at positions 2 and 3 joined end to end'''
    rows = [
        np.concatenate((batch[idx][2], batch[idx][3]))
        for idx in range(len(batch))
    ]
    return np.array(rows)

In [5]:
def get_y_vector(batch):
    '''Build the label array of a batch.

    Input : batch, an iterable of records whose element at position 4 is
    itself iterable
    Output : array of integers, 0 when that element converted to a list
    equals [False], 1 in every other case'''
    labels = [0 if list(record[4]) == [False] else 1 for record in batch]
    return np.array(labels)

In [6]:
# Feature arrays and 0/1 label arrays for the training and test batches.
X_train, y_train = get_X_vectors(train_tf_idf), get_y_vector(train_tf_idf)
X_test, y_test = get_X_vectors(test_tf_idf), get_y_vector(test_tf_idf)

In [7]:
# L1-penalised logistic regression. The liblinear solver shuffles the data
# using random_state, so the seed is pinned to make the fitted coefficients
# reproducible across Restart & Run All.
logreg = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    random_state=42,
)
# Kept as the last expression so the notebook displays the fitted estimator.
logreg.fit(X_train, y_train)

LogisticRegression(penalty='l1', solver='liblinear')

In [8]:
# Persist the fitted model. The context manager closes the file even if
# pickle.dump raises part-way through, which the manual open/close did not.
with open('LR_model_fit.pickle', 'wb') as outfile:
    pickle.dump(logreg, outfile)

## Defining a function that takes vectors as input and returns LR probabilities

In [9]:
def get_batch_LR_proba(query_vecs, doc_vecs, logreg):
    '''Score query/document pairs with a fitted logistic regression.

    Input :
    query_vecs, doc_vecs : tfidf vectors of the queries and docs (2D arrays
    with the same number of rows); row k of each forms one pair
    logreg : fitted logistic regression exposing predict_proba
    Output : array holding, for each pair, the first column of
    logreg.predict_proba
    Raises : ValueError when the two inputs differ in length

    NOTE(review): the first column of predict_proba is the probability of
    logreg.classes_[0]; with 0/1 labels that would be the probability of
    class 0 - confirm this is the score callers expect.'''
    n_pairs = len(query_vecs)
    if n_pairs != len(doc_vecs):
        raise ValueError('Arrays are not of the same size')
    # Each model input is the query vector followed by its document vector.
    features = [
        np.concatenate((query_vecs[k], doc_vecs[k]))
        for k in range(n_pairs)
    ]
    probabilities = logreg.predict_proba(features)
    return np.array(probabilities[:, 0])

In [10]:
# Smoke test: run the function on the first 10 training records.
query_vecs_list = [train_tf_idf[row][2] for row in range(10)]
doc_vecs_list = [train_tf_idf[row][3] for row in range(10)]
query_vecs, doc_vecs = np.array(query_vecs_list), np.array(doc_vecs_list)
# Bare last expression so the notebook displays the returned array.
get_batch_LR_proba(query_vecs, doc_vecs, logreg)

array([0.37057483, 0.38482529, 0.491763  , 0.46707768, 0.44896146,
       0.66988109, 0.46134323, 0.36765006, 0.43042553, 0.73689352])

In [11]:
# Time elapsed since `t` was recorded, formatted as HH:MM:SS (the hour field
# wraps around after 24 hours because the duration goes through time.gmtime).
elapsed_hms = time.strftime('%H:%M:%S', time.gmtime(time.time() - t))
print(f"Execution time : {elapsed_hms}")

Execution time : 00:01:01
