# Multilabel-classification Models

## Model development with TF-IDF

In [25]:
import pandas as pd
import numpy as np
import pickle

In [76]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

## TF-IDF Model for Contents

In [19]:
df_contents = pd.read_csv(r"..\Nancy PsyData\nancy_contents.csv")
df_determinants = pd.read_csv(r"..\Nancy PsyData\nancy_determinants.csv")

In [6]:
df_contents.columns

Index(['Réponse (French)', 'Answer (English)', 'Contenu', '(A)', '(AD)', '(H)',
       '(HD)', 'A', 'ABS', 'AD', 'ALIM', 'ANAT', 'ARCH', 'ART', 'BOT', 'ELEM',
       'FRAG', 'GÉO', 'H', 'HD', 'MQ', 'NAT', 'OBJ', 'PAYS', 'RADIO', 'SC',
       'SCÈNE', 'SEX', 'SG', 'VÊT'],
      dtype='object')

In [7]:
df_determinants.columns

Index(['Réponse (French)', 'Answer (English)', 'Déterminant', 'C', 'C'', 'C'F',
       'CF', 'CF'', 'CLOB', 'CLOBF', 'E', 'EF', 'F', 'FC', 'FC'', 'FCLOB',
       'FE', 'K', 'KAN', 'KOB', 'KP'],
      dtype='object')

In [21]:

X = df_contents['Answer (English)'].to_numpy()

y_contents = df_contents[['(A)', '(AD)', '(H)',
       '(HD)', 'A', 'ABS', 'AD', 'ALIM', 'ANAT', 'ARCH', 'ART', 'BOT', 'ELEM',
       'FRAG', 'GÉO', 'H', 'HD', 'MQ', 'NAT', 'OBJ', 'PAYS', 'RADIO', 'SC',
       'SCÈNE', 'SEX', 'SG', 'VÊT']].to_numpy()

y_determinants = df_determinants[['C', 'C\'', 'C\'F',
       'CF', 'CF\'', 'CLOB', 'CLOBF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FCLOB',
       'FE', 'K', 'KAN', 'KOB', 'KP']].to_numpy()

In [65]:
models = [KNeighborsClassifier(), LogisticRegression(random_state=42, solver = "sag"), SVC(), 
          RandomForestClassifier(random_state=42), SGDClassifier(random_state=42), GradientBoostingClassifier(random_state=42)]

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y_contents, test_size = 0.10, random_state = 42)

for clf in models:

    pipeline = Pipeline([
                    ('text_tfidf', TfidfVectorizer(min_df = 2, max_df = 0.3, ngram_range = (1,3))),
                    ('clf', OneVsRestClassifier(clf))
                ])

    model = pipeline.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average="micro")

    print(clf, accuracy, f1)

KNeighborsClassifier() 0.10526315789473684 0.19607843137254902
LogisticRegression(random_state=42, solver='sag') 0.0 0.0




SVC() 0.0 0.0
RandomForestClassifier(random_state=42) 0.05263157894736842 0.07999999999999999
SGDClassifier(random_state=42) 0.18421052631578946 0.3428571428571429
GradientBoostingClassifier(random_state=42) 0.2894736842105263 0.4788732394366198


In [24]:
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.67      0.20      0.31        10
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         4
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Looking at the classification report it is evident that the imbalanced dataset is causing problems

In [26]:
pickle.dump(model, open(r"..\Models\Contents\pipeline_contents_One-Many_V4-11-05.sav", 'wb')) 

# will save the RF model since it was at the end of the list

## TF-IDF Model for Determinants

Our models are not doing too good, maybe we could "cascade" the classifiers if the predictions of a previous classifier were good.
For example, computing content labels first and then using the content label prediction to inform the determinant prediction model
IFF one of the models was good 

In [67]:
X_train, X_test, y_train, y_test = train_test_split(X, y_determinants, test_size = 0.10, random_state = 42)

for clf in models:

    pipeline = Pipeline([
                    ('text_tfidf', TfidfVectorizer(min_df = 2, max_df = 0.3, ngram_range = (1,3))),
                    ('clf', OneVsRestClassifier(clf))
                ])

    model = pipeline.fit(X_train, y_train)

    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions, average="micro")

    print(clf, accuracy, f1)


KNeighborsClassifier() 0.2631578947368421 0.3278688524590164
LogisticRegression(random_state=42, solver='sag') 0.21052631578947367 0.3018867924528302




SVC() 0.18421052631578946 0.2745098039215686
RandomForestClassifier(random_state=42) 0.21052631578947367 0.2857142857142857
SGDClassifier(random_state=42) 0.2894736842105263 0.41791044776119407
GradientBoostingClassifier(random_state=42) 0.21052631578947367 0.2608695652173913


In [68]:
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       1.00      0.20      0.33         5
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.47      0.44      0.45        16
          10       0.00      0.00      0.00         4
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         3
          16       0.33      1.00      0.50         1
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Looking at the classification report it is evident that the imbalanced dataset is causing problems

In [32]:

pickle.dump(model, open(r"..\Models\Determinants\pipeline_determinants_One-Many_V4-11-05.sav", 'wb'))

# will save the RF model by default

# Function to get the predictions back from the tf idf classifier

In [33]:
list(y_determinants.columns)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
list(y_contents.columns)

['(A)',
 '(Ad)',
 '(H)',
 '(Hd)',
 'A',
 'Abs',
 'Ad',
 'Alim',
 'Anat',
 'Art',
 'Bot',
 'Elem',
 'Frag',
 'Ge',
 'H',
 'Hd',
 'Id',
 'Nat',
 'Obj',
 'Pays',
 'Radio',
 'Sc',
 'Sex',
 'Sg',
 'Vet']

If the model is tfidf

In [None]:


def evaluate_one_vs_rest_TFIDF(path, text):
    
    pipeline = pickle.load(open(path, "rb"))
    
    if "content" in path:
        # print("content found")
        possible_outcomes = ['(A)', '(Ad)', '(H)', '(Hd)', 'A', 'Abs', 'Ad', 'Alim', 'Anat', 'Art',
       'Bot', 'Elem', 'Frag', 'Ge', 'H', 'Hd', 'Id', 'Nat', 'Obj', 'Pays', 'Radio', 'Sc', 'Sex', 'Sg', 'Vet']
        
    elif "determinant" in path:
        # print("determinant found")
        possible_outcomes = ['C', 'C\'', 'C\'F', 'CF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FE', 'K', 'kan']

    prediction = pipeline.predict([text])
    probabilities = pipeline.predict_proba([text]) # sometimes no prediction is given back so we can take the outcome with the highest P instead

    # print("prediction:", prediction)
    # print("probabilities:", probabilities)
    
    list_predictions = prediction.tolist()
    list_predictions = [x for sublist in list_predictions for x in sublist] # avoid lists with sublists

    
    if len(list_predictions) != len(possible_outcomes): # sanity check
        print(prediction)
        print( len(list_predictions)  )
        print(possible_outcomes)
        print( len(possible_outcomes)  )
        print("Error encountered in the predictions")
        
    results = ([possible_outcomes[i] for i in range(len(list_predictions)) if list_predictions[i] == 1]) 

    if results == []:
        # print("No result")
        i = probabilities.argmax(1).item()
        # print(ix)
        final_results = possible_outcomes[i]
    
    else:
        final_results = str(results).replace("\'", "").replace("[", "").replace("]", "")
    
    return final_results

In [None]:
evaluate_one_vs_rest_TFIDF(r"..\Models\Contents\pipeline_contents_One-Many_V3-18-04.sav", "Dog")

'A'

In [None]:
evaluate_one_vs_rest_TFIDF(r"..\Models\Determinants\pipeline_determinants_One-Many_V3-18-04.sav", "Dog")

'FE, kan'

# Model development with SentenceTransformers


In [34]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [69]:
X = df_contents["Answer (English)"]

In [50]:
X.dtype

dtype('O')

In [70]:
X = X.astype("str")

In [37]:
embeddigngs_model = SentenceTransformer("all-MiniLM-L6-v2")



In [71]:
X_transformers = X.apply(lambda x: embeddigngs_model.encode(x, convert_to_numpy=True)) # getting the embeddings for each row

In [53]:
X_transformers

0      [0.01710045, 0.032927256, -0.050096795, -0.014...
1      [0.04927347, 0.06730551, -0.03920415, -0.03209...
2      [0.003027574, 0.03194066, -0.09189637, 0.05275...
3      [0.07981707, -0.054176375, 0.07149547, 0.06627...
4      [0.017808866, -0.040367622, 0.032963585, -0.01...
                             ...                        
375    [-0.022829905, 0.06105979, 0.04656104, 0.01641...
376    [-0.0058849268, 0.049261548, 0.060328465, -0.0...
377    [-0.06306299, 0.031928115, 0.037187733, 0.0149...
378    [-0.023190409, -0.007055615, 0.027367428, 0.03...
379    [-0.01656596, -0.005031914, 0.016621804, 0.037...
Name: Answer (English), Length: 380, dtype: object

In [72]:
X_transformers = X_transformers.apply(lambda x: np.mean(x)) # getting the centroid of each embedding array, we cna't just feed the classifier vectors
X_transformers = np.array(X_transformers).reshape(-1,1)

In [73]:
X_transformers[:5]

array([[-1.7147581e-03],
       [-1.3931376e-03],
       [-2.7161246e-05],
       [ 3.6774381e-04],
       [-2.1004721e-03]], dtype=float32)

## Sentence Transfomers model for Determinants

In [74]:
y = y_determinants

X_train, X_test, y_train, y_test = train_test_split(X_transformers, y, test_size = 0.10, random_state = 42)

for clf in models:
        
        pipeline = Pipeline([
                        ('clf', OneVsRestClassifier(clf)),
                ])

        model = pipeline.fit(X_train, y_train)

        predictions = model.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average="micro")

        print(clf, accuracy, f1)

KNeighborsClassifier() 0.05263157894736842 0.07017543859649122
LogisticRegression(random_state=42, solver='sag') 0.0 0.0
SVC() 0.0 0.0
RandomForestClassifier(random_state=42) 0.18421052631578946 0.17777777777777776
SGDClassifier(random_state=42) 0.0 0.0
GradientBoostingClassifier(random_state=42) 0.10526315789473684 0.10256410256410256


In [58]:
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.25      0.20      0.22         5
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         2
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.56      0.31      0.40        16
          10       0.25      0.50      0.33         4
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         3
          15       0.00      0.00      0.00         3
          16       0.00      0.00      0.00         1
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [59]:
pickle.dump(model, open(r"..\Models\Determinants\sentence_transformer_determinants_V6-11-05.sav", 'wb'))

## Sentence Transfomers model for Contents

In [75]:
y = y_contents

X_train, X_test, y_train, y_test = train_test_split(X_transformers, y, test_size = 0.10, random_state = 42)

for clf in models:
        
        pipeline = Pipeline([
                        ('clf', OneVsRestClassifier(clf)),
                ])

        model = pipeline.fit(X_train, y_train)

        predictions = model.predict(X_test)

        accuracy = accuracy_score(y_test, predictions)
        f1 = f1_score(y_test, predictions, average="micro")

        print(clf, accuracy, f1)

KNeighborsClassifier() 0.05263157894736842 0.10344827586206898
LogisticRegression(random_state=42, solver='sag') 0.0 0.0
SVC() 0.0 0.0
RandomForestClassifier(random_state=42) 0.07894736842105263 0.08333333333333333
SGDClassifier(random_state=42) 0.0 0.0
GradientBoostingClassifier(random_state=42) 0.07894736842105263 0.09090909090909091


In [61]:
class_report = classification_report(y_test, predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         4
           3       0.00      0.00      0.00         1
           4       0.38      0.30      0.33        10
           5       0.00      0.00      0.00         1
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         2
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         3
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         4
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
pickle.dump(model, open(r"..\Models\Contents\sentence_transformer_contents_V6-11-05.sav", 'wb'))


## Function to get the label from the Sentence Transformer classifier (in progress)

In [None]:
def preprocess_text_for_transformer(text):
    
    embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")
    
    x_array = embeddings_model.encode(text, convert_to_numpy=True)
    
    x_centroid = np.mean(x_array)
    X_transformers = x_centroid.reshape(-1,1)

    
    return X_transformers[0]

In [None]:
preprocess_text_for_transformer("dog with two tails")

array([0.0003033], dtype=float32)

In [None]:


def evaluate_one_vs_rest_transformer(path, text):
    
    pipeline = pickle.load(open(path, "rb"))
    
    if "content" in path:
        # print("content found")
        possible_outcomes = ['(A)', '(Ad)', '(H)', '(Hd)', 'A', 'Abs', 'Ad', 'Alim', 'Anat', 'Art',
       'Bot', 'Elem', 'Frag', 'Ge', 'H', 'Hd', 'Id', 'Nat', 'Obj', 'Pays', 'Radio', 'Sc', 'Sex', 'Sg', 'Vet']
        
    elif "determinant" in path:
        # print("determinant found")
        possible_outcomes = ['C', 'C\'', 'C\'F', 'CF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FE', 'K', 'kan']

    text_transformed = preprocess_text_for_transformer(text)
    
    prediction = pipeline.predict([text_transformed])
    probabilities = pipeline.predict_proba([text_transformed]) # sometimes no prediction is given back so we can take the outcome with the highest P instead

    # print("prediction:", prediction)
    # print("probabilities:", probabilities)
    
    list_predictions = prediction.tolist()
    list_predictions = [x for sublist in list_predictions for x in sublist] # avoid lists with sublists

    
    if len(list_predictions) != len(possible_outcomes): # sanity check
        print(prediction)
        print( len(list_predictions)  )
        print(possible_outcomes)
        print( len(possible_outcomes)  )
        print("Error encountered in the predictions")
        
    results = ([possible_outcomes[i] for i in range(len(list_predictions)) if list_predictions[i] == 1]) 

    if results == []:
        # print("No result")
        i = probabilities.argmax(1).item()
        # print(ix)
        final_results = possible_outcomes[i]
    
    else:
        final_results = str(results).replace("\'", "").replace("[", "").replace("]", "")
    
    return final_results

In [None]:
preprocess_text_for_transformer("dog with tail")

array([0.00023756], dtype=float32)

In [None]:
evaluate_one_vs_rest_transformer(r"..\Models\Contents\sentence_transformer_contents_V23-18-04.sav", "Dog with tail")

'A, Anat'

In [None]:
evaluate_one_vs_rest_transformer(r"..\Models\Determinants\sentence_transformer_determinants_V23-18-04.sav", "Dog with tail")

'F, FE'