# Multilabel-classification Models

## Model development with TF-IDF

In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import hamming_loss

In [3]:
# Seed NumPy's global random state so NumPy-driven randomness repeats across runs.
np.random.seed(42)

## TF-IDF Models for Contents and Determinants

In [4]:
def pre_process_data(df, test_proportion=0.1):
    """Split a text + multilabel table into train and test frames.

    The first column of ``df`` is treated as the input text and exposed under
    the name ``"FQText"``; every remaining column is treated as one label and
    cast to ``float32``. The split itself is delegated to
    ``iterative_train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the text and whose other columns hold
        numeric label values. The caller's frame is not modified.
    test_proportion : float, default 0.1
        Forwarded to ``iterative_train_test_split`` as ``test_size``.

    Returns
    -------
    X_train, y_train, X_test, y_test : pandas.DataFrame
        Text frames have the single column ``"FQText"``; label frames have one
        ``float32`` column per label, named after the label.
    df_labels : list
        Label column names, in their original order.
    """
    # Rename the first column on a copy. Writing into ``df.columns.values``
    # mutates the Index buffer behind pandas' back and alters the caller's frame.
    df = df.copy()
    df.columns = ["FQText", *df.columns[1:]]

    X = df[["FQText"]].to_numpy()

    y_df = df.drop(columns=["FQText"]).astype(np.float32)
    df_labels = list(y_df.columns)
    y = y_df.to_numpy()

    X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=test_proportion)

    # Pass the label names as a flat list: wrapping them in another list
    # (``columns=[df_labels]``) makes pandas build a MultiIndex.
    X_train = pd.DataFrame(X_train, columns=["FQText"])
    y_train = pd.DataFrame(y_train, columns=df_labels, dtype=np.float32)
    X_test = pd.DataFrame(X_test, columns=["FQText"])
    y_test = pd.DataFrame(y_test, columns=df_labels, dtype=np.float32)

    return X_train, y_train, X_test, y_test, df_labels

In [5]:
# CSV label tables to evaluate, one entry per file name.
dfs = [
    "nancy_determinants_individual_labels_eng.csv",
    "nancy_contents_individual_labels_eng.csv",
    "nancy_contents_macro_labels_english.csv",
    "nancy_determinants_macro_labels_english.csv",
]

In [6]:
# Baseline estimators to compare, all with default hyper-parameters apart from
# the seed (and the "sag" solver for logistic regression).
# KNeighborsClassifier has no `random_state` parameter, so passing one raises
# TypeError at construction time; it is therefore built without a seed.
models = [
    KNeighborsClassifier(),
    LogisticRegression(random_state=42, solver="sag"),
    SVC(random_state=42),
    RandomForestClassifier(random_state=42),
    SGDClassifier(random_state=42),
    GradientBoostingClassifier(random_state=42),
]

In [7]:
def model_tfidf(df_name, models=models):
    """Benchmark each estimator on TF-IDF features of one label table.

    Reads ``df_name`` as CSV, splits it with ``pre_process_data`` (10% test),
    fits a ``TfidfVectorizer`` (English stop words) on the training text and
    evaluates every estimator in ``models`` wrapped in ``OneVsRestClassifier``.

    Parameters
    ----------
    df_name : str
        CSV file name; its 2nd and 3rd underscore-separated tokens become the
        ``"data_model"`` tag of each result row.
    models : list
        Estimators to evaluate.

    Returns
    -------
    list of dict
        One row per estimator with keys ``data_model``, ``labels``,
        ``ml_algo``, ``accuracy``, ``f1`` (micro-averaged) and ``hamming``.
    """
    dataset_tag = "_".join(df_name.split("_")[1:3])
    frame = pd.read_csv(df_name)

    X_train, y_train, X_test, y_test, df_labels = pre_process_data(frame, test_proportion=0.1)

    # The vocabulary is learned on the training text only, then reused for the test text.
    vectorizer = TfidfVectorizer(stop_words="english")
    train_features = vectorizer.fit_transform(X_train["FQText"].to_list())
    test_features = vectorizer.transform(X_test["FQText"].to_list())

    rows = []
    for estimator in models:
        # repr of the estimator, with a trailing "()" stripped for default-constructed ones
        algo_label = str(estimator).split("()")[0]

        fitted = OneVsRestClassifier(estimator).fit(train_features, y_train)
        predicted = fitted.predict(test_features)

        rows.append({
            "data_model": dataset_tag,
            "labels": df_labels,
            "ml_algo": algo_label,
            "accuracy": accuracy_score(y_test, predicted),
            "f1": f1_score(y_test, predicted, average="micro"),
            "hamming": hamming_loss(y_test, predicted),
        })

    return rows

In [8]:
# Pool the per-estimator result rows of every label table into one flat list.
results = []
for df in dfs:
    results += model_tfidf(df)



## Results for the TF-IDF Models

In [9]:
# One row per (label table, estimator) pair, built from the collected result dicts.
df_results_tf = pd.DataFrame(results)

In [10]:
# Rank the TF-IDF runs by micro-averaged F1, best first.
df_results_tf.sort_values("f1", ascending=False)

Unnamed: 0,data_model,labels,ml_algo,accuracy,f1,hamming
16,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",SGDClassifier(random_state=42),0.5,0.650602,0.062771
12,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",KNeighborsClassifier,0.47619,0.588235,0.075758
23,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",GradientBoostingClassifier(random_state=42),0.184211,0.444444,0.210526
17,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",GradientBoostingClassifier(random_state=42),0.214286,0.441176,0.082251
15,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",RandomForestClassifier(random_state=42),0.261905,0.405797,0.088745
13,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...","LogisticRegression(random_state=42, solver='sag')",0.261905,0.393443,0.080087
14,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",SVC,0.261905,0.393443,0.080087
22,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",SGDClassifier(random_state=42),0.184211,0.385542,0.268421
11,contents_individual,"[(A), (AD), (H), (HD), A, ABS, AD, ALIM, ANAT,...",GradientBoostingClassifier(random_state=42),0.186047,0.371429,0.037898
21,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",RandomForestClassifier(random_state=42),0.184211,0.363636,0.221053


In [11]:
# Save the TF-IDF results table to disk, leaving out the row index.
df_results_tf.to_csv("tf_models.csv", index=False)

In [12]:
# pickle.dump(model, open(r"..\Models\Contents\pipeline_contents_One-Many_V4-11-05.sav", 'wb')) 

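# would save whichever model was fitted last; note that `model` is local to model_tfidf (not defined at notebook scope) and the last entry in `models` is GradientBoostingClassifier, not RF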

In [13]:

# pickle.dump(model, open(r"..\Models\Determinants\pipeline_determinants_One-Many_V4-11-05.sav", 'wb'))

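# would save whichever model was fitted last (GradientBoostingClassifier is last in `models`, not RF); `model` must be defined at notebook scope first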

# Function to get predictions back from the TF-IDF classifier for the individual labels

In [14]:
# list(y_determinants.columns)

In [15]:
# list(y_contents.columns)

If the model is tfidf

In [16]:


# def evaluate_one_vs_rest_TFIDF(path, text):
    
#     pipeline = pickle.load(open(path, "rb"))
    
#     if "content" in path:
#         # print("content found")
#         possible_outcomes = ['(A)', '(Ad)', '(H)', '(Hd)', 'A', 'Abs', 'Ad', 'Alim', 'Anat', 'Art',
#        'Bot', 'Elem', 'Frag', 'Ge', 'H', 'Hd', 'Id', 'Nat', 'Obj', 'Pays', 'Radio', 'Sc', 'Sex', 'Sg', 'Vet']
        
#     elif "determinant" in path:
#         # print("determinant found")
#         possible_outcomes = ['C', 'C\'', 'C\'F', 'CF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FE', 'K', 'kan']

#     prediction = pipeline.predict([text])
#     probabilities = pipeline.predict_proba([text]) # sometimes no prediction is given back so we can take the outcome with the highest P instead

#     # print("prediction:", prediction)
#     # print("probabilities:", probabilities)
    
#     list_predictions = prediction.tolist()
#     list_predictions = [x for sublist in list_predictions for x in sublist] # avoid lists with sublists

    
#     if len(list_predictions) != len(possible_outcomes): # sanity check
#         print(prediction)
#         print( len(list_predictions)  )
#         print(possible_outcomes)
#         print( len(possible_outcomes)  )
#         print("Error encountered in the predictions")
        
#     results = ([possible_outcomes[i] for i in range(len(list_predictions)) if list_predictions[i] == 1]) 

#     if results == []:
#         # print("No result")
#         i = probabilities.argmax(1).item()
#         # print(ix)
#         final_results = possible_outcomes[i]
    
#     else:
#         final_results = str(results).replace("\'", "").replace("[", "").replace("]", "")
    
#     return final_results

In [17]:
# evaluate_one_vs_rest_TFIDF(r"..\Models\Contents\pipeline_contents_One-Many_V3-18-04.sav", "Dog")

In [18]:
# evaluate_one_vs_rest_TFIDF(r"..\Models\Determinants\pipeline_determinants_One-Many_V3-18-04.sav", "Dog")

## Model development with SentenceTransformers


In [19]:
from sentence_transformers import SentenceTransformer, util

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Loaded embedding models keyed by name, so the weights are read once per
# kernel instead of once per encoded text.
_EMBEDDINGS_MODELS = {}


def _get_embeddings_model(model_name="all-MiniLM-L6-v2"):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDINGS_MODELS:
        _EMBEDDINGS_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDINGS_MODELS[model_name]


def preprocess_text_for_transformer(text):
    """Encode ``text`` with "all-MiniLM-L6-v2" and reduce it to one mean value.

    Parameters
    ----------
    text : str
        Text to embed.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1,)`` holding the mean of all values in the
        embedding returned by ``encode``.
    """
    # Reuse the cached model: constructing SentenceTransformer on every call
    # reloads it for each row when this function is used with ``.apply``.
    x_array = _get_embeddings_model().encode(text, convert_to_numpy=True)

    # NOTE(review): np.mean without an axis averages every component, so the
    # whole embedding collapses to a single number. If the full vector is
    # wanted as features, return ``x_array`` instead and adapt the callers.
    x_centroid = np.mean(x_array)
    X_transformers = x_centroid.reshape(-1, 1)

    return X_transformers[0]

In [21]:
def pre_process_data(df, test_proportion=0.1):
    """Embed the text column and split the table into train and test frames.

    Once its cell is run, this definition replaces the ``pre_process_data``
    defined earlier in the notebook for the TF-IDF models. The difference is
    that each text is first passed through ``preprocess_text_for_transformer``.

    The first column of ``df`` is treated as the input text and exposed under
    the name ``"FQText"``; every remaining column is treated as one label and
    cast to ``float32``. The split itself is delegated to
    ``iterative_train_test_split``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column holds the text and whose other columns hold
        numeric label values. The caller's frame is not modified.
    test_proportion : float, default 0.1
        Forwarded to ``iterative_train_test_split`` as ``test_size``.

    Returns
    -------
    X_train, y_train, X_test, y_test : pandas.DataFrame
        Feature frames have the single column ``"FQText"`` holding the value
        returned by ``preprocess_text_for_transformer`` for each row; label
        frames have one ``float32`` column per label, named after the label.
    df_labels : list
        Label column names, in their original order.
    """
    # Work on a copy. Writing into ``df.columns.values`` mutates the Index
    # buffer behind pandas' back, and overwriting the text column in place
    # would destroy the caller's raw text.
    df = df.copy()
    df.columns = ["FQText", *df.columns[1:]]

    df["FQText"] = df["FQText"].apply(preprocess_text_for_transformer)
    X = df[["FQText"]].to_numpy()

    y_df = df.drop(columns=["FQText"]).astype(np.float32)
    df_labels = list(y_df.columns)
    y = y_df.to_numpy()

    X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=test_proportion)

    # Pass the label names as a flat list: wrapping them in another list
    # (``columns=[df_labels]``) makes pandas build a MultiIndex.
    X_train = pd.DataFrame(X_train, columns=["FQText"])
    y_train = pd.DataFrame(y_train, columns=df_labels, dtype=np.float32)

    X_test = pd.DataFrame(X_test, columns=["FQText"])
    y_test = pd.DataFrame(y_test, columns=df_labels, dtype=np.float32)

    return X_train, y_train, X_test, y_test, df_labels

In [22]:
def model_sentence_transformers(df_name, models=models):
    """Benchmark each estimator on sentence-transformer features of one label table.

    Reads ``df_name`` as CSV, lets ``pre_process_data`` build the features and
    the 10% test split, then evaluates every estimator in ``models`` wrapped
    in ``OneVsRestClassifier``.

    Parameters
    ----------
    df_name : str
        CSV file name; its 2nd and 3rd underscore-separated tokens become the
        ``"data_model"`` tag of each result row.
    models : list
        Estimators to evaluate.

    Returns
    -------
    list of dict
        One row per estimator with keys ``data_model``, ``labels``,
        ``ml_algo``, ``accuracy``, ``f1`` (micro-averaged) and ``hamming``.
    """
    dataset_tag = "_".join(df_name.split("_")[1:3])
    frame = pd.read_csv(df_name)

    X_train, y_train, X_test, y_test, df_labels = pre_process_data(frame, test_proportion=0.1)

    rows = []
    for estimator in models:
        # repr of the estimator, with a trailing "()" stripped for default-constructed ones
        algo_label = str(estimator).split("()")[0]

        fitted = OneVsRestClassifier(estimator).fit(X_train, y_train)
        predicted = fitted.predict(X_test)

        rows.append({
            "data_model": dataset_tag,
            "labels": df_labels,
            "ml_algo": algo_label,
            "accuracy": accuracy_score(y_test, predicted),
            "f1": f1_score(y_test, predicted, average="micro"),
            "hamming": hamming_loss(y_test, predicted),
        })

    return rows

In [23]:
# Pool the per-estimator result rows of every label table into one flat list.
results = []
for df in dfs:
    results += model_sentence_transformers(df)



: 

## Results for the Sentence Transformers Models

In [None]:
# One row per (label table, estimator) pair, built from the collected result dicts.
df_results_st = pd.DataFrame(results)

In [None]:
# Rank the sentence-transformer runs by micro-averaged F1, best first.
df_results_st.sort_values("f1", ascending=False)

Unnamed: 0,data_model,labels,ml_algo,accuracy,f1,hamming
21,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",RandomForestClassifier(random_state=42),0.307692,0.424242,0.292308
16,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",SGDClassifier(random_state=42),0.375,0.413793,0.115909
18,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",KNeighborsClassifier,0.25641,0.358974,0.25641
15,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",RandomForestClassifier(random_state=42),0.225,0.346939,0.145455
12,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",KNeighborsClassifier,0.275,0.329114,0.120455
22,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",SGDClassifier(random_state=42),0.205128,0.321839,0.302564
17,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",GradientBoostingClassifier(random_state=42),0.225,0.321839,0.134091
14,contents_macro,"[animal_sum, human_sum, abs_sum, food_sum, art...",SVC,0.2,0.268657,0.111364
23,determinants_macro,"[color_sum, threat_sum, fading_sum, form_sum, ...",GradientBoostingClassifier(random_state=42),0.179487,0.253165,0.302564
9,contents_individual,"[(A), (AD), (H), (HD), A, ABS, AD, ALIM, ANAT,...",RandomForestClassifier(random_state=42),0.190476,0.224299,0.073192


In [None]:
# Save the sentence-transformer results table to disk, leaving out the row index.
df_results_st.to_csv("sentence_transformers_models.csv", index=False)

## Function to get the label from the Sentence Transformer classifier

In [None]:
# def preprocess_text_for_transformer(text):
    
#     embeddings_model = SentenceTransformer("all-MiniLM-L6-v2")
    
#     x_array = embeddings_model.encode(text, convert_to_numpy=True)
    
#     x_centroid = np.mean(x_array)
#     X_transformers = x_centroid.reshape(-1,1)

    
#     return X_transformers[0]

In [None]:
# preprocess_text_for_transformer("dog with two tails")

array([0.0003033], dtype=float32)

In [None]:
# def evaluate_one_vs_rest_transformer(path, text):
    
#     pipeline = pickle.load(open(path, "rb"))
    
#     if "content" in path:
#         # print("content found")
#         possible_outcomes = ['(A)', '(Ad)', '(H)', '(Hd)', 'A', 'Abs', 'Ad', 'Alim', 'Anat', 'Art',
#        'Bot', 'Elem', 'Frag', 'Ge', 'H', 'Hd', 'Id', 'Nat', 'Obj', 'Pays', 'Radio', 'Sc', 'Sex', 'Sg', 'Vet']
        
#     elif "determinant" in path:
#         # print("determinant found")
#         possible_outcomes = ['C', 'C\'', 'C\'F', 'CF', 'E', 'EF', 'F', 'FC', 'FC\'', 'FE', 'K', 'kan']

#     text_transformed = preprocess_text_for_transformer(text)
    
#     prediction = pipeline.predict([text_transformed])
#     probabilities = pipeline.predict_proba([text_transformed]) # sometimes no prediction is given back so we can take the outcome with the highest P instead

#     # print("prediction:", prediction)
#     # print("probabilities:", probabilities)
    
#     list_predictions = prediction.tolist()
#     list_predictions = [x for sublist in list_predictions for x in sublist] # avoid lists with sublists

    
#     if len(list_predictions) != len(possible_outcomes): # sanity check
#         print(prediction)
#         print( len(list_predictions)  )
#         print(possible_outcomes)
#         print( len(possible_outcomes)  )
#         print("Error encountered in the predictions")
        
#     results = ([possible_outcomes[i] for i in range(len(list_predictions)) if list_predictions[i] == 1]) 

#     if results == []:
#         # print("No result")
#         i = probabilities.argmax(1).item()
#         # print(ix)
#         final_results = possible_outcomes[i]
    
#     else:
#         final_results = str(results).replace("\'", "").replace("[", "").replace("]", "")
    
#     return final_results

In [None]:
# preprocess_text_for_transformer("dog with tail")

array([0.00023756], dtype=float32)

In [None]:
# evaluate_one_vs_rest_transformer(r"..\Models\Contents\sentence_transformer_contents_V23-18-04.sav", "Dog with tail")

'A, Anat'

In [None]:
# evaluate_one_vs_rest_transformer(r"..\Models\Determinants\sentence_transformer_determinants_V23-18-04.sav", "Dog with tail")

'F, FE'