# Impact of image and text features on classification
The trained models consistently showed worse or equivalent results when image or image + text features were used for prediction compared to text only. The aim of this section is to check whether image and text fail on the same predictions (i.e., image features are not helpful for sentiment analysis in memes) or whether images can be more helpful than text in some cases (i.e., image and text features are complementary, but images are less often a strong predictor).

In [1]:
import pandas as pd
import numpy as np
from joblib import dump, load
from src.utils.files import load_dfs, load_clfs
from src.utils.embeddings import retrieve_all_embeds
from src.utils.reports import generate_report
from src.models.voting import soft_transform, hard_transform
from pathlib import Path
from sklearn.metrics import classification_report
import joblib

In [2]:
def evaluate(clfs, embeds, y_dev, voting="soft", multitask=False, multilabel=False):
    """Predict dev and test labels with a voting ensemble and print the dev report.

    Parameters
    ----------
    clfs : sequence
        Classifiers forwarded unchanged to ``soft_transform`` / ``hard_transform``.
    embeds : sequence
        Indexable collection of embeddings; ``embeds[1]`` is used for the dev
        predictions and ``embeds[2]`` for the test predictions.
    y_dev : array-like
        Ground-truth dev labels. When ``multitask`` is true it is sliced as
        ``y_dev[:, col]`` and therefore has to be a 2-D array.
    voting : str, default "soft"
        ``"soft"`` selects ``soft_transform``; any other value selects
        ``hard_transform``.
    multitask : bool, default False
        Forwarded to the transform. When true, one classification report is
        built per column of ``y_dev`` instead of a single report.
    multilabel : bool, default False
        Forwarded to the transform.

    Returns
    -------
    dict
        ``"pred_cls_dev"``: dev predictions, ``"report_str"``: the report
        string (a list of strings when ``multitask`` is true),
        ``"pred_cls_test"``: test predictions.
    """
    # Anything other than "soft" falls back to hard voting (original behaviour).
    transform = soft_transform if voting == "soft" else hard_transform
    y_pred_dev = transform(clfs, embeds[1], multilabel, multitask)
    y_pred_test = transform(clfs, embeds[2], multilabel, multitask)

    # zero_division=0 yields the same scores as the default "warn" but does not
    # emit an UndefinedMetricWarning for every class that is never predicted.
    if multitask:
        rep = [classification_report(y_dev[:, col], y_pred_dev[:, col], zero_division=0)
               for col in range(y_dev.shape[1])]
        cols = ["Humour", "Sarcasm", "Offense", "Motivation"]
        # Names are paired with reports by position; zip stops at the shorter
        # sequence, so only the first four columns are printed.
        for c, r in zip(cols, rep):
            print("results for class {}:\n{}".format(c, r))
    else:
        rep = classification_report(y_dev, y_pred_dev, zero_division=0)
        print(rep)

    return {"pred_cls_dev": y_pred_dev, "report_str": rep, "pred_cls_test": y_pred_test}

In [3]:
def evaluate_all_tasks(model_name, embeds, y_devs, model_path="data/models/custom"):
    """Evaluate the saved per-embedding classifiers of one model on every task.

    For each of ``task_a``, ``task_b`` and ``task_c`` the three classifiers
    stored as ``<model_path>/<task>_<model_name>_<embed_type>.joblib`` (embed
    types ``image_only``, ``text_only``, ``concatenated``) are loaded and passed
    to ``evaluate``, which prints the dev-set report. The test-set predictions
    of all tasks are then handed to ``generate_report``.

    Parameters
    ----------
    model_name : str
        Model identifier used in the classifier file names and in the name of
        the generated archive (``res_<model_name>_ensemble.zip``).
    embeds : sequence
        Embeddings forwarded unchanged to ``evaluate``.
    y_devs : sequence
        Dev labels, one entry per task in the order task_a, task_b, task_c.
    model_path : str or pathlib.Path, default "data/models/custom"
        Directory containing the ``.joblib`` classifier files.

    Returns
    -------
    list
        Test-set predictions, one entry per task, in task order.
    """
    tasks = ["task_a", "task_b", "task_c"]
    embed_type = ["image_only", "text_only", "concatenated"]
    res = []

    for task, y_dev in zip(tasks, y_devs[:len(tasks)]):
        print(task)
        clf_names = [
            str((Path(model_path) / "{}_{}_{}.joblib".format(task, model_name, e)).resolve())
            for e in embed_type
        ]
        # NOTE(security): joblib.load unpickles the file and can execute
        # arbitrary code; only load classifier files from a trusted source.
        clfs_task = [joblib.load(f) for f in clf_names]
        task_result = evaluate(
            clfs_task,
            embeds,
            y_dev,
            multitask=(task == "task_c"),   # third task: one report per label column
            multilabel=(task == "task_b"),  # second task: multilabel targets
        )
        res.append(task_result["pred_cls_test"])

    generate_report(*res, zipname="res_{}_ensemble.zip".format(model_name))
    return res

In [4]:
# Cleaned train / dev frames (df_train is not used in the cells shown here).
df_train, df_dev = load_dfs(["data/train_cleaned_final.csv", "data/dev_cleaned_final.csv"])

cols = ["Humour", "Sarcasm", "Offense", "Motivation"]

# Dev targets, one entry per task in the order evaluate_all_tasks expects:
#   [0] category codes of the overall sentiment,
#   [1] the four "<name>_bin" columns as an integer matrix,
#   [2] category codes of the four label columns, one column per label.
y_devs = [
    df_dev["Overall_sentiment"].cat.codes,
    df_dev[[name + "_bin" for name in cols]].to_numpy().astype(int),
    pd.concat([df_dev[name].cat.codes for name in cols], axis=1).to_numpy(),
]

# One (use, xception) feature-file pair per split, in train / dev / test order.
# The mapping returned by retrieve_all_embeds is transposed so that `embed` is
# indexed by split first -- presumably embed[1] is dev and embed[2] is test,
# matching the indices used in evaluate(); TODO confirm against
# retrieve_all_embeds.
embed = list(zip(*retrieve_all_embeds(
    [("data/features/use.pkl." + split, "data/features/xception.pkl." + split)
     for split in ("train", "dev", "test")]
).values()))

In [7]:
# Soft-voting ensemble of the saved "lr" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="lr", embeds=embed, y_devs=y_devs)

task_a
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.62      0.47      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.77      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.60      1.00      0.75       601
           3       0.00      0.00      0.00       366

   micro avg       0.71      0.85      0.77      2491
   macro avg       0.53      0.75      0.62      2491
weighted avg       0.61      0.85      0.71      2491
 samples avg       0.71      0.80      0.72      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       227
           1       0.34      0.94      0.50       343
           2       0.50      0.09      0.16       341
           3       0.00      0.00      0.00        89

    accuracy                           0.35      1000
   macro avg       0.21      0.26      0.17      1000
weighted avg       0.29      0.35      0.23      1000

results for class Sarcasm:
              pr

  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Soft-voting ensemble of the saved "knn" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="knn", embeds=embed, y_devs=y_devs)

task_a
              precision    recall  f1-score   support

           0       0.67      0.07      0.13        80
           1       0.56      0.31      0.40       302
           2       0.67      0.89      0.76       618

    accuracy                           0.65      1000
   macro avg       0.63      0.43      0.43      1000
weighted avg       0.64      0.65      0.60      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.78      0.99      0.87       773
           1       0.76      0.99      0.86       751
           2       0.68      0.87      0.76       601
           3       0.56      0.23      0.33       366

   micro avg       0.74      0.85      0.79      2491
   macro avg       0.70      0.77      0.71      2491
weighted avg       0.72      0.85      0.76      2491
 samples avg       0.73      0.80      0.73      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.40      0.35      0.37       227
           1       0.47      0.61      0.53       343
           2       0.49      0.49      0.49       341
           3       0.57      0.09      0.16        89

    accuracy                           0.46      1000
   macro avg       0.48      0.38      0.39      1000
weighted avg       0.47      0.46      0.45      1000

results for class Sarcasm:
              pr

  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Soft-voting ensemble of the saved "gnb" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="gnb", embeds=embed, y_devs=y_devs)

task_a
              precision    recall  f1-score   support

           0       0.08      0.61      0.14        80
           1       0.27      0.15      0.19       302
           2       0.60      0.20      0.30       618

    accuracy                           0.22      1000
   macro avg       0.32      0.32      0.21      1000
weighted avg       0.46      0.22      0.25      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.79      0.24      0.36       773
           1       0.73      0.24      0.36       751
           2       0.61      0.19      0.29       601
           3       0.38      0.80      0.51       366

   micro avg       0.53      0.31      0.39      2491
   macro avg       0.63      0.37      0.38      2491
weighted avg       0.67      0.31      0.37      2491
 samples avg       0.45      0.28      0.31      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.28      0.58      0.37       227
           1       0.38      0.10      0.15       343
           2       0.36      0.12      0.18       341
           3       0.11      0.40      0.17        89

    accuracy                           0.24      1000
   macro avg       0.28      0.30      0.22      1000
weighted avg       0.32      0.24      0.22      1000

results for class Sarcasm:
              pr

In [10]:
# Soft-voting ensemble of the saved "abc" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="abc", embeds=embed, y_devs=y_devs)

task_a


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.50      0.01      0.02        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.37      0.34      0.26      1000
weighted avg       0.42      0.62      0.47      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.78      0.99      0.87       773
           1       0.75      0.99      0.86       751
           2       0.63      0.89      0.74       601
           3       0.57      0.17      0.26       366

   micro avg       0.72      0.85      0.78      2491
   macro avg       0.68      0.76      0.68      2491
weighted avg       0.70      0.85      0.74      2491
 samples avg       0.72      0.80      0.72      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.23      0.89      0.37       227
           1       0.00      0.00      0.00       343
           2       0.00      0.00      0.00       341
           3       0.19      0.26      0.22        89

    accuracy                           0.23      1000
   macro avg       0.10      0.29      0.15      1000
weighted avg       0.07      0.23      0.10      1000

results for class Sarcasm:
              pr

  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# Soft-voting ensemble of the saved "mlp" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="mlp", embeds=embed, y_devs=y_devs)

task_a
              precision    recall  f1-score   support

           0       0.84      0.61      0.71        80
           1       0.77      0.70      0.73       302
           2       0.85      0.91      0.88       618

    accuracy                           0.82      1000
   macro avg       0.82      0.74      0.77      1000
weighted avg       0.82      0.82      0.82      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.90      0.96      0.93       773
           1       0.89      0.97      0.93       751
           2       0.85      0.88      0.87       601
           3       0.83      0.74      0.78       366

   micro avg       0.88      0.91      0.89      2491
   macro avg       0.87      0.89      0.88      2491
weighted avg       0.88      0.91      0.89      2491
 samples avg       0.83      0.85      0.82      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.83      0.74      0.78       227
           1       0.73      0.81      0.77       343
           2       0.75      0.77      0.76       341
           3       0.88      0.69      0.77        89

    accuracy                           0.77      1000
   macro avg       0.80      0.75      0.77      1000
weighted avg       0.77      0.77      0.77      1000

results for class Sarcasm:
              pr

In [12]:
# Soft-voting ensemble of the saved "rf" classifiers: dev reports are printed,
# the per-task test predictions are kept in `res`.
res = evaluate_all_tasks(model_name="rf", embeds=embed, y_devs=y_devs)

task_a
              precision    recall  f1-score   support

           0       0.71      0.15      0.25        80
           1       0.75      0.36      0.48       302
           2       0.71      0.96      0.82       618

    accuracy                           0.71      1000
   macro avg       0.72      0.49      0.52      1000
weighted avg       0.72      0.71      0.67      1000

task_b


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.80      1.00      0.89       773
           1       0.78      0.99      0.87       751
           2       0.72      0.94      0.82       601
           3       0.82      0.39      0.53       366

   micro avg       0.78      0.89      0.83      2491
   macro avg       0.78      0.83      0.78      2491
weighted avg       0.78      0.89      0.81      2491
 samples avg       0.76      0.83      0.77      2491

task_c
results for class Humour:
              precision    recall  f1-score   support

           0       0.65      0.64      0.64       227
           1       0.66      0.75      0.70       343
           2       0.68      0.70      0.69       341
           3       0.62      0.27      0.37        89

    accuracy                           0.66      1000
   macro avg       0.65      0.59      0.60      1000
weighted avg       0.66      0.66      0.65      1000

results for class Sarcasm:
              pr