# SemEval-2020 Task 8: Memotion Analysis
## Model training and evaluation
This task is divided into three subtasks (A, B and C), each detailed in its own section below.

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from src.models.ordinal_regression import OrdinalClassifier
from src.utils.files import load_dfs
from src.utils.embeddings import retrieve_all_embeds
from sklearn.metrics import classification_report
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier

In [56]:
def evaluate(clf, embeds, y_train, y_dev, multitask=False, target_names=None):
    """Fit ``clf`` on each embedding set and report its dev-set performance.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance is
        re-fitted for every entry of ``embeds``.
    embeds : dict
        Maps a display name to a ``(X_train, X_dev, X_test)`` triple.
    y_train, y_dev : array-like
        Training and dev labels. With ``multitask=True`` ``y_dev`` must be
        2-D, one column per output; arrays, DataFrames and nested lists are
        accepted.
    multitask : bool, default False
        If False, a single ``classification_report`` is computed on the full
        label array. If True, one report is computed per output column.
    target_names : sequence of str, optional
        Display names of the output columns (``multitask=True`` only). When
        omitted, the four Memotion names are used and any further column is
        shown as ``"output <index>"``.

    Returns
    -------
    dict
        ``{name: {"pred_cls_dev": ..., "report_str": ..., "pred_cls_test": ...}}``
        where ``report_str`` is a string, or a list of strings (one per output
        column) when ``multitask=True``.

    Raises
    ------
    ValueError
        If ``multitask=True`` and ``y_dev`` is not 2-D, or ``target_names``
        does not have one entry per output column.
    """
    # Local import: the notebook's import cell does not bring numpy into scope.
    import numpy as np

    default_names = ["Humour", "Sarcasm", "Offense", "Motivation"]

    names = None
    if multitask:
        # Validate before any (potentially slow) fit so that a bad call fails fast.
        y_dev_arr = np.asarray(y_dev)
        if y_dev_arr.ndim != 2:
            raise ValueError(
                "multitask=True expects 2-D y_dev, got {} dimension(s)".format(y_dev_arr.ndim)
            )
        n_outputs = y_dev_arr.shape[1]
        if target_names is None:
            # Every column gets a name, so no report is silently dropped when
            # there are more outputs than default names.
            names = default_names[:n_outputs] + [
                "output {}".format(idx) for idx in range(len(default_names), n_outputs)
            ]
        else:
            names = list(target_names)
            if len(names) != n_outputs:
                raise ValueError(
                    "target_names has {} entries but y_dev has {} columns".format(
                        len(names), n_outputs
                    )
                )

    res = {}
    for item, (X_train, X_dev, X_test) in embeds.items():
        print("############### Embeddings: {} ####################".format(item))
        clf.fit(X_train, y_train)
        y_pred_dev = clf.predict(X_dev)
        y_pred_test = clf.predict(X_test)
        if not multitask:
            rep = classification_report(y_dev, y_pred_dev)
            print(rep)
        else:
            y_pred_arr = np.asarray(y_pred_dev)
            rep = [
                classification_report(y_dev_arr[:, col], y_pred_arr[:, col])
                for col in range(n_outputs)
            ]
            for c, r in zip(names, rep):
                print("results for class {}:\n{}".format(c, r))
        res[item] = {"pred_cls_dev": y_pred_dev, "report_str": rep, "pred_cls_test": y_pred_test}
    return res

In [8]:
# Annotation tables for the train and dev splits.
df_train, df_dev = load_dfs(
    [
        "data/train_cleaned_final.csv",
        "data/dev_cleaned_final.csv",
    ]
)

# One pair of pickled feature files per split (train, dev, test).
# NOTE(review): the file names suggest "use" holds the text features and
# "xception" the image features -- confirm against src.utils.embeddings.
# `evaluate` consumes the result as {name: (X_train, X_dev, X_test)}.
embed = retrieve_all_embeds(
    [
        ("data/features/use.pkl.train", "data/features/xception.pkl.train"),
        ("data/features/use.pkl.dev", "data/features/xception.pkl.dev"),
        ("data/features/use.pkl.test", "data/features/xception.pkl.test"),
    ]
)

## Task A: sentiment polarity detection
Classify memes as negative, neutral or positive. More details here: https://competitions.codalab.org/competitions/20629
We compare an ordinal classifier built on three different base learners: logistic regression, SVM and random forest.
To investigate how each modality contributes to the detection, we test these models with embeddings of sentences only, 
images only and both concatenated.

In [9]:
# Encode the overall sentiment as integer category codes.
# The dev codes are computed against the *training* categories so that a given
# integer denotes the same sentiment in both splits even if the two frames
# were loaded with different category sets. A dev value that is absent from
# the training categories is encoded as -1.
y_train_a = df_train["Overall_sentiment"].cat.codes
y_dev_a = (
    df_dev["Overall_sentiment"]
    .cat.set_categories(df_train["Overall_sentiment"].cat.categories)
    .cat.codes
)

In [10]:
# Display the encoded dev labels (integer category codes) as a sanity check.
y_dev_a

0      2
1      0
2      1
3      2
4      1
      ..
995    2
996    2
997    2
998    2
999    2
Length: 1000, dtype: int8

In [11]:
# Task A, logistic-regression base learner wrapped in the project's
# OrdinalClassifier (src.models.ordinal_regression), scored on every embedding set.
lr_oc = OrdinalClassifier(
    LogisticRegression(solver="lbfgs", random_state=0)
)
res_a_lr = evaluate(clf=lr_oc, embeds=embed, y_train=y_train_a, y_dev=y_dev_a)

  _warn_prf(average, modifier, msg_start, len(result))


image only
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       1.00      0.00      0.01       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.54      0.33      0.26      1000
weighted avg       0.68      0.62      0.47      1000



  _warn_prf(average, modifier, msg_start, len(result))


text only
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.60      0.04      0.07       302
           2       0.62      0.99      0.76       618

    accuracy                           0.62      1000
   macro avg       0.41      0.34      0.28      1000
weighted avg       0.57      0.62      0.50      1000

concatenated
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.55      0.06      0.11       302
           2       0.63      0.98      0.76       618

    accuracy                           0.62      1000
   macro avg       0.39      0.35      0.29      1000
weighted avg       0.55      0.62      0.51      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Task A, SVM base learner wrapped in the project's OrdinalClassifier.
# random_state is fixed because probability=True fits an internal
# cross-validated calibration that shuffles the data; without a seed the
# results change between runs, unlike the other (seeded) models here.
svm_oc = OrdinalClassifier(SVC(probability=True, random_state=0))
res_a_svc = evaluate(svm_oc, embed, y_train_a, y_dev_a)

  _warn_prf(average, modifier, msg_start, len(result))


image only
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.62      0.47      1000



  _warn_prf(average, modifier, msg_start, len(result))


text only
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.62      0.47      1000

concatenated
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.62      0.47      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Task A, random-forest base learner wrapped in the project's OrdinalClassifier.
# NOTE(review): in the recorded run the text-only dev scores of this model are
# far above those of every other model/modality -- worth checking for
# train/dev overlap before trusting them.
rf_oc = OrdinalClassifier(
    RandomForestClassifier(random_state=0)
)
res_a_rf = evaluate(clf=rf_oc, embeds=embed, y_train=y_train_a, y_dev=y_dev_a)

image only
              precision    recall  f1-score   support

           0       0.07      0.07      0.07        80
           1       0.30      0.25      0.27       302
           2       0.63      0.68      0.65       618

    accuracy                           0.50      1000
   macro avg       0.33      0.33      0.33      1000
weighted avg       0.48      0.50      0.49      1000

text only
              precision    recall  f1-score   support

           0       0.95      0.76      0.85        80
           1       0.98      0.76      0.86       302
           2       0.87      0.99      0.93       618

    accuracy                           0.90      1000
   macro avg       0.93      0.84      0.88      1000
weighted avg       0.91      0.90      0.90      1000

concatenated
              precision    recall  f1-score   support

           0       0.67      0.03      0.05        80
           1       0.73      0.08      0.14       302
           2       0.63      0.99      0.

## Task B: Multilabel sentiment detection
Classify memes as humorous, sarcastic, offensive and/or motivational. One meme can have multiple sentiments.
More details here: https://competitions.codalab.org/competitions/20629
We compare a one-vs-rest classifier built on three different base learners: logistic regression, SVM and random forest.
To investigate how each modality contributes to the detection, we test these models with embeddings of sentences only, 
images only and both concatenated.

In [11]:
# Binary indicator matrices (one column per sentiment) for the train and dev
# splits, cast to int. Both splits use the same four "*_bin" columns.
y_train_b, y_dev_b = (
    frame[["Humour_bin", "Sarcasm_bin", "Offense_bin", "Motivation_bin"]]
    .to_numpy()
    .astype(int)
    for frame in (df_train, df_dev)
)

In [12]:
# Display the dev indicator matrix (rows: memes, columns: the four "*_bin" labels).
y_dev_b

array([[1, 1, 1, 0],
       [1, 1, 0, 1],
       [0, 0, 0, 0],
       ...,
       [1, 1, 1, 0],
       [1, 1, 1, 0],
       [1, 1, 1, 0]])

In [13]:
# Task B, one-vs-rest logistic regression. `evaluate` is called without
# multitask, so one multilabel report is produced with a row per label column.
lr_ovc = OneVsRestClassifier(
    LogisticRegression(solver="lbfgs", random_state=0)
)
res_b_lr = evaluate(clf=lr_ovc, embeds=embed, y_train=y_train_b, y_dev=y_dev_b)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


image only
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.60      1.00      0.75       601
           3       0.00      0.00      0.00       366

   micro avg       0.71      0.85      0.77      2491
   macro avg       0.53      0.75      0.62      2491
weighted avg       0.61      0.85      0.71      2491
 samples avg       0.71      0.80      0.72      2491



  _warn_prf(average, modifier, msg_start, len(result))


text only
              precision    recall  f1-score   support

           0       0.78      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.61      0.96      0.75       601
           3       0.57      0.07      0.12       366

   micro avg       0.71      0.85      0.78      2491
   macro avg       0.68      0.76      0.65      2491
weighted avg       0.70      0.85      0.73      2491
 samples avg       0.71      0.80      0.72      2491

concatenated
              precision    recall  f1-score   support

           0       0.78      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.62      0.94      0.75       601
           3       0.56      0.07      0.13       366

   micro avg       0.72      0.85      0.78      2491
   macro avg       0.68      0.75      0.65      2491
weighted avg       0.70      0.85      0.73      2491
 samples avg       0.72      0.80      0.72      2491


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# Task B, one-vs-rest SVM.
# random_state is fixed because probability=True fits an internal
# cross-validated calibration that shuffles the data; without a seed the
# fitted model is not reproducible, unlike the other (seeded) models here.
svm_ovc = OneVsRestClassifier(SVC(probability=True, random_state=0))
res_b_svc = evaluate(svm_ovc, embed, y_train_b, y_dev_b)

  _warn_prf(average, modifier, msg_start, len(result))


image only
              precision    recall  f1-score   support

           0       0.77      0.99      0.87       773
           1       0.75      1.00      0.86       751
           2       0.60      0.98      0.75       601
           3       0.33      0.01      0.02       366

   micro avg       0.71      0.85      0.77      2491
   macro avg       0.62      0.75      0.62      2491
weighted avg       0.66      0.85      0.71      2491
 samples avg       0.71      0.80      0.72      2491



  _warn_prf(average, modifier, msg_start, len(result))


text only
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       773
           1       0.75      1.00      0.86       751
           2       0.67      1.00      0.80       601
           3       0.98      0.17      0.30       366

   micro avg       0.74      0.88      0.80      2491
   macro avg       0.80      0.79      0.71      2491
weighted avg       0.78      0.88      0.77      2491
 samples avg       0.73      0.83      0.75      2491

concatenated
              precision    recall  f1-score   support

           0       0.77      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.63      1.00      0.77       601
           3       1.00      0.03      0.06       366

   micro avg       0.72      0.86      0.78      2491
   macro avg       0.79      0.76      0.64      2491
weighted avg       0.76      0.86      0.72      2491
 samples avg       0.72      0.81      0.73      2491


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Task B, one-vs-rest random forest.
# NOTE(review): in the recorded run the text-only dev scores of this model are
# far above those of every other model/modality -- worth checking for
# train/dev overlap before trusting them.
rf_ovc = OneVsRestClassifier(
    RandomForestClassifier(random_state=0)
)
res_b_rf = evaluate(clf=rf_ovc, embeds=embed, y_train=y_train_b, y_dev=y_dev_b)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


image only
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       773
           1       0.76      0.85      0.80       751
           2       0.61      0.68      0.64       601
           3       0.37      0.30      0.33       366

   micro avg       0.68      0.71      0.69      2491
   macro avg       0.62      0.66      0.64      2491
weighted avg       0.67      0.71      0.69      2491
 samples avg       0.65      0.67      0.61      2491



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


text only
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       773
           1       0.93      0.99      0.96       751
           2       0.87      0.98      0.92       601
           3       0.98      0.80      0.88       366

   micro avg       0.92      0.96      0.94      2491
   macro avg       0.93      0.94      0.93      2491
weighted avg       0.93      0.96      0.94      2491
 samples avg       0.87      0.89      0.87      2491

concatenated
              precision    recall  f1-score   support

           0       0.78      1.00      0.87       773
           1       0.75      1.00      0.86       751
           2       0.63      0.94      0.75       601
           3       0.68      0.08      0.14       366

   micro avg       0.72      0.85      0.78      2491
   macro avg       0.71      0.75      0.66      2491
weighted avg       0.72      0.85      0.73      2491
 samples avg       0.72      0.80      0.73      2491


  _warn_prf(average, modifier, msg_start, len(result))


## Task C: Multilabel sentiment intensity detection
Classify the degree of humour, sarcasm, offense and motivation of each meme. 
One meme can have multiple sentiments of different intensities. Each sentiment intensity is an ordinal level starting at 0 (not at all); 
in the reports below, Humour and Sarcasm take the levels 0 to 3.
More details here: https://competitions.codalab.org/competitions/20629
We compare a multi-output ordinal classifier (`MultiOutputClassifier` wrapping `OrdinalClassifier`) built on three different base learners: logistic regression, SVM and random forest.
To investigate how each modality contributes to the detection, we test these models with embeddings of sentences only, 
images only and both concatenated.

In [44]:
# Encode each intensity column as integer category codes and stack them into
# an (n_samples, 4) array, one column per sentiment in `cols` order.
# The dev codes are computed against the *training* categories of each column
# so that a given integer denotes the same intensity in both splits even if
# the two frames were loaded with different category sets. A dev value that
# is absent from the training categories is encoded as -1.
cols = ["Humour", "Sarcasm", "Offense", "Motivation"]
y_train_c = pd.concat([df_train[name].cat.codes for name in cols], axis=1).to_numpy()
y_dev_c = pd.concat(
    [
        df_dev[name].cat.set_categories(df_train[name].cat.categories).cat.codes
        for name in cols
    ],
    axis=1,
).to_numpy()

In [45]:
# Display the dev intensity matrix (columns follow `cols`: Humour, Sarcasm, Offense, Motivation).
y_dev_c

array([[2, 1, 3, 0],
       [1, 1, 0, 1],
       [0, 0, 0, 0],
       ...,
       [1, 1, 2, 0],
       [2, 2, 2, 0],
       [1, 1, 1, 0]], dtype=int8)

In [57]:
# Task C, one ordinal logistic-regression model per output column.
# multitask=True makes `evaluate` print a separate report for each column.
lr_ovc_oc = MultiOutputClassifier(
    OrdinalClassifier(
        LogisticRegression(solver="lbfgs", random_state=0)
    )
)
res_c_lr = evaluate(
    clf=lr_ovc_oc,
    embeds=embed,
    y_train=y_train_c,
    y_dev=y_dev_c,
    multitask=True,
)

############### Embeddings: image only ####################


  _warn_prf(average, modifier, msg_start, len(result))


results for class Humour:
              precision    recall  f1-score   support

           0       0.29      0.02      0.03       227
           1       0.35      0.76      0.48       343
           2       0.33      0.23      0.27       341
           3       0.00      0.00      0.00        89

    accuracy                           0.34      1000
   macro avg       0.24      0.25      0.19      1000
weighted avg       0.30      0.34      0.26      1000

results for class Sarcasm:
              precision    recall  f1-score   support

           0       1.00      0.00      0.01       249
           1       0.49      1.00      0.66       491
           2       0.00      0.00      0.00       214
           3       0.00      0.00      0.00        46

    accuracy                           0.49      1000
   macro avg       0.37      0.25      0.17      1000
weighted avg       0.49      0.49      0.33      1000

results for class Offense:
              precision    recall  f1-score   supp

  _warn_prf(average, modifier, msg_start, len(result))


results for class Humour:
              precision    recall  f1-score   support

           0       0.43      0.14      0.21       227
           1       0.40      0.61      0.48       343
           2       0.40      0.47      0.43       341
           3       0.00      0.00      0.00        89

    accuracy                           0.40      1000
   macro avg       0.31      0.30      0.28      1000
weighted avg       0.37      0.40      0.36      1000

results for class Sarcasm:
              precision    recall  f1-score   support

           0       0.57      0.03      0.06       249
           1       0.49      0.97      0.65       491
           2       0.38      0.03      0.05       214
           3       0.00      0.00      0.00        46

    accuracy                           0.49      1000
   macro avg       0.36      0.26      0.19      1000
weighted avg       0.46      0.49      0.35      1000

results for class Offense:
              precision    recall  f1-score   supp

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Task C, one ordinal SVM model per output column.
# random_state is fixed because probability=True fits an internal
# cross-validated calibration that shuffles the data; without a seed the
# results change between runs, unlike the other (seeded) models here.
svm_ovc_oc = MultiOutputClassifier(OrdinalClassifier(SVC(probability=True, random_state=0)))
res_c_svc = evaluate(svm_ovc_oc, embed, y_train_c, y_dev_c, multitask=True)

############### Embeddings: image only ####################


In [None]:
# Task C, one ordinal random-forest model per output column.
# multitask=True makes `evaluate` print a separate report for each column.
rf_ovc_oc = MultiOutputClassifier(
    OrdinalClassifier(
        RandomForestClassifier(random_state=0)
    )
)
res_c_rf = evaluate(
    clf=rf_ovc_oc,
    embeds=embed,
    y_train=y_train_c,
    y_dev=y_dev_c,
    multitask=True,
)