<a href="https://colab.research.google.com/github/antoniobelotti/HVD/blob/main/4_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Human Value Detection

Exam project for the information retrieval course, UNIMI 2023, Belotti Antonio 960822.

Multi-label classification task: "predict which of the 20 human value categories are present in a textual argument".



# Setup env

Set this variable to False for a demo run where it's not important to persist results, models and data.

In [None]:
# True: mount Google Drive and keep the project folder there.
# False: use a folder under /content instead (demo run).
PERSIST_ON_DRIVE = True

In [None]:
import pathlib

# Choose the project root: a local folder under /content for demo runs,
# or a folder on the mounted Google Drive when persistence is requested.
if not PERSIST_ON_DRIVE:
  BASE_PATH = pathlib.Path("/content/human_value_detection")
else:
  from google.colab import drive
  drive.mount('/content/gdrive')

  BASE_PATH = pathlib.Path("/content/gdrive/MyDrive/human_value_detection")

Mounted at /content/gdrive


In [None]:
!python -m pip install --upgrade
#!pip install --no-cache-dir transformers sentencepiece
#!pip install accelerate -U

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas() # to use progress_apply
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import (
  StratifiedKFold,
  cross_validate,
  train_test_split
)
from sklearn.metrics import (
  make_scorer,
  accuracy_score,
  precision_score,
  recall_score,
  f1_score,
  confusion_matrix,
  classification_report
)

%matplotlib inline

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
[0m

In [None]:
# Project sub-folders, all rooted at BASE_PATH.
MODELS_PATH = BASE_PATH / "models"
DATA_PATH = BASE_PATH / "data"
TRAINING_CACHE = BASE_PATH / "training_cache"

# Create the folders with pathlib instead of `!mkdir -p $VAR`: no unquoted
# shell interpolation (which breaks on paths containing spaces) and no
# dependency on a POSIX shell. Existing folders are left untouched.
for _directory in (DATA_PATH, MODELS_PATH, TRAINING_CACHE):
  _directory.mkdir(parents=True, exist_ok=True)

# Dataset

In [None]:
!pip install -q transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from datasets import load_dataset

ds_name = "webis/Touche23-ValueEval"
DS_PATH = DATA_PATH / ds_name

# Reuse the copy persisted by a previous run when there is one; otherwise
# download the "main" configuration and persist it under DATA_PATH.
# `dataset_dict.json` is the marker file written by DatasetDict.save_to_disk.
if (DS_PATH / "dataset_dict.json").exists():
  from datasets import load_from_disk
  dataset = load_from_disk(DS_PATH)
else:
  dataset = load_dataset(ds_name, "main")
  dataset.save_to_disk(DS_PATH)

dataset

Downloading builder script:   0%|          | 0.00/32.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.1k [00:00<?, ?B/s]

Downloading and preparing dataset touche23-value_eval/main to /root/.cache/huggingface/datasets/webis___touche23-value_eval/main/0.0.2/109738f7f54e5a68f95e3d0b4d07797f6b7e558edce5e29c71cf0668208bfa43...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/254k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/363k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/89.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.4k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset touche23-value_eval downloaded and prepared to /root/.cache/huggingface/datasets/webis___touche23-value_eval/main/0.0.2/109738f7f54e5a68f95e3d0b4d07797f6b7e558edce5e29c71cf0668208bfa43. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/5393 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1896 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1576 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Labels'],
        num_rows: 5393
    })
    validation: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Labels'],
        num_rows: 1896
    })
    test: Dataset({
        features: ['Argument ID', 'Conclusion', 'Stance', 'Premise', 'Labels'],
        num_rows: 1576
    })
})

In [None]:
# Names of the 20 value categories. The grouped report later in the notebook
# pairs each name with its list position via zip(labels, range(0, 20)).
labels = [
  "Self-direction: thought",
  "Self-direction: action",
  "Stimulation",
  "Hedonism",
  "Achievement",
  "Power: dominance",
  "Power: resources",
  "Face",
  "Security: personal",
  "Security: societal",
  "Tradition",
  "Conformity: rules",
  "Conformity: interpersonal",
  "Humility",
  "Benevolence: caring",
  "Benevolence: dependability",
  "Universalism: concern",
  "Universalism: nature",
  "Universalism: tolerance",
  "Universalism: objectivity",
]

# Classifiers

### Data preprocessing

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

import nltk
# Download the NLTK resources used by the preprocessing helpers below:
# the stop-word corpus and the Punkt tokenizer data.
for _resource in ('stopwords', 'punkt'):
  nltk.download(_resource)

# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def decontract(sentence):
  """Expand common English contractions in `sentence`.

  Irregular negations ("won't", "can't", "shan't") are expanded explicitly
  (matched case-insensitively, replaced in lower case) before the generic
  "n't" rule, which would otherwise yield "wo not" / "ca not" / "sha not".
  The typographic apostrophe (U+2019) is treated like the ASCII one.

  Note that "'s" is always expanded to " is", possessives included.

  Args:
    sentence: text to expand.

  Returns:
    The text with contractions replaced by their expanded forms.
  """
  # Normalise curly apostrophes so the rules below also match them.
  sentence = sentence.replace("\u2019", "'")

  # Irregular forms first; the generic "n't" rule cannot handle them.
  sentence = re.sub(r"\bwon't\b", "will not", sentence, flags=re.IGNORECASE)
  sentence = re.sub(r"\bcan't\b", "can not", sentence, flags=re.IGNORECASE)
  sentence = re.sub(r"\bshan't\b", "shall not", sentence, flags=re.IGNORECASE)

  sentence = re.sub(r"n\'t", " not", sentence)
  sentence = re.sub(r"\'re", " are", sentence)
  sentence = re.sub(r"\'s", " is", sentence)
  sentence = re.sub(r"\'d", " would", sentence)
  sentence = re.sub(r"\'ll", " will", sentence)
  sentence = re.sub(r"\'t", " not", sentence)
  sentence = re.sub(r"\'ve", " have", sentence)
  sentence = re.sub(r"\'m", " am", sentence)
  return sentence

def removePunctuation(sentence):
  """Strip or blank out a fixed set of punctuation characters.

  The characters ? | ! ' " # are deleted; . , ) ( / are replaced by a space.
  The result is then stripped of surrounding whitespace and any remaining
  newline is turned into a space.
  """
  without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
  spaced_out = re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
  return spaced_out.strip().replace("\n", " ")

def removeNumber(sentence):
  """Keep only ASCII letters in every whitespace-separated word.

  Words are re-joined with single spaces; a word that ends up empty still
  occupies its slot, so interior double spaces are possible. Leading and
  trailing whitespace is stripped from the result.
  """
  letters_only = (re.sub('[^a-z A-Z]+', '', token) for token in sentence.split())
  return " ".join(letters_only).strip()

def removeStopWords(sentence):
  """Remove English stop words from `sentence`, keeping token order.

  The previous implementation went through a `set`, which dropped repeated
  tokens and returned the survivors in arbitrary order. That breaks the word
  n-gram and term-frequency features computed downstream by TfidfVectorizer.
  Tokens are now filtered in place, and matching is case-insensitive so that
  capitalised stop words ("The", "And", ...) are removed too.

  Args:
    sentence: text to filter.

  Returns:
    The remaining tokens joined by single spaces, in their original order.
  """
  return " ".join(
    token for token in word_tokenize(sentence)
    if token.lower() not in stop_words
  )


def stemming(sentence):
  """Stem every whitespace-separated word with the English Snowball stemmer.

  Returns the stems joined by single spaces.
  """
  snowball = SnowballStemmer("english")
  return " ".join(snowball.stem(token) for token in sentence.split()).strip()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def preprocess_data(s):
  """Run the full text-cleaning pipeline on one string.

  Steps, in order: expand contractions, remove punctuation, drop non-letter
  characters, remove stop words, stem.
  """
  steps = (decontract, removePunctuation, removeNumber, removeStopWords, stemming)
  for step in steps:
    s = step(s)
  return s

In [None]:
# Models are fitted on train + validation (in that order); the test split is
# kept apart for evaluation.
_fit_splits = ("train", "validation")

X_train = [
  preprocess_data(s)
  for split in _fit_splits
  for s in dataset[split]["Premise"]
]
y_train = [
  row
  for split in _fit_splits
  for row in dataset[split]["Labels"]
]

X_test = [preprocess_data(s) for s in dataset["test"]["Premise"]]
y_test = dataset["test"]["Labels"]

In [None]:
# Sanity check: first preprocessed premise together with its label vector.
X_train[0], y_train[0]

('human caus clone bunch huge ban issu run human around act',
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

## GridSearchCV

In [None]:
# TF-IDF over word 1- to 3-grams feeding a one-vs-rest multi-label classifier.
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
classifier = OneVsRestClassifier(RandomForestClassifier())

pipeline = Pipeline([
  ('vectorizer', vectorizer),
  ('classifier', classifier)
])

# One dict per classifier family; each dict is searched independently, so
# family-specific hyper-parameters are only combined with their own family.
param_grid = [
  {
    'classifier': [OneVsRestClassifier(RandomForestClassifier())],
  },
  {
    # max_iter raised above the default of 100: the solver stopped with
    # "TOTAL NO. of ITERATIONS REACHED LIMIT" before converging.
    'classifier': [OneVsRestClassifier(LogisticRegression(max_iter=1000))],
    'classifier__estimator__C': [0.1, 1, 10]
  },
  {
    'classifier': [OneVsRestClassifier(SVC())],
    'classifier__estimator__C': [0.1, 1, 10],
    'classifier__estimator__kernel': ['linear', 'rbf']
  }
]

# No `scoring` is given, so candidates are ranked by the estimator's default
# score. For multi-label targets that is subset accuracy (every label of a
# sample must match), which is why "Best Score" is far below the per-label
# F1 values in the report.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

# Predict once and reuse the result for the report.
y_pred = grid_search.predict(X_test)

# zero_division=0 reports 0.0 for labels that are never predicted without
# emitting an UndefinedMetricWarning for each of them.
report = classification_report(y_test, y_pred, target_names=labels, zero_division=0)
print("\nClassification Report:")
print(report)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Parameters:  {'classifier': OneVsRestClassifier(estimator=SVC(C=1, kernel='linear')), 'classifier__estimator__C': 1, 'classifier__estimator__kernel': 'linear'}
Best Score:  0.07161520044663998

Classification Report:
                            precision    recall  f1-score   support

   Self-direction: thought       0.64      0.41      0.50       143
    Self-direction: action       0.67      0.44      0.53       391
               Stimulation       0.00      0.00      0.00        77
                  Hedonism       0.00      0.00      0.00        26
               Achievement       0.57      0.34      0.43       412
          Power: dominance       1.00      0.03      0.05       108
          Power: resources       0.56      0.22      0.32       105
                      Face       0.00      0.00      0.00        96
        Security: personal       0.66      0.69      0.67       537
        Security: societal       0.57      0.50      0.53       397
                 Tradition   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Collapse the fine-grained labels into their parent categories, i.e. the
# part of each name before ": " ("Power: dominance" -> "Power").
group_names = [name.split(": ")[0] for name in labels]

# Parent category -> indices of the fine-grained labels that belong to it.
grouped = {}
for idx, group in enumerate(group_names):
  grouped.setdefault(group, []).append(idx)

# Fine-grained label index -> parent category name.
# (Formerly a dict called `reversed`, which shadowed the builtin.)
index_to_group = dict(enumerate(group_names))

# Parent category name -> column index in the grouped label vector,
# numbered in order of first appearance.
new_label_map = {label: new_id for new_id, label in enumerate(grouped.keys())}


def convert_labels(original_labels):
  """Map one fine-grained multi-hot vector to a grouped multi-hot vector.

  A parent category is set to 1 when at least one of its fine-grained labels
  is 1 in `original_labels`.

  Args:
    original_labels: 0/1 sequence aligned with `labels`.

  Returns:
    A list of 0/1 values with one entry per parent category, ordered as in
    `new_label_map`.
  """
  new_labels = [0] * len(new_label_map)
  for i in np.flatnonzero(np.asarray(original_labels) == 1):
    new_labels[new_label_map[index_to_group[i]]] = 1
  return new_labels

grouped_y_test = [convert_labels(l) for l in y_test]
grouped_y_hat_test = [convert_labels(l) for l in grid_search.predict(X_test)]

# Target names are derived from the mapping so they cannot drift out of sync
# with the column order produced by convert_labels.
report = classification_report(grouped_y_test, grouped_y_hat_test, target_names=list(new_label_map), zero_division=0)
print("\nClassification Report:")
print(report)


Classification Report:
                precision    recall  f1-score   support

Self-direction       0.68      0.42      0.52       427
   Stimulation       0.00      0.00      0.00        77
      Hedonism       0.00      0.00      0.00        26
   Achievement       0.57      0.34      0.43       412
         Power       0.61      0.13      0.21       211
          Face       0.00      0.00      0.00        96
      Security       0.71      0.71      0.71       840
     Tradition       0.71      0.14      0.24       168
    Conformity       0.59      0.20      0.30       303
      Humility       0.00      0.00      0.00        74
   Benevolence       0.65      0.32      0.43       442
  Universalism       0.81      0.53      0.64      1060

     micro avg       0.71      0.42      0.53      4136
     macro avg       0.44      0.23      0.29      4136
  weighted avg       0.65      0.42      0.49      4136
   samples avg       0.67      0.47      0.51      4136



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Ensemble

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

In [None]:
def select_best_model(base_model, model_param_grid, cv=5, n_jobs=-1):
  """Grid-search a TF-IDF + one-vs-rest pipeline built around `base_model`.

  The pipeline is fitted on the notebook-level `X_train` / `y_train`.
  `vectorizer__max_features` is always searched over [1000, 3000], combined
  with every entry of `model_param_grid`.

  Args:
    base_model: unfitted estimator wrapped in OneVsRestClassifier.
    model_param_grid: dict of hyper-parameter name -> list of values, with
      names relative to `base_model` (e.g. {"C": [0.1, 1]}); may be empty.
    cv: number of cross-validation folds (default 5, as before).
    n_jobs: parallel jobs passed to GridSearchCV; -1 uses all cores.

  Returns:
    The best pipeline found, refitted on the full training data.
  """
  # flush so the progress message shows up before the (long) fit starts
  print(f"{base_model} model selection... ", end="", flush=True)

  pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')),
    ('classifier', OneVsRestClassifier(base_model))
  ])

  # Prefix the model's own parameters so they reach the estimator wrapped
  # inside the OneVsRestClassifier step.
  param_grid = {
    'vectorizer__max_features': [1000, 3000],
    **{"classifier__estimator__" + k:v for k,v in model_param_grid.items()}
  }

  grid_search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=n_jobs)
  grid_search.fit(X_train, y_train)

  print("Done")
  return grid_search.best_estimator_


# Hyper-parameter grids for the tuned base learners (the forest uses defaults).
# NOTE(review): `gamma` is not used by the linear kernel, so the linear
# combinations are fitted once per gamma value with identical results.
# Consider searching the two kernels separately.
svc_grid = {"kernel": ["linear", "rbf"], "C": [0.1, 1, 10], "gamma":[0.1, 1]}
knn_grid = {"n_neighbors": [2,5]}

rf = select_best_model(RandomForestClassifier(), {})
svc = select_best_model(SVC(), svc_grid)
knn = select_best_model(KNeighborsClassifier(), knn_grid)

RandomForestClassifier() model selection... Done
SVC() model selection... 

KeyboardInterrupt: ignored

In [None]:
# Test-set predictions of each tuned pipeline (forest, SVC, k-NN).
# NOTE(review): `svc` and `knn` only exist if the model-selection cell ran to
# completion; its saved output shows a KeyboardInterrupt during the SVC search.
y_pred_1 = rf.predict(X_test)
y_pred_2 = svc.predict(X_test)
y_pred_3 = knn.predict(X_test)

In [None]:
# Majority vote: average the three prediction matrices element-wise and round
# to the nearest integer. With 0/1 predictions a label is kept when at least
# two of the three models predict it.
votes = np.stack([y_pred_1, y_pred_2, y_pred_3])
y_pred = np.rint(votes.mean(axis=0))

In [None]:
# Per-label metrics of the majority-vote ensemble. `labels` is reused instead
# of repeating the 20 names; zero_division=0 reports 0.0 for labels that are
# never predicted without raising a warning for each.
report = classification_report(y_test, y_pred, target_names=labels, zero_division=0)
print("\nClassification Report:")
print(report)


# Same report for the k-NN pipeline alone, for comparison with the ensemble.
report = classification_report(y_test, y_pred_3, target_names=labels, zero_division=0)
print("\n\n  Classification Report:")
print(report)

## Classifier chain

In [None]:
from sklearn.metrics import f1_score
from sklearn.multioutput import ClassifierChain

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# Baseline: independent one-vs-rest random forests on TF-IDF features.
# (`base_lr` keeps its historical name; the estimator is a random forest.)
base_lr = OneVsRestClassifier(RandomForestClassifier())
ovr = Pipeline([
  ('vectorizer', vectorizer),
  ('classifier', base_lr)
])
ovr.fit(X_train, y_train)
Y_pred_ovr = ovr.predict(X_test)

# ClassifierChain needs numeric features, so it cannot be fitted on the raw
# strings in X_train. Reuse the TF-IDF vectorizer fitted inside `ovr` so the
# chains see the same features as the baseline.
fitted_vectorizer = ovr.named_steps['vectorizer']
X_train_vec = fitted_vectorizer.transform(X_train)
X_test_vec = fitted_vectorizer.transform(X_test)
Y_train = np.asarray(y_train)
Y_test = np.asarray(y_test)

# One averaging mode for every score so the bars are comparable and match the
# axis label (previously "macro" for the baseline, "samples" for the chains).
F1_AVERAGE = "macro"
# Minimum fraction of chains that must predict a label for the ensemble to
# keep it. It has no effect on a single chain, whose predictions are 0/1.
VOTE_THRESHOLD = 0.25

ovr_f1 = f1_score(Y_test, Y_pred_ovr, average=F1_AVERAGE, zero_division=0)

# Fit an ensemble of classifier chains with random label orders and take the
# average prediction of all the chains.
chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)]
for chain in chains:
    chain.fit(X_train_vec, Y_train)

Y_pred_chains = np.array([chain.predict(X_test_vec) for chain in chains])
chain_f1_scores = [
    f1_score(Y_test, Y_pred_chain >= VOTE_THRESHOLD, average=F1_AVERAGE, zero_division=0)
    for Y_pred_chain in Y_pred_chains
]

Y_pred_ensemble = Y_pred_chains.mean(axis=0)
ensemble_f1 = f1_score(
    Y_test, Y_pred_ensemble >= VOTE_THRESHOLD, average=F1_AVERAGE, zero_division=0
)

model_scores = [ovr_f1] + chain_f1_scores
model_scores.append(ensemble_f1)

# Bar labels are derived from the number of chains so they always line up
# with `model_scores`.
model_names = (
    "Independent",
    *[f"Chain {i}" for i in range(1, len(chains) + 1)],
    "Ensemble",
)

x_pos = np.arange(len(model_names))

# Plot the macro-F1 scores for the independent model, each of the chains,
# and the ensemble (note that the vertical axis on this plot does not
# begin at 0).

fig, ax = plt.subplots(figsize=(7, 4))
ax.grid(True)
ax.set_title("Classifier Chain Ensemble Performance Comparison")
ax.set_xticks(x_pos)
ax.set_xticklabels(model_names, rotation="vertical")
ax.set_ylabel("F1 macro")
ax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1])
colors = ["r"] + ["b"] * len(chain_f1_scores) + ["g"]
ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
plt.tight_layout()
plt.show()