In [1]:
import pandas as pd
import time

import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

#from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from lib.utils import get_data, get_two_classes


In [2]:
# directories
main_path = "./"
dataset_filename = main_path+"data/annotation_results__ann"
dataset_filename_2 = main_path+"data/annotation_results"
dataset_wt_filename = main_path+"data/sample_ann2_"
results_path = main_path+"experiments_03/"

# classes
Y_feat_names = ["ns", "sens"]

# models
models = [{"name": "LR", "model": LogisticRegression()},
          {"name": "lSVC", "model": LinearSVC()},
          #{"name": "KNN", "model": KNeighborsClassifier()},
          {"name": "RF", "model": RandomForestClassifier(n_estimators=100, random_state=0)},
         ]

In [3]:
STOPWORDS = set(stopwords.words("english"))
STEMMER = SnowballStemmer('english')
LEMMATIZER = WordNetLemmatizer()

#stem_or_lemma: 1 for stemming, 2 for lemmatization, other for none
def clean_text(text, remove_sw=False, stem_or_lemma=0):
    def remove_url(txt):
        return " ".join(re.sub("([^0-9A-Za-z \t])|(\w+:\/\/\S+)", "", txt).split())
    
    res = remove_url(text).lower().split()
    # Remove english stop words
    if remove_sw:
        #stop_words = set(stopwords.words("english"))
        res = [word for word in res if not word in STOPWORDS]
    if stem_or_lemma == 1: # stemming
        #stemmer = SnowballStemmer('english')
        res = [STEMMER.stem(word) for word in res]
        #res = " ".join([stemmer.stem(word) for word in res])
    elif stem_or_lemma == 2: # lemmatization
        #lemmatizer = WordNetLemmatizer()
        res = [LEMMATIZER.lemmatize(word, pos='v') for word in res]

    return res#.split() #remove_url(text).lower().split()

def get_metrics(y_test, y_pred, Y_feat_names):
    res = {}
    res["accuracy"] = metrics.accuracy_score(y_test, y_pred)
    tmp = metrics.precision_score(y_test, y_pred, average=None)
    for index, cls in enumerate(Y_feat_names):
        res["precision_"+cls] = tmp[index]
    res["precision-micro"] = metrics.precision_score(y_test, y_pred, average="micro")
    res["precision-macro"] = metrics.precision_score(y_test, y_pred, average="macro")
    tmp = metrics.recall_score(y_test, y_pred, average=None)
    for index, cls in enumerate(Y_feat_names):
        res["recall_"+cls] = tmp[index]
    tmp = metrics.f1_score(y_test, y_pred, average=None)
    for index, cls in enumerate(Y_feat_names):
        res["f1_"+cls] = tmp[index]
    res["f1-micro"] = metrics.f1_score(y_test, y_pred, average="micro")
    res["f1-macro"] = metrics.f1_score(y_test, y_pred, average="macro")
    return res

## [Sens] BoW: Execution and Performances
using as training sets Sens2 and Sens3, while as test sets Sens2, Sens3 and WH+TW

In [4]:
met_dict = []
experiment_time = time.perf_counter()
for ann in [2,3]:
    print("#### Agreement on", ann, "annotators")
    # get data
    train_loc = pd.read_csv(dataset_filename+str(ann)+"_training.csv")
    test_loc = pd.read_csv(dataset_filename+str(ann)+"_test.csv")
    # clean and stemmming
    train_loc['text_cleaned_stem'] = train_loc['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
    test_loc['text_cleaned_stem'] = test_loc['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
    # bag of words
    tfidfconverter = TfidfVectorizer()
    X_train = tfidfconverter.fit_transform(train_loc['text_cleaned_stem']).toarray()
    X_test = tfidfconverter.transform(test_loc['text_cleaned_stem']).toarray()
    # classes
    y_train = train_loc["class"].tolist()
    y_test = test_loc["class"].tolist()
    for model_dict in models:
        model_name = model_dict["name"]
        model = model_dict["model"]
        model_time = time.perf_counter()
        print(" - model:", model_name)
        # training
        model.fit(X_train, y_train)
        # prediction
        y_pred = model.predict(X_test)
        # model evaluation on test set
        met_dict_loc = get_metrics(y_test, y_pred, Y_feat_names)
        met_dict_loc["set"] = "test"
        met_dict_loc["data"] = "ann"+str(ann)
        met_dict_loc["mod"] = model_name
        met_dict.append(met_dict_loc)
        # model evaluation on other sets
        for index in range(10):
            test_loc_2 = pd.read_csv(dataset_wt_filename+(("0"+str(index+1))[-2:])+".csv")
            test_loc_2['text_cleaned_stem'] = test_loc_2['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
            X_test_loc = tfidfconverter.transform(test_loc_2['text_cleaned_stem']).toarray()
            y_test_loc = test_loc_2["class"].tolist()
            y_pred_loc = model.predict(X_test_loc)
            met_dict_loc = get_metrics(y_test_loc, y_pred_loc, Y_feat_names)
            met_dict_loc["set"] = "sample_"+str(("0"+str(index+1))[-2:])
            met_dict_loc["data"] = "ann"+str(ann)
            met_dict_loc["mod"] = model_name
            met_dict.append(met_dict_loc)
        other_ann = 2
        if (ann == 2):
            other_ann = 3
        test_loc_2 = get_data(dataset_filename_2+".csv", lim=other_ann)
        test_loc_2 = get_two_classes(test_loc_2)
        test_loc_2['text_cleaned_stem'] = test_loc_2['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
        X_test_loc = tfidfconverter.transform(test_loc_2['text_cleaned_stem']).toarray()
        y_test_loc = test_loc_2["class"].tolist()#pd.get_dummies(pd.DataFrame({"class": test_loc_2["class"].tolist()})["class"])[Y_feat_names].values
        y_pred_loc = model.predict(X_test_loc)
        met_dict_loc = get_metrics(y_test_loc, y_pred_loc, Y_feat_names)
        met_dict_loc["set"] = "ann"+str(other_ann)
        met_dict_loc["data"] = "ann"+str(ann)
        met_dict_loc["mod"] = model_name
        met_dict.append(met_dict_loc)
        model_time = time.perf_counter() - model_time
        print("   time:", time.strftime("/%d, %H:%M:%S",time.gmtime(model_time)))

experiment_time = time.perf_counter() - experiment_time
print("Experiment time in seconds:", int(experiment_time))
print("Experiment time:", time.strftime("/%d, %H:%M:%S",time.gmtime(experiment_time)))

met_dict_df_1 = pd.DataFrame(met_dict)
met_dict_df_1

#### Agreement on 2 annotators
 - model: LR
   time: /01, 00:00:13
 - model: lSVC
   time: /01, 00:00:13
 - model: RF
   time: /01, 00:01:33
#### Agreement on 3 annotators
 - model: LR
   time: /01, 00:00:12
 - model: lSVC
   time: /01, 00:00:12
 - model: RF
   time: /01, 00:00:38
Experiment time in seconds: 185
Experiment time: /01, 00:03:05


Unnamed: 0,accuracy,precision_ns,precision_sens,precision-micro,precision-macro,recall_ns,recall_sens,f1_ns,f1_sens,f1-micro,f1-macro,set,data,mod
0,0.721461,0.729647,0.697778,0.721461,0.713712,0.874770,0.471471,0.795645,0.562724,0.721461,0.679184,test,ann2,LR
1,0.588933,0.621442,0.392943,0.588933,0.507192,0.860564,0.146882,0.721712,0.213834,0.588933,0.467773,sample_01,ann2,LR
2,0.581175,0.617106,0.366958,0.581175,0.492032,0.853196,0.138489,0.716196,0.201088,0.581175,0.458642,sample_02,ann2,LR
3,0.584141,0.617958,0.371571,0.584141,0.494765,0.860748,0.133993,0.719421,0.196960,0.584141,0.458190,sample_03,ann2,LR
4,0.590416,0.621066,0.391453,0.590416,0.506260,0.868852,0.137290,0.724355,0.203285,0.590416,0.463820,sample_04,ann2,LR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.578095,0.612242,0.328273,0.578095,0.470258,0.869589,0.103717,0.718569,0.157631,0.578095,0.438100,sample_07,ann3,RF
68,0.572961,0.610254,0.318141,0.572961,0.464197,0.859458,0.106715,0.713728,0.159820,0.572961,0.436774,sample_08,ann3,RF
69,0.581631,0.614654,0.346901,0.581631,0.480777,0.869958,0.112410,0.720354,0.169799,0.581631,0.445076,sample_09,ann3,RF
70,0.581289,0.615830,0.357509,0.581289,0.486669,0.861300,0.125600,0.718169,0.185892,0.581289,0.452031,sample_10,ann3,RF


In [5]:
met_dict_df_1["set"][0]

'test'

In [6]:
def get_df_stats_test(df, mod_name):
    df_loc = df[(df["set"] == "test") & (df["data"] == "ann2") & (df["mod"] == mod_name)][df.columns[:-3]]
    df_res = df_loc.T.reset_index()
    df_res.columns = ["metric", "ann2"]
    df_loc = df[(df["set"] == "test") & (df["data"] == "ann3") & (df["mod"] == mod_name)][df.columns[:-3]]
    df_loc = df_loc.T.reset_index()
    df_res["ann3"] = df_loc[df_loc.columns[-1]]
    return df_res

def get_df_stats_test_rev(df, mod_name):
    df_loc = df[(df["set"] == "ann3") & (df["data"] == "ann2") & (df["mod"] == mod_name)][df.columns[:-3]]
    df_res = df_loc.T.reset_index()
    df_res.columns = ["metric", "ann2 -> ann3"]
    df_loc = df[(df["set"] == "ann2") & (df["data"] == "ann3") & (df["mod"] == mod_name)][df.columns[:-3]]
    df_loc = df_loc.T.reset_index()
    df_res["ann3 -> ann2"] = df_loc[df_loc.columns[-1]]
    return df_res

def get_df_stats(df, ann, mod_name):
    df_loc = df[(df["set"].str[0] == 's') & (df["data"] == "ann"+str(ann)) & (df["mod"] == mod_name)][df.columns[:-3]]
    df_res = df_loc.mean().reset_index(drop=False)
    df_res[1] = df_loc.std().reset_index(drop=False)[0]
    df_res.columns = ["metric", "mean", "std"]
    return df_res

model_names = [x["name"] for x in models]

print("##  metrics on test set")
for mod_name in model_names:
    print("####  model:", mod_name)
    print(get_df_stats_test(met_dict_df_1, mod_name))

print("##  metrics on the other set")
for mod_name in model_names:
    print("####  model:", mod_name)
    print(get_df_stats_test_rev(met_dict_df_1, mod_name))
    
for ann in [2,3]:
    print("\n##  agreement on", ann, "annotators: test on 10 samples")
    for mod_name in model_names:
        print("####  model:", mod_name)
        print(get_df_stats(met_dict_df_1, ann, mod_name))

##  metrics on test set
####  model: LR
             metric      ann2      ann3
0          accuracy  0.721461  0.779703
1      precision_ns  0.729647  0.769716
2    precision_sens  0.697778  0.816092
3   precision-micro  0.721461  0.779703
4   precision-macro  0.713712  0.792904
5         recall_ns  0.874770  0.938462
6       recall_sens  0.471471  0.493056
7             f1_ns  0.795645  0.845754
8           f1_sens  0.562724  0.614719
9          f1-micro  0.721461  0.779703
10         f1-macro  0.679184  0.730236
####  model: lSVC
             metric      ann2      ann3
0          accuracy  0.705479  0.799505
1      precision_ns  0.752212  0.835206
2    precision_sens  0.620579  0.729927
3   precision-micro  0.705479  0.799505
4   precision-macro  0.686396  0.782566
5         recall_ns  0.782689  0.857692
6       recall_sens  0.579580  0.694444
7             f1_ns  0.767148  0.846300
8           f1_sens  0.599379  0.711744
9          f1-micro  0.705479  0.799505
10         f1-macro  0

## [WH+TW] BoW: Execution and Performances
using as training sets Sens2 and Sens3, while as test sets Sens2, Sens3 and WH+TW

In [7]:
met_dict = []
experiment_time = time.perf_counter()
for index in range(10):
    # get data
    train_loc = pd.read_csv(dataset_wt_filename+(("0"+str(index+1))[-2:])+"_training.csv")
    test_loc = pd.read_csv(dataset_wt_filename+(("0"+str(index+1))[-2:])+"_test.csv")
    # clean and stemmming
    train_loc['text_cleaned_stem'] = train_loc['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
    test_loc['text_cleaned_stem'] = test_loc['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
    # bag of words
    tfidfconverter = TfidfVectorizer()
    X_train = tfidfconverter.fit_transform(train_loc['text_cleaned_stem']).toarray()
    X_test = tfidfconverter.transform(test_loc['text_cleaned_stem']).toarray()
    # classes
    y_train = train_loc["class"].tolist()
    y_test = test_loc["class"].tolist()
    for model_dict in models:
        model_name = model_dict["name"]
        model = model_dict["model"]
        model_time = time.perf_counter()
        print(" - model:", model_name)
        # training
        model.fit(X_train, y_train)
        # prediction
        y_pred = model.predict(X_test)
        # model evaluation on test set
        met_dict_loc = get_metrics(y_test, y_pred, Y_feat_names)
        met_dict_loc["set"] = "test"
        met_dict_loc["data"] = "sample_"+str(("0"+str(index+1))[-2:])
        met_dict_loc["mod"] = model_name
        met_dict.append(met_dict_loc)
        # model evaluation on other sets
        for ann in [2,3]:
            # get data
            test_loc_2 = get_data(dataset_filename_2+".csv", lim=ann)
            test_loc_2 = get_two_classes(test_loc_2)
            test_loc_2['text_cleaned_stem'] = test_loc_2['text'].map(lambda x: " ".join(clean_text(x, remove_sw=True, stem_or_lemma=1)))
            X_test_loc = tfidfconverter.transform(test_loc_2['text_cleaned_stem']).toarray()
            y_test_loc = test_loc_2["class"].tolist()
            y_pred_loc = model.predict(X_test_loc)
            met_dict_loc = get_metrics(y_test_loc, y_pred_loc, Y_feat_names)
            met_dict_loc["set"] = "ann"+str(ann)
            met_dict_loc["data"] = "sample_"+str(("0"+str(index+1))[-2:])
            met_dict_loc["mod"] = model_name
            met_dict.append(met_dict_loc)
            model_time = time.perf_counter() - model_time
        print("   time:", time.strftime("/%d, %H:%M:%S",time.gmtime(model_time)))

experiment_time = time.perf_counter() - experiment_time
print("Experiment time in seconds:", int(experiment_time))
print("Experiment time:", time.strftime("/%d, %H:%M:%S",time.gmtime(experiment_time)))

met_dict_df_2 = pd.DataFrame(met_dict)
met_dict_df_2

 - model: LR
   time: /01, 04:22:32
 - model: lSVC
   time: /01, 04:22:35
 - model: RF
   time: /01, 04:22:38
 - model: LR
   time: /01, 04:23:35
 - model: lSVC
   time: /01, 04:23:39
 - model: RF
   time: /01, 04:23:42
 - model: LR
   time: /01, 04:24:37
 - model: lSVC
   time: /01, 04:24:40
 - model: RF
   time: /01, 04:24:43
 - model: LR
   time: /01, 04:25:40
 - model: lSVC
   time: /01, 04:25:43
 - model: RF
   time: /01, 04:25:46
 - model: LR
   time: /01, 04:26:43
 - model: lSVC
   time: /01, 04:26:46
 - model: RF
   time: /01, 04:26:49
 - model: LR
   time: /01, 04:27:43
 - model: lSVC
   time: /01, 04:27:46
 - model: RF
   time: /01, 04:27:49
 - model: LR
   time: /01, 04:28:43
 - model: lSVC
   time: /01, 04:28:46
 - model: RF
   time: /01, 04:28:49
 - model: LR
   time: /01, 04:29:45
 - model: lSVC
   time: /01, 04:29:48
 - model: RF
   time: /01, 04:29:51
 - model: LR
   time: /01, 04:30:46
 - model: lSVC
   time: /01, 04:30:49
 - model: RF
   time: /01, 04:30:52
 - model: 

Unnamed: 0,accuracy,precision_ns,precision_sens,precision-micro,precision-macro,recall_ns,recall_sens,f1_ns,f1_sens,f1-micro,f1-macro,set,data,mod
0,0.807078,0.801613,0.820312,0.807078,0.810963,0.915285,0.630631,0.854686,0.713073,0.807078,0.783880,test,sample_01,LR
1,0.578779,0.614291,0.347341,0.578779,0.480816,0.859827,0.121403,0.716610,0.179920,0.578779,0.448265,ann2,sample_01,LR
2,0.591696,0.635714,0.309524,0.591696,0.472619,0.855111,0.117036,0.729269,0.169849,0.591696,0.449559,ann3,sample_01,LR
3,0.821918,0.843694,0.782748,0.821918,0.813221,0.874770,0.735736,0.858951,0.758514,0.821918,0.808733,test,sample_01,lSVC
4,0.554022,0.613875,0.362984,0.554022,0.488429,0.754651,0.227518,0.677022,0.279713,0.554022,0.478367,ann2,sample_01,lSVC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.563605,0.618885,0.378900,0.563605,0.498893,0.769018,0.229317,0.685832,0.285714,0.563605,0.485773,ann2,sample_10,lSVC
86,0.569698,0.637496,0.337705,0.569698,0.487600,0.767102,0.213989,0.696320,0.261975,0.569698,0.479147,ann3,sample_10,lSVC
87,0.795662,0.811644,0.763699,0.795662,0.787671,0.872928,0.669670,0.841171,0.713600,0.795662,0.777386,test,sample_10,RF
88,0.570679,0.615087,0.360183,0.570679,0.487635,0.820041,0.164868,0.702929,0.226198,0.570679,0.464563,ann2,sample_10,RF


In [8]:

def get_df_stats_2(df):
    print("##  metrics on test set of 10 samples")
    for mod_name in model_names:
        print("####  model:", mod_name)
        #print(get_df_stats_test(met_dict_df_1, mod_name))
        df_loc = df[(df["set"] == "test") & (df["mod"] == mod_name)][df.columns[:-2]]
        df_res = df_loc.mean().reset_index(drop=False)
        df_res[1] = df_loc.std().reset_index(drop=False)[0]
        df_res.columns = ["metric", "mean", "std"]
        print(df_res)
        for ann in [2,3]:
            print("\n##  10 samples using as test set the agreement on", ann, "annotators")
            #print("####  model:", mod_name)
            df_loc = df[(df["set"] == "ann"+str(ann)) & (df["mod"] == mod_name)][df.columns[:-3]]
            df_res = df_loc.mean().reset_index(drop=False)
            df_res[1] = df_loc.std().reset_index(drop=False)[0]
            df_res.columns = ["metric", "mean", "std"]
            print(df_res)
    return

get_df_stats_2(met_dict_df_2)

##  metrics on test set of 10 samples
####  model: LR
             metric      mean       std
0          accuracy  0.820662  0.009044
1      precision_ns  0.815262  0.012567
2    precision_sens  0.834209  0.014993
3   precision-micro  0.820662  0.009044
4   precision-macro  0.824735  0.008688
5         recall_ns  0.919337  0.010301
6       recall_sens  0.659760  0.030396
7             f1_ns  0.864071  0.005945
8           f1_sens  0.736316  0.017841
9          f1-micro  0.820662  0.009044
10         f1-macro  0.800194  0.011624

##  10 samples using as test set the agreement on 2 annotators
             metric      mean       std
0          accuracy  0.584073  0.003560
1      precision_ns  0.616099  0.001657
2    precision_sens  0.357317  0.011550
3   precision-micro  0.584073  0.003560
4   precision-macro  0.486708  0.006598
5         recall_ns  0.871597  0.007625
6       recall_sens  0.116157  0.007280
7             f1_ns  0.721898  0.003300
8           f1_sens  0.175229  0.009032
9 

In [10]:
met_dict_df_1.to_csv(results_path+"bow_agreements.csv", index=False)
met_dict_df_2.to_csv(results_path+"bow_samples.csv", index=False)