In [1]:
# Standard library
import os

# Libraries for charting and dataset manipulation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# scikit-learn estimators and metrics
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Project-local helpers
from lib.utils import get_data, get_two_classes, my_train_val_test_split


In [2]:
# directories: every path below is built relative to main_path
main_path = "./"
results_path = "{}experiments_03/".format(main_path)

# classes: names used to label the per-class metric columns (see get_metrics)
Y_feat_names = ["ns", "sens"]

# models: each entry pairs a short label (stored under "mod" in the result
# rows) with the estimator instance that is fitted for that label.
models = [
    dict(name=label, model=estimator)
    for label, estimator in (
        ("LR", LogisticRegression(max_iter=1000)),
        ("lSVC", LinearSVC(dual=False)),
        ("KNN", KNeighborsClassifier()),
        ("RF", RandomForestClassifier(n_estimators=100, random_state=0)),
    )
]

# key columns
# Columns of the "XP" feature set.
_xp_cols = [
    'PrivTtl', 'Intimacy', 'Law', 'NegativePrivacy', 'NormsRequisites',
    'OpenVisible', 'OutcomeState', 'PrivateSecret', 'Restriction',
]

# Columns of the "XNP" feature set.
_xnp_cols = [
    'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron',
    'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj',
    'compare', 'interrog', 'number', 'quant', 'affect', 'posemo', 'negemo',
    'anx', 'anger', 'sad', 'social', 'family', 'friend', 'female', 'male',
    'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certain', 'differ',
    'percept', 'see', 'hear', 'feel', 'bio', 'body', 'health', 'sexual',
    'ingest', 'drives', 'affiliation', 'achieve', 'power', 'reward', 'risk',
    'focuspast', 'focuspresent', 'focusfuture', 'relativ', 'motion', 'space',
    'time', 'work', 'leisure', 'home', 'money', 'relig', 'death', 'informal',
    'swear', 'netspeak', 'assent', 'nonflu', 'filler',
]

# "X" is the XP columns followed by the XNP columns; every entry gets its own
# list object so that editing one feature set never affects another.
X_cols_dict = [
    {"name": "X", "cols": _xp_cols + _xnp_cols},
    {"name": "XNP", "cols": list(_xnp_cols)},
    {"name": "XP", "cols": list(_xp_cols)},
]


In [3]:
# training: share of the data for the train / validation / test splits
# (the three shares add up to 1)
train_ratio, validation_ratio, test_ratio = 0.75, 0.15, 0.10

def get_indexes(main_path, datadict_path, ann, train_ratio, validation_ratio, test_ratio, random_state, text=True):
    """Return the train and test ``uri`` values of the two-class annotation data.

    NOTE(review): a second ``def get_indexes(data_df, ...)`` further down in
    this cell rebinds the name, so this version is unreachable once the whole
    cell has run. It is kept only so the name and signature stay available.

    The previous body additionally read ``main_path`` and ``datadict_path``
    with pandas and merged them into ``res_train`` / ``res_test``, but never
    used or returned those frames. That dead work (and the file errors it could
    raise) has been removed; the returned value is unchanged.

    Parameters
    ----------
    main_path : path handed to ``get_data``.
    datadict_path : accepted for backward compatibility; not used.
    ann : forwarded to ``get_data`` as ``lim``.
    train_ratio : accepted for signature compatibility; not used by the split.
    validation_ratio, test_ratio, random_state : forwarded unchanged to
        ``my_train_val_test_split``.
    text : accepted for backward compatibility; not used.

    Returns
    -------
    list
        ``[x_train, x_test]`` exactly as produced by ``my_train_val_test_split``.
    """
    data_loc = get_two_classes(get_data(main_path, lim=ann))
    # Only the first and third of the six split outputs are needed here.
    x_train, _x_val, x_test, _y_train, _y_val, _y_test = my_train_val_test_split(
        data_loc['uri'].values,
        data_loc['class'].values,
        validation_ratio,
        test_ratio,
        random_state,
    )
    return [x_train, x_test]

def get_indexes_1(main_path, ann, train_ratio, validation_ratio, test_ratio, random_state):
    """Split the two-class annotation data and return its train/test ``uri`` values.

    The data comes from ``get_two_classes(get_data(main_path, lim=ann))``; its
    ``uri`` and ``class`` columns are handed to ``my_train_val_test_split``
    together with ``validation_ratio``, ``test_ratio`` and ``random_state``.
    ``train_ratio`` is accepted but not used.

    Returns ``[x_train, x_test]`` (first and third outputs of the split).
    """
    two_class_df = get_two_classes(get_data(main_path, lim=ann))
    uris = two_class_df['uri'].values
    labels = two_class_df['class'].values
    train_uris, _val_uris, test_uris, _y_train, _y_val, _y_test = my_train_val_test_split(
        uris, labels, validation_ratio, test_ratio, random_state
    )
    return [train_uris, test_uris]

def get_indexes_2(main_path, train_ratio, validation_ratio, test_ratio, random_state):
    """Read the CSV at ``main_path`` and return its train/test ``uri`` values.

    The file's ``uri`` and ``class`` columns are passed to
    ``my_train_val_test_split`` with ``validation_ratio``, ``test_ratio`` and
    ``random_state``. ``train_ratio`` is accepted but not used.

    Returns ``[x_train, x_test]`` (first and third outputs of the split).
    """
    frame = pd.read_csv(main_path)
    split_parts = my_train_val_test_split(
        frame['uri'].values,
        frame['class'].values,
        validation_ratio,
        test_ratio,
        random_state,
    )
    # Unpack all six parts so a malformed split result still fails loudly.
    train_uris, _val_uris, test_uris, _y_train, _y_val, _y_test = split_parts
    return [train_uris, test_uris]

def get_indexes(data_df, train_ratio, validation_ratio, test_ratio, random_state):
    """Return the train and test ``uri`` values for an already-loaded frame.

    This definition replaces the earlier, same-named function in this cell and
    is the one ``calculate_metrics`` ends up calling.

    ``data_df`` must provide ``uri`` and ``class`` columns; both are passed to
    ``my_train_val_test_split`` with ``validation_ratio``, ``test_ratio`` and
    ``random_state``. ``train_ratio`` is accepted but not used.

    Returns ``[x_train, x_test]`` (first and third outputs of the split).
    """
    uris = data_df['uri'].values
    labels = data_df['class'].values
    train_uris, _val_uris, test_uris, _y_train, _y_val, _y_test = my_train_val_test_split(
        uris, labels, validation_ratio, test_ratio, random_state
    )
    return [train_uris, test_uris]

def get_metrics(y_test, y_pred, Y_feat_names):
    """Collect accuracy plus per-class and averaged precision/recall/F1 scores.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    Y_feat_names : class names used as key suffixes. The i-th name is paired
        with the i-th entry of each per-class score array.
        NOTE(review): scikit-learn returns per-class scores in sorted label
        order, so the names must be listed in that order -- confirm against
        the values of the ``class`` column.

    Returns
    -------
    dict
        Keys, in insertion order: ``accuracy``, ``precision_<cls>``,
        ``recall_<cls>``, ``f1_<cls>`` for each class name, then ``f1-micro``
        and ``f1-macro``.

    Notes
    -----
    ``zero_division=0`` is passed explicitly. It yields the same value (0) as
    the default ``"warn"`` setting when a class is never predicted or never
    present, but without an UndefinedMetricWarning for every such model.
    """
    res = {"accuracy": metrics.accuracy_score(y_test, y_pred)}

    # One (key prefix, scorer) pair per metric instead of three copied loops;
    # the order here fixes the column order of the resulting table.
    per_class_scorers = (
        ("precision_", metrics.precision_score),
        ("recall_", metrics.recall_score),
        ("f1_", metrics.f1_score),
    )
    for prefix, scorer in per_class_scorers:
        scores = scorer(y_test, y_pred, average=None, zero_division=0)
        for index, cls in enumerate(Y_feat_names):
            res[prefix + cls] = scores[index]

    res["f1-micro"] = metrics.f1_score(y_test, y_pred, average="micro", zero_division=0)
    res["f1-macro"] = metrics.f1_score(y_test, y_pred, average="macro", zero_division=0)
    return res

def calculate_metrics(data_df, datadict_path, model_dict, data_x_dict, train_ratio, validation_ratio, test_ratio, random_state):
    """Fit one model on one feature set and return its test-split metrics.

    Parameters
    ----------
    data_df : frame handed to ``get_indexes`` to obtain the train/test split.
    datadict_path : ';'-separated CSV holding the feature columns and a
        ``class`` column.
    model_dict : ``{"name": ..., "model": ...}``; the estimator is fitted in
        place, so the same instance is refitted on every call that reuses it.
    data_x_dict : ``{"name": ..., "cols": [...]}`` selecting the feature columns.
    train_ratio, validation_ratio, test_ratio, random_state : forwarded to
        ``get_indexes``.

    Returns
    -------
    dict
        The output of ``get_metrics`` (labelled with the module-level
        ``Y_feat_names``) plus ``"mod"`` (model name) and ``"x_col"``
        (feature-set name).

    Notes
    -----
    The values returned by ``get_indexes`` are used as positional indexers into
    the arrays read from ``datadict_path``.
    NOTE(review): this assumes the ``uri`` values of ``data_df`` are row
    positions of that file -- confirm. Malformed lines that the parser skips
    would shift those positions, which is why they are reported as warnings
    rather than dropped silently.
    """
    try:
        # pandas >= 1.3; "warn" matches the old error_bad_lines=False default
        # behaviour (skip the malformed line and report it).
        data = pd.read_csv(datadict_path, sep=';', on_bad_lines='warn')
    except TypeError:
        # Older pandas does not know on_bad_lines; use the legacy keyword.
        data = pd.read_csv(datadict_path, sep=';', error_bad_lines=False)

    dataY = data['class'].values
    dataX = data[data_x_dict["cols"]].values

    train_index, test_index = get_indexes(data_df, train_ratio, validation_ratio, test_ratio, random_state)
    X_train_k, X_test_k = dataX[train_index], dataX[test_index]
    Y_train_k, Y_test_k = dataY[train_index], dataY[test_index]

    model = model_dict["model"]
    model.fit(X_train_k, Y_train_k)
    # Predict once; the earlier duplicate call discarded its result.
    y_pred = model.predict(X_test_k)

    met_dict_loc = get_metrics(Y_test_k, y_pred, Y_feat_names)
    met_dict_loc["mod"] = model_dict["name"]
    met_dict_loc["x_col"] = data_x_dict["name"]
    return met_dict_loc


In [4]:
# Evaluate every model / feature-set pair on the data sets built with
# get_data(..., lim=ann) for ann = 2 and ann = 3, collecting one row per run.
met_dict_loc = []
data_orig_path = "{}data/annotation_results.csv".format(main_path)
for ann in (2, 3):
    print("#### Agreement on", ann, "annotators")
    data_df = get_two_classes(get_data(data_orig_path, lim=ann))
    datadict_path = "{}data/dictclass/datadict{}.csv".format(main_path, ann)
    for model_dict in models:
        print(" - model:", model_dict["name"])
        for X_cols in X_cols_dict:
            # 512 is the random_state forwarded to the train/val/test split.
            met_dict_inn = calculate_metrics(
                data_df, datadict_path, model_dict, X_cols,
                train_ratio, validation_ratio, test_ratio, 512,
            )
            met_dict_inn["data"] = "ann{}".format(ann)
            met_dict_loc.append(met_dict_inn)

met_dict_df_1 = pd.DataFrame(met_dict_loc)
met_dict_df_1

#### Agreement on 2 annotators
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Agreement on 3 annotators
 - model: LR


  _warn_prf(average, modifier, msg_start, len(result))


 - model: lSVC


  _warn_prf(average, modifier, msg_start, len(result))


 - model: KNN
 - model: RF


Unnamed: 0,accuracy,precision_ns,precision_sens,recall_ns,recall_sens,f1_ns,f1_sens,f1-micro,f1-macro,mod,x_col,data
0,0.691781,0.709035,0.641256,0.85267,0.429429,0.774247,0.514388,0.691781,0.644318,LR,X,ann2
1,0.694064,0.709924,0.647059,0.856354,0.429429,0.776294,0.516245,0.694064,0.64627,LR,XNP,ann2
2,0.618721,0.619429,0.0,0.998158,0.0,0.764457,0.0,0.618721,0.382228,LR,XP,ann2
3,0.695205,0.707207,0.657143,0.867403,0.414414,0.779156,0.508287,0.695205,0.643722,lSVC,X,ann2
4,0.69863,0.709145,0.665072,0.871087,0.417417,0.781818,0.512915,0.69863,0.647367,lSVC,XNP,ann2
5,0.618721,0.619429,0.0,0.998158,0.0,0.764457,0.0,0.618721,0.382228,lSVC,XP,ann2
6,0.636986,0.715105,0.521246,0.688766,0.552553,0.701689,0.536443,0.636986,0.619066,KNN,X,ann2
7,0.634703,0.711575,0.518625,0.690608,0.543544,0.700935,0.530792,0.634703,0.615863,KNN,XNP,ann2
8,0.398402,0.648148,0.381995,0.064457,0.942943,0.117253,0.543723,0.398402,0.330488,KNN,XP,ann2
9,0.699772,0.707715,0.673267,0.878453,0.408408,0.783895,0.508411,0.699772,0.646153,RF,X,ann2


In [5]:
# Repeat the evaluation on the ten sample files sample_ann2_01 .. sample_ann2_10,
# each paired with its whtw<N>.csv feature file.
met_dict_loc = []
for index in range(10):
    sample_tag = "{:02d}".format(index + 1)  # zero-padded sample number
    print("#### Sample", (index+1))
    data_df = pd.read_csv("{}data/sample_ann2_{}.csv".format(main_path, sample_tag))
    # The row position doubles as the "uri" that the split helper works on.
    data_df["uri"] = data_df.index
    datadict_path = "{}data/dictclass/whtw{}.csv".format(main_path, index + 1)
    for model_dict in models:
        print(" - model:", model_dict["name"])
        for X_cols in X_cols_dict:
            # 512 is the random_state forwarded to the train/val/test split.
            met_dict_inn = calculate_metrics(
                data_df, datadict_path, model_dict, X_cols,
                train_ratio, validation_ratio, test_ratio, 512,
            )
            met_dict_inn["data"] = "sample_{}".format(sample_tag)
            met_dict_loc.append(met_dict_inn)

met_dict_df_2 = pd.DataFrame(met_dict_loc)
met_dict_df_2

#### Sample 1
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 2
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 3
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 4
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 5
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 6
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 7
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 8
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 9
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF
#### Sample 10
 - model: LR
 - model: lSVC
 - model: KNN
 - model: RF


Unnamed: 0,accuracy,precision_ns,precision_sens,recall_ns,recall_sens,f1_ns,f1_sens,f1-micro,f1-macro,mod,x_col,data
0,0.787671,0.811518,0.742574,0.856354,0.675676,0.833333,0.707547,0.787671,0.770440,LR,X,sample_01
1,0.781963,0.810954,0.729032,0.845304,0.678679,0.827773,0.702955,0.781963,0.765364,LR,XNP,sample_01
2,0.634703,0.634176,0.644444,0.970534,0.087087,0.767103,0.153439,0.634703,0.460271,LR,XP,sample_01
3,0.789954,0.812174,0.747508,0.860037,0.675676,0.835420,0.709779,0.789954,0.772600,lSVC,X,sample_01
4,0.788813,0.812937,0.743421,0.856354,0.678679,0.834081,0.709576,0.788813,0.771828,lSVC,XNP,sample_01
...,...,...,...,...,...,...,...,...,...,...,...,...
115,0.734018,0.802734,0.637363,0.756906,0.696697,0.779147,0.665710,0.734018,0.722429,KNN,XNP,sample_10
116,0.616438,0.626683,0.474576,0.942910,0.084084,0.752941,0.142857,0.616438,0.447899,KNN,XP,sample_10
117,0.797945,0.807047,0.778571,0.885820,0.654655,0.844601,0.711256,0.797945,0.777928,RF,X,sample_10
118,0.791096,0.809278,0.755102,0.867403,0.666667,0.837333,0.708134,0.791096,0.772734,RF,XNP,sample_10


In [6]:
# Persist both result tables. The output directory is created first so that a
# fresh checkout without it does not fail on the first write.
os.makedirs(results_path, exist_ok=True)
met_dict_df_1.to_csv(results_path+"dictclass_agreements.csv", index=False)
met_dict_df_2.to_csv(results_path+"dictclass_samples.csv", index=False)