# Automatic Winner Prediction

In this notebook, we:
* load the alignment measures previously calculated and saved with the `calculate_alignment.ipynb` notebook,
* look at the values of these measures when either side wins and run tests of statistical significance,
* run the classification experiment (training and evaluation).


In [1]:
import numpy as np
from scipy.stats import spearmanr
import pickle
from utils import obtain_winning_sides, load_cluster_info, load_iq2
from scipy.stats import shapiro, mannwhitneyu, ttest_ind
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from collections import defaultdict


# Map every ConvID in the loaded IQ2 data to its ConvTitle.
ori_data = load_iq2()
convid_to_title = dict(zip(ori_data['ConvID'], ori_data['ConvTitle']))


# `winners` is indexed by debate id; `all_results` holds one record per debate
# with (at least) a 'winner' entry.
winners, all_results = obtain_winning_sides()

# Alignment measures computed beforehand and stored as a pickle file.
# The context manager closes the file handle as soon as loading is done.
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
with open("measures_all_debates.pkl", "rb") as measures_file:
    measures_all_debates = pickle.load(measures_file)

cluster_data = load_cluster_info("debates_full_chains")

Dataset already exists at /home/aina/.convokit/downloads/iq2-corpus
Dataset already exists at /home/aina/.convokit/downloads/iq2-corpus


In [2]:


def select_features(measures_all_debates, setting, data_split, all_debate_results):
    """Build feature matrices and winner labels for every subset of a data split.

    Parameters
    ----------
    measures_all_debates : dict
        Nested mapping indexed as ``[debate_id][vocabulary][mask][measure]``.
        Measures treated as symmetric contribute their value directly; every
        other selected measure is indexed by side (``'for'`` / ``'against'``).
    setting : dict
        Must provide ``'vocabulary'``, ``'mask'``, ``'similarity'`` and
        ``'measures_to_include'``. The latter is one of ``'ours'``,
        ``'ours_asym'``, ``'dialign_asym'``, ``'ours_asym+dialign_asym'`` or
        ``'baseline_tokens'``. ``'similarity'`` is matched as a substring of the
        measure name, so an empty string keeps every similarity variant.
    data_split : dict
        Maps a subset name (e.g. ``'train'``, ``'test'``) to a list of debate ids.
    all_debate_results : dict
        Maps a debate id to a record whose ``'winner'`` is ``'for'`` or ``'against'``.

    Returns
    -------
    X : dict
        Subset name -> ``np.array`` built from the per-debate feature vectors.
    y : dict
        Subset name -> ``np.array`` of labels (0 for 'for', 1 for 'against').
    list_of_feature_names : list of str
        Names of the feature columns, in column order, taken from the first
        debate that is processed (whatever its subset is called).

    Raises
    ------
    ValueError
        If ``setting['measures_to_include']`` is not a known group, or if a
        debate's winner is neither 'for' nor 'against'.
    """
    sides = ['for', 'against']

    ours_symmetric = ["TUOS_cos", "TUOS_eucl", 'sApp_cos', 'sApp_eucl', 'SV']
    ours_asymmetric = ["TUSS_cos", "TUSS_eucl", 'TASS_cos', 'TASS_eucl', 'DS_cos', 'DS_eucl', 'asApp_cos', 'asApp_eucl']
    dialign_symmetric = ['Expression Repetition', 'Voc. Overlap', 'Num. utterances', 'Num. tokens', 'Expression Lexicon Size (ELS)', 'Expression Variety (EV)', 'Expression Repetition (ER)', 'ENTR', 'L', 'LMAX']
    dialign_asymmetric = ['Initiated Expression', 'tokens (%)', 'SR/Voc. Overlap', 'SR/ELS', 'SR/EV', 'SR/ER', 'SR/ENTR', 'SR/L', 'SR/LMAX']

    measure_groups = {
        "ours": ours_asymmetric + ours_symmetric,
        "ours_asym": ours_asymmetric,
        "dialign_asym": dialign_asymmetric,
        "ours_asym+dialign_asym": ours_asymmetric + dialign_asymmetric,
        "baseline_tokens": ['tokens (%)', 'Num. tokens', 'Num. utterances'],
    }
    group_name = setting['measures_to_include']
    if group_name not in measure_groups:
        # Fail early with a readable message instead of an UnboundLocalError later on.
        raise ValueError(
            "Unknown measures_to_include %r; expected one of %s"
            % (group_name, sorted(measure_groups))
        )

    # Sets are built once so the membership tests in the inner loop stay cheap.
    measures_to_include = set(measure_groups[group_name])
    symmetric_measures = set(ours_symmetric + dialign_symmetric)
    similarity_based_measure_names = {
        m for m in ours_symmetric + ours_asymmetric if "_cos" in m or "_eucl" in m
    }

    vocabulary = setting['vocabulary']
    similarity = setting['similarity']

    list_of_feature_names = []
    names_recorded = False  # feature names are collected from the first debate only

    X = dict()
    y = dict()
    for subset in data_split:
        X[subset] = []
        y[subset] = []
        for debate_id in data_split[subset]:
            v = []
            record_names = not names_recorded
            # pick the relevant features, build the feature vector, and append it to X[subset]
            info_this_debate = measures_all_debates[debate_id][vocabulary][setting['mask']]
            for measure in info_this_debate:
                if measure not in measures_to_include:
                    continue
                # Keep only the similarity/distance variant requested by the setting.
                if measure in similarity_based_measure_names and similarity not in measure:
                    continue
                if measure in symmetric_measures:
                    v.append(info_this_debate[measure])
                    if record_names:
                        list_of_feature_names.append(vocabulary + "%" + measure)
                else:  # asymmetric measure: one feature per side
                    for side in sides:
                        v.append(info_this_debate[measure][side])
                        if record_names:
                            list_of_feature_names.append(vocabulary + "%" + measure + "-" + side)
            names_recorded = True

            X[subset].append(v)

            yval = sides.index(all_debate_results[debate_id]['winner'])  # 0 for for, 1 for against
            y[subset].append(yval)

        X[subset] = np.array(X[subset])
        y[subset] = np.array(y[subset])

    return X, y, list_of_feature_names

### Measure values when different sides win the debate + significance 

In [3]:
def cohen_d(x,y):
    """Return Cohen's d for two samples: the difference of their means divided
    by the pooled standard deviation (sample variances, ddof=1)."""
    size_x, size_y = len(x), len(y)
    pooled_dof = size_x + size_y - 2

    var_x = np.std(x, ddof=1) ** 2
    var_y = np.std(y, ddof=1) ** 2
    pooled_sd = np.sqrt(((size_x-1)*var_x + (size_y-1)*var_y) / pooled_dof)

    mean_gap = np.mean(x) - np.mean(y)
    return mean_gap / pooled_sd

mask_type = "no-mask"
vocab_option = "tfidf200"
# measure name + "%" + side -> list of values, one per non-tied debate
xs = defaultdict(list)

# Tied debates are left out; `wnss` holds the winning side of each remaining
# debate, in the same order as `all_debate_ids`.
all_debate_ids = [di for di in all_results if all_results[di]['winner'] != "tie"]
wnss = [all_results[debate_id]['winner'] for debate_id in all_debate_ids]


for debate_id in all_debate_ids:
    measures_this_debate = measures_all_debates[debate_id][vocab_option][mask_type]
    for measure, value in measures_this_debate.items():
        # Only asymmetric measures (stored as a per-side mapping) are collected.
        # isinstance also accepts dict subclasses, unlike an exact type comparison.
        if isinstance(value, dict):
            for side, side_value in value.items():
                xs[measure + "%" + side].append(side_value)

# Run statistical tests
# Ttest if normality, Mann whitney U otherwise
# NOTE(review): the p-values below are not corrected for multiple comparisons.
winners_measures_significance = []
for measurename in xs:
    # Every measure must have exactly one value per non-tied debate.
    assert len(wnss) == len(xs[measurename])
    for_values = [x for x, w in zip(xs[measurename], wnss) if w == "for"] # values in debates where FOR won
    against_values = [x for x, w in zip(xs[measurename], wnss) if w == "against"] # values in debates where AGAINST won

    # Shapiro-Wilk on both groups decides between the parametric and the rank-based test.
    if shapiro(for_values)[1] > 0.05 and shapiro(against_values)[1] > 0.05:
        pval = ttest_ind(for_values, against_values)[1]
    else:
        # Two-sided is requested explicitly so the result matches the two-sided
        # t-test above regardless of the installed SciPy version's default.
        pval = mannwhitneyu(for_values, against_values, alternative="two-sided")[1]

    d = cohen_d(for_values, against_values) # effect size

    winners_measures_significance.append((measurename, pval, d))

# Most significant measures first (sorted by p-value).
sorted_winners_measures_significance = sorted(winners_measures_significance, key=lambda entry: entry[1])
for m, p, d in sorted_winners_measures_significance:
    # builtin round() works for numpy scalars and plain Python floats alike
    print(m, "\tp =", round(p, 3), "\td =", round(d, 3))

TASS_cos%against 	p = 0.017 	d = -0.476
TASS_cos%for 	p = 0.037 	d = -0.412
TASS_eucl%against 	p = 0.046 	d = 0.394
TUSS_cos%against 	p = 0.063 	d = -0.367
TASS_eucl%for 	p = 0.084 	d = 0.341
SR/Voc. Overlap%for 	p = 0.096 	d = -0.328
SR/EV%for 	p = 0.19 	d = -0.254
TUSS_cos%for 	p = 0.223 	d = -0.239
SR/LMAX%against 	p = 0.228 	d = 0.294
TUSS_eucl%against 	p = 0.236 	d = 0.233
SR/EV%against 	p = 0.272 	d = -0.257
Initiated Expression%for 	p = 0.313 	d = 0.198
Initiated Expression%against 	p = 0.313 	d = -0.198
DS_eucl%for 	p = 0.32 	d = -0.195
SR/ENTR%against 	p = 0.358 	d = 0.123
SR/L%against 	p = 0.385 	d = 0.316
Expression Repetition%against 	p = 0.404 	d = 0.164
TUSS_eucl%for 	p = 0.406 	d = 0.163
SR/ELS%for 	p = 0.469 	d = 0.142
DS_eucl%against 	p = 0.477 	d = -0.139
tokens (%)%against 	p = 0.509 	d = -0.129
tokens (%)%for 	p = 0.509 	d = 0.129
asApp_eucl%against 	p = 0.523 	d = -0.125
DS_cos%against 	p = 0.542 	d = -0.119
SR/ELS%against 	p = 0.558 	d = 0.03
SR/ENTR%for 	p = 0.56

# Leave-one-out setting


In [4]:
# Leave-one-out cross-validation over the debates in `cluster_data`,
# skipping every debate whose recorded winner is a tie.

loo = LeaveOneOut()
all_debate_ids = [debate_id for debate_id in cluster_data if winners[debate_id] != "tie"]
# Number of splits; shown as the cell output.
loo.get_n_splits(all_debate_ids)


105

In [5]:
# Build the list of experimental settings to evaluate.

mask_type = "no-mask"

# Available measure groups:
#   ours                   -> our symmetric + asymmetric measures
#   ours_asym              -> our asymmetric measures only
#   dialign_asym           -> dialign asymmetric measures only
#   ours_asym+dialign_asym -> our asymmetric + dialign asymmetric measures
#   baseline_tokens        -> num tokens, num utterances, tokens (%)

# Settings that do not depend on a similarity/distance choice.
all_settings = [
    {'mask': mask_type, 'vocabulary': 'all', 'similarity': '', 'measures_to_include': measures}
    for measures in ('dialign_asym', 'baseline_tokens')
]

# Every distance x vocabulary x measure-group combination for our measures.
for distance in ["cos", "eucl"]:
    for vocab in ["all", "tfidf200", "tfidf200_C"]:
        for measures in ('ours', 'ours_asym', 'ours_asym+dialign_asym'):
            all_settings.append(
                {'mask': mask_type, 'vocabulary': vocab, 'similarity': distance, 'measures_to_include': measures}
            )
    


### Training loop

In [6]:

sides = ['for','against']

# Start every setting with an empty prediction list so re-running the cell
# does not accumulate predictions from an earlier run.
for setting in all_settings:
    setting['predictions_classification'] = []


truths_classif = []
for loo_n, (train_idcs, test_idcs) in enumerate(loo.split(all_debate_ids)):
    # The split is the same for every setting of this fold.
    splitting = {
        'train': [all_debate_ids[idx] for idx in train_idcs],
        "test": [all_debate_ids[idx] for idx in test_idcs],
    }

    # Gold label of the held-out debate: 0 for 'for', 1 for 'against'.
    held_out_id = splitting["test"][0]
    truths_classif.append(sides.index(all_results[held_out_id]['winner']))

    for setting in all_settings:
        X, y, list_of_feature_names = select_features(measures_all_debates, setting, splitting, all_results)

        # Standardise with statistics from the training fold only.
        scaler = StandardScaler()
        X['train'] = scaler.fit_transform(X['train'])
        X['test'] = scaler.transform(X['test'])

        logreg = LogisticRegression(solver='liblinear')
        logreg.fit(X['train'], y['train'])

        # Single held-out debate -> take the first prediction.
        prediction = logreg.predict(X['test'])[0]
        setting['predictions_classification'].append(prediction)
        


In [7]:

# evaluate all settings
for setting in all_settings:
    setting['accuracy'] = accuracy_score(truths_classif, setting['predictions_classification'])

# Accuracy keyed by the setting's configuration (predictions/results left out of the key).
settings_results = dict()
for setting in all_settings:
    newsetting = {d:setting[d] for d in setting.keys() if 'predictions' not in d and "result" not in d}
    settings_results[tuple(newsetting.items())] = setting['accuracy']


# Majority baseline: always predict the most frequent gold label. The label is
# computed from the data instead of being hard-coded, so the baseline remains
# correct whichever side wins more often.
majority_label = int(np.bincount(truths_classif).argmax())
majoritybaseline_setting = {'similarity':'','vocabulary':'', 'measures_to_include': 'majority baseline', 'accuracy': accuracy_score(truths_classif, [majority_label]*len(truths_classif))}


sorted_results = sorted(all_settings + [majoritybaseline_setting], key= lambda i: i['accuracy'], reverse=True)


print("MEASURES\tSIM/DIST\tVOCAB\tACC")
for setting in sorted_results:
    # builtin round() works whether accuracy_score returned a numpy scalar or a plain float
    print(setting['measures_to_include'] + "\t" + setting['similarity'] + "\t" + setting['vocabulary'] + "\t" + str(round(setting['accuracy'], 2)))

MEASURES	SIM/DIST	VOCAB	ACC
ours_asym	cos	all	0.57
ours_asym	cos	tfidf200_C	0.57
ours	eucl	tfidf200	0.57
ours_asym	eucl	tfidf200	0.57
ours_asym	eucl	tfidf200_C	0.57
ours	cos	tfidf200	0.55
ours_asym	cos	tfidf200	0.54
ours_asym+dialign_asym	cos	tfidf200_C	0.54
ours	eucl	tfidf200_C	0.54
ours_asym+dialign_asym	eucl	tfidf200_C	0.54
dialign_asym		all	0.52
ours_asym+dialign_asym	cos	all	0.52
ours_asym	eucl	all	0.52
ours_asym+dialign_asym	eucl	all	0.52
ours_asym+dialign_asym	eucl	tfidf200	0.51
ours	cos	tfidf200_C	0.5
majority baseline			0.5
ours_asym+dialign_asym	cos	tfidf200	0.5
baseline_tokens		all	0.49
ours	cos	all	0.49
ours	eucl	all	0.47
