In [None]:
# Mount Google Drive in the Colab runtime at /content/drive/; a later cell changes the
# working directory to a folder underneath this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Dataset tag ("gebruikte dataset" is Dutch for "dataset in use"). Later cells use it as a
# substring filter on file names and as part of the result labels / output file name.
gebruikte_dataset = 'covid'

import pandas as pd
import numpy as np
import os
import pickle
import sklearn
import itertools
import functools
import operator
import collections
from sklearn.metrics.pairwise import pairwise_kernels

# The later os.listdir(), pd.read_csv(), open() and to_csv() calls use bare file names,
# so they all resolve against this Drive folder.
os.chdir('/content/drive/My Drive/Rumour Identification COVID-19/Modellen')

In [None]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

def explain_instance_with_data(neighborhood_data,
                               labels_column,
                               weights,
                               used_feat):
    """Fit a weighted ridge surrogate on the selected features and rank them.

    Parameters
    ----------
    neighborhood_data : pandas.DataFrame
        Interpretable representation of the perturbed samples, one row per sample.
    labels_column : array-like
        Black-box predictions for the rows of ``neighborhood_data``. May be 1-D or
        2-D; with 2-D labels only the coefficients of the first target are reported.
    weights : array-like
        Sample weight for every row, passed to ``Ridge.fit`` and ``Ridge.score``.
    used_feat : iterable of str
        Column names to use in the surrogate model.

    Returns
    -------
    tuple
        ``(ranking, score)`` where ``ranking`` is a list of ``(feature, coefficient)``
        pairs sorted by decreasing absolute coefficient and ``score`` is the weighted
        R^2 of the surrogate on the data it was fitted on.
    """
    selected_frame = neighborhood_data[neighborhood_data.columns.intersection(used_feat)]
    # The coefficients follow the column order of the frame that is actually fitted.
    # That order is not guaranteed to match the order of ``used_feat``, so the names
    # must be taken from the frame; zipping with ``used_feat`` pairs features with the
    # wrong coefficients.
    feature_names = list(selected_frame.columns)

    selected_data = StandardScaler().fit_transform(selected_frame)

    easy_model = Ridge(alpha=1, fit_intercept=True)
    easy_model.fit(selected_data, labels_column, sample_weight=weights)
    prediction_score = easy_model.score(selected_data, labels_column, sample_weight=weights)

    # coef_ is (n_features,) for 1-D labels and (n_targets, n_features) for 2-D labels;
    # atleast_2d lets both cases share the "first target" lookup.
    coefficients = np.atleast_2d(easy_model.coef_)[0]

    return (sorted(zip(feature_names, coefficients),
                   key=lambda x: np.abs(x[1]), reverse=True),
            prediction_score)

def feat_selection(data, labels, weights, num_features):
    """Select the ``num_features`` columns with the largest ridge weights.

    A ridge regression (alpha=0.01) is fitted on the standardised ``data`` with the
    given sample ``weights``; features are ranked by the absolute value of their
    coefficient and the strongest ones are returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Candidate features, one column per feature.
    labels : array-like
        Regression targets, 1-D or 2-D. With 2-D labels the first target is used.
    weights : array-like
        Sample weight for every row of ``data``.
    num_features : int
        Maximum number of feature names to return.

    Returns
    -------
    numpy.ndarray
        Column labels ordered from the largest to the smallest absolute coefficient.
    """
    clf = Ridge(alpha=0.01, fit_intercept=True)
    scaled = StandardScaler().fit_transform(data)
    clf.fit(scaled, labels, sample_weight=weights)
    # coef_ is 1-D for 1-D labels and (n_targets, n_features) for 2-D labels.
    magnitudes = np.abs(np.atleast_2d(clf.coef_)[0])
    # argsort sorts ascending; reverse it so the slice keeps the *largest* weights
    # instead of the least informative features.
    ranking = np.argsort(magnitudes)[::-1][:num_features]
    # Index a plain 1-D array: multi-dimensional indexing on a pandas Index is not
    # supported by recent pandas versions.
    used_features = data.columns.to_numpy()[ranking]
    return(used_features)

In [None]:
# Training-fold files of the selected dataset. The match is case-insensitive, but the
# names are kept exactly as they are on disk: later cells pass them to pd.read_csv(),
# which would not find a lowercased name on a case-sensitive file system.
files = [
    name for name in os.listdir()
    if gebruikte_dataset in name.lower() and 'train' in name.lower()
]
files

['covid_train3',
 'covid_train5',
 'covid_train10',
 'covid_train4',
 'covid_train6',
 'covid_train1',
 'covid_train2',
 'covid_train7',
 'covid_train9',
 'covid_train8']

In [None]:
# Read every training fold, stack them and keep one row per 'Unnamed: 0' value
# (the first occurrence wins), then renumber the rows.
fold_frames = [pd.read_csv(fold_file).reset_index(drop=True) for fold_file in files]
appended_data = (
    pd.concat(fold_frames)
    .drop_duplicates(subset='Unnamed: 0')
    .reset_index(drop=True)
)

# The row with 'Unnamed: 0' == 1629 gets a fixed placeholder as its cleaned text.
is_row_1629 = appended_data['Unnamed: 0'] == 1629
appended_data['cleaned_text'] = np.where(is_row_1629,
                                         'empty text with url',
                                         appended_data['cleaned_text'])

In [None]:
# Locate the pickle with the permuted samples of the selected dataset. The match is
# case-insensitive, but the real file name is kept so that open() also works on a
# case-sensitive file system.
permutation_candidates = [
    name for name in os.listdir()
    if gebruikte_dataset in name.lower() and 'permuted' in name.lower()
]
if not permutation_candidates:
    raise FileNotFoundError(
        "no file containing '{}' and 'permuted' found in {}".format(gebruikte_dataset, os.getcwd())
    )
permutations = permutation_candidates[0]

# NOTE(review): pickle.load can execute arbitrary code while deserialising; only load
# files that were produced by this project.
with open(permutations, 'rb') as f:
    save_dict = pickle.load(f)

In [None]:
import keras
import warnings

# NOTE(review): this silences every warning for the rest of the session, including
# warnings raised by later cells.
warnings.filterwarnings("ignore")

# One entry per (model type, fold): [fold file, model file, summed importances, fidelity].
final_stability_metrics = []


def _select_for_fold(candidates, fold_number):
    """Return the first name in ``candidates`` that contains ``str(fold_number)``.

    Matching is by substring, so '1' also matches the fold-10 files; for fold 1 every
    name containing '10' is therefore discarded first.
    """
    token = str(fold_number)
    matches = [el for el in candidates if token in el]
    if fold_number == 1:
        matches = [el for el in matches if '10' not in el]
    if not matches:
        raise FileNotFoundError('no file found for fold {}'.format(fold_number))
    return matches[0]


def _explain_observation(model, true_obs, permutations_obs, hybrid):
    """Explain one observation with a local ridge surrogate.

    ``permutations_obs`` is the frame of perturbed samples stored for the observation;
    it must contain the columns 'id', 'text' and '0'..'199'. Returns whatever
    ``explain_instance_with_data`` returns for the 20 features picked by
    ``feat_selection``.
    """
    tussen = permutations_obs.drop(columns='id')
    # Surrogate inputs: every column whose name does not start with a digit, minus 'text'.
    interpretable_x = tussen.filter(regex=r'^\D').drop(columns='text')
    # Black-box inputs: everything except the 'bow_' columns, 'text' and '0'..'199'.
    x_lime = tussen[tussen.columns.drop(list(tussen.filter(regex='bow_')))]
    x_lime = x_lime.drop(columns='text')
    x_lime = x_lime.drop(columns=[str(el) for el in range(200)])
    true_obs = true_obs[x_lime.columns]

    # Columns with a literal dot in their name form the text input of the model.
    x_lime_text = x_lime.filter(regex=r'\.')
    if hybrid:
        # Hybrid models take two inputs: the remaining columns and the text columns.
        x_lime_meta = x_lime[x_lime.columns.drop(list(x_lime_text))]
        x_lime_meta = np.asarray(x_lime_meta).astype(np.float32)
        text_input = np.asarray(x_lime_text).astype(np.float32)
        y_lime = model.predict([x_lime_meta, text_input], verbose=0)
    else:
        y_lime = model.predict(x_lime_text, verbose=0)

    kernel_values = pairwise_kernels(true_obs.values.reshape(1, -1), x_lime, metric='linear')[0]
    lowest = min(kernel_values)
    if lowest < 0:
        # Shift the kernel values so that the smallest sample weight becomes 1.
        kernel_values = kernel_values - (lowest - 1)
    used_features = feat_selection(interpretable_x, y_lime, kernel_values, 20)
    return explain_instance_with_data(interpretable_x, y_lime, kernel_values, used_features)


for model_type in ['HybridCNN', 'HybridLSTM', 'HybridGRU']:
    hybrid = 'Hybrid' in model_type
    for stap in range(10):
        models = os.listdir()
        if not hybrid:
            models = [el for el in models if 'Hybrid' not in el]
        models = [el for el in models if gebruikte_dataset in el]
        models = [el for el in models if model_type in el]

        model_str = _select_for_fold(models, stap + 1)
        fold = _select_for_fold(files, stap + 1)
        print(model_type)
        print(stap)

        train = pd.read_csv(fold)
        model = keras.models.load_model(model_str)
        if hybrid:
            train['cleaned_text'] = np.where(train['Unnamed: 0']==1629, 'empty text with url', train['cleaned_text'])

        uitkomsten = []
        # Original author's note: adjust to the length of the fold.
        for i in range(train.shape[0]):
            true_obs = train.iloc[i, :]
            key = true_obs['Unnamed: 0']
            if hybrid and key == 1629:
                # Row 1629 is not explained for hybrid models.
                continue
            uitkomsten.append(_explain_observation(model, true_obs, save_dict[key], hybrid))

        fidelity = np.mean([el[1] for el in uitkomsten])
        var_importances = [dict(el[0]) for el in uitkomsten]
        res = []
        for el in var_importances:
            res.append({key: abs(val) for key, val in el.items()})
        # Counter addition sums the absolute coefficients per feature over all
        # observations; keys end up in first-seen order, not sorted by total.
        result = dict(functools.reduce(operator.add, map(collections.Counter, res)))
        final_stability_metrics.append([fold, model_str, result, fidelity])

In [None]:
# Keep the runs whose second element contains 'Hybrid', then split them per
# architecture tag. ``hyrbid_lstm_metrics`` keeps its original (misspelled) name
# because later cells refer to it under that name.
hybrids = [entry for entry in final_stability_metrics if 'Hybrid' in entry[1]]

hybrid_cnn_metrics, hyrbid_lstm_metrics, hybrid_gru_metrics = (
    [entry for entry in hybrids if tag in entry[1]]
    for tag in ('CNN', 'LSTM', 'GRU')
)

In [None]:
def jaccard_spearman(top_N, used_set):
    """Mean pairwise Jaccard overlap and Spearman rank correlation of top-N features.

    Parameters
    ----------
    top_N : int
        Number of highest-ranked features to compare per entry.
    used_set : sequence
        Entries whose element ``[2]`` is a dict mapping feature name to an
        aggregated importance.

    Returns
    -------
    tuple
        ``(mean_jaccard, mean_spearman)`` over every ordered pair of entries,
        including each entry paired with itself.
    """
    # Rank by aggregated importance (largest first). The dict itself is not ordered by
    # value, so slicing its keys directly would not give the top features.
    top = [
        [feat for feat, _ in sorted(metric[2].items(), key=lambda kv: kv[1], reverse=True)][0:top_N]
        for metric in used_set
    ]
    jaccard = []
    spearman = []
    for lst in top:
        st = set(lst)
        for other_lst in top:
            other_st = set(other_lst)
            jaccard.append(len(st.intersection(other_st))/len(st.union(other_st)))
            # Give both rankings the same items by appending what only the other one
            # has. The appended items keep the other list's order, so the result does
            # not depend on set iteration order.
            new_lst = lst + [feat for feat in other_lst if feat not in st]
            new_other = other_lst + [feat for feat in lst if feat not in other_st]
            rank_in_lst = {feat: pos for pos, feat in enumerate(new_lst)}
            rank_in_other = {feat: pos for pos, feat in enumerate(new_other)}
            n_items = len(new_other)
            if n_items < 2:
                # A single shared item is trivially ranked identically; the formula
                # below would divide by zero.
                spearman.append(1.0)
                continue
            summed_squared_differences = sum(
                (rank_in_other[feat] - rank_in_lst[feat]) ** 2 for feat in new_other
            )
            spearman.append(1-((6*summed_squared_differences)/((n_items**3)-n_items)))
    result_one = np.mean(jaccard)
    result_two = np.mean(spearman)
    return(result_one, result_two)

def jaccard(top_N, used_set):
    """Mean pairwise Jaccard overlap of the top-N features of each entry.

    Parameters
    ----------
    top_N : int
        Number of highest-ranked features to compare per entry.
    used_set : sequence
        Entries whose element ``[2]`` is a dict mapping feature name to an
        aggregated importance.

    Returns
    -------
    float
        Mean of |A & B| / |A | B| over every ordered pair of entries. Each entry is
        also paired with itself, and those pairs always score 1.0.
    """
    # Rank by aggregated importance (largest first). The dict itself is not ordered by
    # value, so slicing its keys directly would not give the top features.
    top = [
        set(sorted(metric[2], key=metric[2].get, reverse=True)[0:top_N])
        for metric in used_set
    ]
    # NOTE(review): the self-pairs raise the mean; confirm whether they should count.
    overlaps = [
        len(st.intersection(other_st))/len(st.union(other_st))
        for st in top
        for other_st in top
    ]
    result_one = np.mean(overlaps)
    return(result_one)

def return_scores(used_set):
    """Summarise one group of runs.

    Returns a 4-tuple: the mean of element ``[3]`` of every entry, followed by the
    ``jaccard`` score of the group for the top 20, top 10 and top 5 features.
    """
    fidelities = [metric[3] for metric in used_set]
    avg_fidelity = np.mean(fidelities)
    overlaps = [jaccard(depth, used_set) for depth in (20, 10, 5)]
    return (avg_fidelity, *overlaps)

# Score the three architectures in the same order as before: CNN, LSTM, GRU.
hybrid_cnn_scores, hyrbid_lstm_scores, hybrid_gru_scores = map(
    return_scores,
    (hybrid_cnn_metrics, hyrbid_lstm_metrics, hybrid_gru_metrics),
)

In [None]:
# One row per architecture, labelled '<architecture>_<dataset tag>'.
final_scores = pd.DataFrame(
    [hybrid_cnn_scores, hyrbid_lstm_scores, hybrid_gru_scores],
    index=[s + '_' + gebruikte_dataset for s in ['HybridCNN', 'HybridLSTM', 'HybridGRU']],
    columns=['avg_fidelity', 'jaccard20', 'jaccard10', 'jaccard5'],
)

In [None]:
# Show the score table (rows: architecture + dataset tag; columns: fidelity and Jaccard scores).
final_scores

Unnamed: 0,avg_fidelity,jaccard20,jaccard10,jaccard5
HybridCNN_covid,0.056503,0.26654,0.179267,0.165794
HybridLSTM_covid,0.049232,0.323502,0.20216,0.169444
HybridGRU_covid,0.044755,0.31973,0.207053,0.180238


In [None]:
# Write the score table to the working directory; the dataset tag sits between the
# fixed prefix and the extension.
save_name = ''.join(['FinalResults_DL', gebruikte_dataset, '.csv'])
final_scores.to_csv(path_or_buf=save_name)