In [1]:
import copy
import json
import pickle
import random
from os import path, makedirs

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

In [3]:
# Set random seeds: Python's own RNG plus NumPy's global RNG, which
# scikit-learn estimators fall back on when no random_state is given.
random.seed(1337)
np.random.seed(1337)

In [4]:
def load_data(data_path='../data/', is_clean=0, is_os=0):
    """Load the pickled full data set and its train/val/test splits.

    Parameters
    ----------
    data_path : str
        Prefix of the directory holding the ``.pkl`` files. It is joined to
        the file name by plain concatenation, so it must end with a path
        separator.
    is_clean : truthy
        When true, the ``_clean`` variants of the files are loaded.
    is_os : truthy
        When true, the ``_os`` variants of the files are loaded.

    Returns
    -------
    dict
        Maps every suffixed name (e.g. ``'X_train_clean'``) to the object
        unpickled from ``<data_path><name>.pkl``.

    Raises
    ------
    OSError
        If one of the expected pickle files cannot be opened.
    """
    suffix = ('_clean' if is_clean else '') + ('_os' if is_os else '')
    base_names = [
        'data', 'X_train', 'X_val', 'X_train_val', 'X_test',
        'y_train', 'y_val', 'y_train_val', 'y_test'
    ]

    data_sets = {}
    for base_name in base_names:
        name = base_name + suffix
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        # The context manager closes the handle even if loading fails.
        with open(data_path + '{}.pkl'.format(name), 'rb') as handle:
            data_sets[name] = pickle.load(handle)

    return data_sets

In [5]:
# Flags selecting which pickled variant to work with. The key suffixes are
# derived from the flags so the two can never get out of sync.
is_clean, is_os = 1, 0
clean = '_clean' if is_clean else ''
os = '_os' if is_os else ''

In [6]:
# Use the flags configured above rather than repeating their literal values
data_sets = load_data('../data/', is_clean=is_clean, is_os=is_os)

In [7]:
# Label columns; one binary classification problem is solved per entry
target_cols = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [8]:
# Preview the first ten rows of the full data frame
data_sets['data{}{}'.format(clean, os)].head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation edit username hardcore metallica f...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww match background colour -pron- seemingly...,0,0,0,0,0,0
2,000113f07ec002fd,hey man -pron- not try edit war -pron- guy con...,0,0,0,0,0,0
3,0001b41b1c6bb37e,not real suggestion improvement wonder section...,0,0,0,0,0,0
4,0001d958c54c6e35,sir hero chance remember page,0,0,0,0,0,0
5,00025465d4725e87,congratulations good use tool good · talk,0,0,0,0,0,0
6,0002bcb3da6cb337,cocksucker piss work,1,1,1,0,1,0
7,00031b1e95af7921,vandalism matt shirvington article revert not ban,0,0,0,0,0,0
8,00037261f536c51d,sorry word nonsense offensive -pron- not inten...,0,0,0,0,0,0
9,00040093b2687caa,alignment subject contrary dulithgow,0,0,0,0,0,0


In [9]:
# Preview the training features
data_sets['X_train{}{}'.format(clean, os)].head()

Unnamed: 0,id,comment_text
101689,202d64bd4c2bd9a4,champion hurdle delighted use 1 2_3 image uplo...
26429,4601831cd03ec679,edit war currently appear engage edit war note...
62499,a73c6f5b69d963be,-pron- wrong bud ready use taran boi verdict n...
10145,1ad558921b36c611,attempt change las vegas season las vegas augu...
16254,2adc0c27c1165eef,people arab homosexual bad thing d**n f***in h...


In [10]:
# Print the share of positive labels (value 1) for each target in every label split
label_sets = [name for name in data_sets if name.startswith('y_')]
for col in target_cols:
    print('{}:'.format(col))
    for data in label_sets:
        value_counts = data_sets[data][col].value_counts()
        # .get() keeps a split without a single positive example from raising KeyError
        positive_share = 100 * value_counts.get(1, 0) / value_counts.sum()
        print('{}: {:.3f}%\t'.format(data, positive_share), end='')
    print('\n')

toxic:
y_train_clean: 9.528%	y_val_clean: 9.551%	y_train_val_clean: 9.533%	y_test_clean: 9.789%	

severe_toxic:
y_train_clean: 1.005%	y_val_clean: 0.993%	y_train_val_clean: 1.002%	y_test_clean: 0.990%	

obscene:
y_train_clean: 5.321%	y_val_clean: 5.274%	y_train_val_clean: 5.309%	y_test_clean: 5.239%	

threat:
y_train_clean: 0.287%	y_val_clean: 0.295%	y_train_val_clean: 0.289%	y_test_clean: 0.342%	

insult:
y_train_clean: 4.960%	y_val_clean: 4.860%	y_train_val_clean: 4.935%	y_test_clean: 4.941%	

identity_hate:
y_train_clean: 0.902%	y_val_clean: 0.833%	y_train_val_clean: 0.885%	y_test_clean: 0.862%	



In [11]:
num_feats = 1000
n_grams = 2
# scikit-learn vectorizers take ngram_range as a (min_n, max_n) tuple:
# use every n-gram size from unigrams up to n_grams.
ngram_range = (1, n_grams)

In [12]:
def load_ngrams(data, num_feats, ngram_range, pickle_path='../pickle_objects/', is_clean=1, is_os=0):
    """Load cached count / TF-IDF vectorizers, fitting and caching them if absent.

    Parameters
    ----------
    data : mapping with a ``'comment_text'`` entry
        Text used to fit a vectorizer when no cached pickle exists.
    num_feats : int
        Passed to the vectorizers as ``max_features``; also part of the cache file name.
    ngram_range : sequence of int
        N-gram sizes to use. Only its smallest and largest values matter:
        they are handed to the vectorizers as the ``(min_n, max_n)`` tuple,
        and the largest is part of the cache file name.
    pickle_path : str
        Directory prefix for the cached vectorizers. It is concatenated with
        the file name, so it must end with a path separator. Created if missing.
    is_clean, is_os : truthy
        Select the ``_clean`` / ``_os`` suffix of the cache file name.

    Returns
    -------
    dict
        ``{'countvec': CountVectorizer, 'tfidf': TfidfVectorizer}``, either
        unpickled from the cache or freshly fitted on ``data['comment_text']``.
    """
    suffix = ('_clean' if is_clean else '') + ('_os' if is_os else '')
    min_n, max_n = min(ngram_range), max(ngram_range)
    vec_params = {
        'analyzer': 'word',
        'lowercase': True,
        'max_features': num_feats,
        'ngram_range': (min_n, max_n),
    }

    ngrams = {}
    for vec in ['countvec', 'tfidf']:
        # The cache key comes from the arguments, not from notebook globals
        file_name = '{}{}_{}_ngrams_{}{}.pkl'.format(pickle_path, vec, num_feats, max_n, suffix)
        if path.isfile(file_name):
            # NOTE: unpickling can execute arbitrary code; only load trusted files.
            with open(file_name, 'rb') as handle:
                ngrams[vec] = pickle.load(handle)
        else:
            # Fit the vectorizer, then cache it for later runs
            ngrams_vec = CountVectorizer(**vec_params) if vec == 'countvec' else TfidfVectorizer(**vec_params)
            ngrams_vec.fit(data['comment_text'])
            ngrams[vec] = ngrams_vec
            if not path.exists(pickle_path):
                makedirs(pickle_path)
            with open(file_name, 'wb') as handle:
                pickle.dump(ngrams_vec, handle)
    return ngrams

In [13]:
# Vectorizers are obtained for the training split only
ngrams = load_ngrams(
    data_sets['X_train' + clean + os],
    num_feats,
    ngram_range,
    '../pickle_objects/',
    is_clean,
    is_os,
)

In [14]:
def transform_to_ngrams(data_set, data_cols, ngrams):
    """Add count and TF-IDF n-gram matrices for the given text data sets.

    Parameters
    ----------
    data_set : dict
        Maps names to objects exposing a ``'comment_text'`` entry. It is
        updated in place.
    data_cols : iterable of str
        Keys of ``data_set`` whose comments should be vectorized.
    ngrams : dict
        Fitted vectorizers under the keys ``'countvec'`` and ``'tfidf'``.

    Returns
    -------
    dict
        The same ``data_set`` object, with the extra keys
        ``'<name>_countvec'`` and ``'<name>_tfidf'`` for every name in
        ``data_cols``.
    """
    for data in data_cols:
        comments = data_set[data]['comment_text']
        for vec in ['countvec', 'tfidf']:
            # Operate on the argument, not on a notebook-level global
            data_set[data + '_' + vec] = ngrams[vec].transform(comments)
    return data_set

In [15]:
# Vectorize every feature split with both vectorizers
feature_splits = [
    'X_train' + clean + os,
    'X_val' + clean + os,
    'X_train_val' + clean + os,
    'X_test' + clean + os,
]
data_sets = transform_to_ngrams(data_sets, feature_splits, ngrams)

In [16]:
def normalize_data(X_train, X_test):
    """Standardize both matrices using statistics learned from ``X_train`` only.

    Returns the tuple ``(scaled X_train, scaled X_test)``.

    NOTE(review): StandardScaler is used with its defaults here; scikit-learn
    rejects sparse input unless ``with_mean=False`` is passed. The n-gram
    matrices built in this notebook are presumably sparse, so confirm the
    input type before calling.
    """
    scaler = StandardScaler()
    scaler.fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)

In [17]:
def fit_model(base_model, X, y, params, target_cols, scoring='roc_auc', cv=None):
    """Fit or tune one binary model per target column.

    Parameters
    ----------
    base_model : estimator instance or estimator class
        Template model. It is never fitted itself.
    X : array-like or sparse matrix
        Feature matrix shared by all targets.
    y : mapping / DataFrame
        ``y[target]`` must give the labels for ``target``.
    params : dict
        With ``cv``: the parameter grid handed to ``GridSearchCV``.
        Without ``cv``: keyword parameters applied to the model before fitting.
    target_cols : iterable of str
        Targets to model.
    scoring : str
        Scoring name passed to ``GridSearchCV`` (only used with ``cv``).
    cv : cross-validation spec or None
        When truthy, a grid search is run; otherwise a single model is fitted.

    Returns
    -------
    dict
        With ``cv``: ``{target: {'params': best_params, 'score': best_score}}``
        where the score is rounded to 4 decimals. The search runs with
        ``refit=False``, so no fitted estimator exists to return.
        Without ``cv``: ``{target: fitted estimator}``.
    """
    models = {}
    for target in target_cols:
        if cv:
            search = GridSearchCV(base_model, params, cv=cv, scoring=scoring, n_jobs=-1, refit=False)
            search.fit(X, y[target])
            models[target] = {
                'params': search.best_params_,
                'score': np.round(search.best_score_, 4),
            }
        else:
            if isinstance(base_model, type):
                # A class was passed: build a fresh instance from the parameters
                estimator = base_model(**params)
            else:
                # An instance was passed: copy it so every target gets its own
                # model and the caller's template stays unfitted
                estimator = copy.deepcopy(base_model)
                estimator.set_params(**params)
            estimator.fit(X, y[target])
            models[target] = estimator
    return models

In [23]:
def fit_all_models(data_sets, data_cols, model_list, param_grids, target_cols, cv=None):
    """Run ``fit_model`` for every (features, labels) pair and every model.

    Parameters
    ----------
    data_sets : dict
        Holds the feature matrices and label frames by name.
    data_cols : iterable of (str, str)
        Pairs ``(X_key, y_key)`` naming entries of ``data_sets``.
    model_list : dict
        Maps a short model name to its base estimator.
    param_grids : dict
        Maps the same model names to their parameter grids.
    target_cols : iterable of str
        Targets forwarded to ``fit_model``.
    cv : cross-validation spec or None
        Forwarded to ``fit_model``.

    Returns
    -------
    dict
        ``{X_key: {model_name: result of fit_model}}``. Each result is also
        printed as JSON right after it is computed.
    """
    def _jsonable(value):
        # Only reached for objects json cannot encode itself: NumPy scalars
        # become plain Python numbers, anything else is shown through str().
        return value.item() if isinstance(value, np.generic) else str(value)

    best_models = {}
    for (X, y) in data_cols:
        best_models[X] = {}
        for model in model_list:
            best_models[X][model] = fit_model(model_list[model], data_sets[X],
                                              data_sets[y], param_grids[model],
                                              target_cols, scoring='roc_auc', cv=cv)
            print('Model = {}'.format(model))
            print(json.dumps(best_models[X][model], indent=4, default=_jsonable))
    return best_models

In [19]:
def predict_labels_and_probas(model, X):
    """Return ``model.predict_proba(X)`` as a squeezed dense ndarray.

    Despite the name, only probabilities are returned (no labels).

    Parameters
    ----------
    model : object with a ``predict_proba`` method
    X : input accepted by ``model.predict_proba``

    Returns
    -------
    numpy.ndarray
        The predicted probabilities with length-1 axes removed. Results that
        expose ``todense()`` (e.g. sparse matrices) are densified first;
        plain arrays are used as they are.
    """
    probabilities = model.predict_proba(X)
    if hasattr(probabilities, 'todense'):
        probabilities = probabilities.todense()
    return np.squeeze(np.asarray(probabilities))

In [20]:
# Base estimators, keyed by the short names also used in param_grids.
# NOTE(review): GaussianNB presumably needs dense input, while the n-gram
# matrices in this notebook are likely sparse; confirm before tuning 'gnb'.
model_list = {
    'bnb': BernoulliNB(),
    'gnb': GaussianNB(),
    # The l1 penalty needs a solver that supports it; liblinear does.
    'lrl1': LogisticRegression(penalty='l1', solver='liblinear'),
    'lrl2': LogisticRegression(penalty='l2'),
    'rf': RandomForestClassifier(),
    'xgb': XGBClassifier(),
    'svm': SVC(kernel='linear')
}
    
# Inverse regularisation strengths shared by both logistic regressions:
# 1, 1/4, 1/7, 1/10 followed by 10^1 ... 10^6. The arange must be float:
# np.reciprocal on an integer array truncates to [1, 0, 0, 0], which would
# put the invalid value C=0 into the grid.
lr_c_grid = np.concatenate((
    np.reciprocal(np.arange(1, 13, 3, dtype=float)),
    np.logspace(1, 6, num=6, endpoint=True, base=10),
))

# Parameter grids, keyed like model_list
param_grids = {
    'bnb': {},
    'gnb': {},
    'lrl1': {'C': lr_c_grid},
    'lrl2': {'C': lr_c_grid},
    'rf': {
        'n_estimators': np.arange(50, 250, 50),
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself has been
        # removed from recent scikit-learn releases.
        'max_features': ['sqrt', 'log2'],
        'max_depth': np.arange(3, 13, 2)
    },
    'xgb': {'n_estimators': [1]},
    'svm': {'C': np.concatenate((np.arange(1, 13, 3), np.logspace(1, 6, num=6, endpoint=True, base=10)))},
}

In [21]:
# Build one fixed train/validation split for cross-validation.
# In test_fold, -1 marks a training row and 0 marks a validation row.
# NOTE(review): this assumes the combined train_val data lists the training
# rows first, followed by the validation rows - confirm against how the
# pickles were created.
n_train_rows = len(data_sets['X_train' + clean + os])
n_val_rows = len(data_sets['X_val' + clean + os])
val_fold = [-1] * n_train_rows + [0] * n_val_rows
predefined_split = PredefinedSplit(test_fold=val_fold)

In [24]:
# Test runs
# NOTE: this rebinds model_list and param_grids, replacing the full
# definitions from the earlier cell with a single small grid.
model_list = {
    # The l1 penalty needs a solver that supports it; liblinear does.
    'lrl1': LogisticRegression(penalty='l1', solver='liblinear')
}

param_grids = {
    'lrl1': {'C': [1e-1, 1e-2]}
}

best_models = fit_all_models(
    data_sets,
    [('X_train_val'+clean+os+'_countvec', 'y_train_val'+clean+os)],
    model_list, param_grids, target_cols, cv=predefined_split
)

Model = lrl1
{
    "toxic": {
        "params": {
            "C": 0.1
        },
        "score": 0.8888
    },
    "severe_toxic": {
        "params": {
            "C": 0.1
        },
        "score": 0.9168
    },
    "obscene": {
        "params": {
            "C": 0.01
        },
        "score": 0.9192
    },
    "threat": {
        "params": {
            "C": 0.1
        },
        "score": 0.8457
    },
    "insult": {
        "params": {
            "C": 0.1
        },
        "score": 0.8948
    },
    "identity_hate": {
        "params": {
            "C": 0.1
        },
        "score": 0.8689
    }
}


In [None]:
# NOTE(review): BinaryRelevance is not imported by any cell shown in this
# notebook (presumably skmultilearn.problem_transform.BinaryRelevance).
# Confirm and add it to the import cell before running this cell.

# Take the vectorized matrices and the labels from data_sets so this cell
# does not depend on variables left over from an earlier kernel session.
X_train_val_countvec = data_sets['X_train_val'+clean+os+'_countvec']
X_train_val_tfidf = data_sets['X_train_val'+clean+os+'_tfidf']
X_test_countvec = data_sets['X_test'+clean+os+'_countvec']
X_test_tfidf = data_sets['X_test'+clean+os+'_tfidf']
# Restrict the labels to target_cols so that output column i always
# corresponds to target_cols[i].
y_train_val = data_sets['y_train_val'+clean+os][target_cols]

# Fit individual BernoulliNB models for each target with Count Vectorizer
bnb_countvec = BinaryRelevance(BernoulliNB())
bnb_countvec.fit(X_train_val_countvec, y_train_val)
predictions_bnb_countvec = bnb_countvec.predict_proba(X_test_countvec)

# Fit individual GaussianNB models for each target with Count Vectorizer
gnb_countvec = BinaryRelevance(GaussianNB())
gnb_countvec.fit(X_train_val_countvec, y_train_val)
predictions_gnb_countvec = gnb_countvec.predict_proba(X_test_countvec)

# Fit individual GaussianNB models for each target with TF-IDF Vectorizer
gnb_tfidf = BinaryRelevance(GaussianNB())
gnb_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_gnb_tfidf = gnb_tfidf.predict_proba(X_test_tfidf)

# Fit individual Logistic Regression+l1 models for each target with TF-IDF Vectorizer
# NOTE(review): penalty='l1' without an explicit solver only works where the
# default solver supports l1 - confirm for the installed scikit-learn.
lr1_tfidf = BinaryRelevance(LogisticRegression(penalty='l1'))
lr1_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_lr1_tfidf = lr1_tfidf.predict_proba(X_test_tfidf)

# Fit individual Logistic Regression+l2 models for each target with TF-IDF Vectorizer
lr2_tfidf = BinaryRelevance(LogisticRegression(penalty='l2'))
lr2_tfidf.fit(X_train_val_tfidf, y_train_val)
predictions_lr2_tfidf = lr2_tfidf.predict_proba(X_test_tfidf)

# Fit a single multi-output RandomForestClassifier on all targets with Count Vectorizer
rf_countvec = RandomForestClassifier(n_estimators=100)
rf_countvec.fit(X_train_val_countvec, y_train_val)
probabilities_rf_countvec = rf_countvec.predict_proba(X_test_countvec)

# Fit a single multi-output RandomForestClassifier on all targets with TF-IDF Vectorizer
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train_val_tfidf, y_train_val)
probabilities_rf_tfidf = rf_tfidf.predict_proba(X_test_tfidf)

# # Fit XGBClassifier for each target with Count Vectorizer
# xgb_countvec = BinaryRelevance(XGBClassifier(n_estimators=1))
# xgb_countvec.fit(X_train_val_countvec, y_train_val)
# predictions_xgb_countvec = xgb_countvec.predict_proba(X_test_countvec)

# # Fit XGBClassifier for each target with TF-IDF Vectorizer
# xgb_tfidf = BinaryRelevance(XGBClassifier(n_estimators=1))
# xgb_tfidf.fit(X_train_val_tfidf, y_train_val)
# predictions_xgb_tfidf = xgb_tfidf.predict_proba(X_test_tfidf)


# # Fit individual linear-kernel SVC models for each target with scaled TF-IDF features
# scaler = preprocessing.StandardScaler().fit(X_train_val_tfidf)
# X_train_val_tfidf = scaler.transform(X_train_val_tfidf)
# X_test_tfidf = scaler.transform(X_test_tfidf)
# svc_tfidf = BinaryRelevance(SVC(kernel='linear'))
# svc_tfidf.fit(X_train_val_tfidf, y_train_val)
# predictions_svc_tfidf = svc_tfidf.predict_proba(X_test_tfidf)

In [None]:
def _dense_probabilities(predictions):
    """Turn a prediction matrix exposing ``todense()`` into a squeezed ndarray."""
    return np.squeeze(np.asarray(predictions.todense()))


# BernoulliNB on Count Vectorizer features
probabilities_bnb_countvec = _dense_probabilities(predictions_bnb_countvec)

# GaussianNB on Count Vectorizer and on TF-IDF Vectorizer features
probabilities_gnb_countvec = _dense_probabilities(predictions_gnb_countvec)
probabilities_gnb_tfidf = _dense_probabilities(predictions_gnb_tfidf)

# LogisticRegression+l1 on TF-IDF Vectorizer features
probabilities_lr1_tfidf = _dense_probabilities(predictions_lr1_tfidf)

# LogisticRegression+l2 on TF-IDF Vectorizer features
probabilities_lr2_tfidf = _dense_probabilities(predictions_lr2_tfidf)

# # SVC (linear kernel) on TF-IDF Vectorizer features
# probabilities_svc_tfidf = np.squeeze(np.asarray(predictions_svc_tfidf.todense()))

In [None]:
def plot_roc_curves(y_true, probabilities, target_cols, title):
    """Plot one ROC curve per target and return the per-target AUC values.

    Parameters
    ----------
    y_true : mapping / DataFrame
        ``y_true[col]`` must give the true labels of target ``col``.
    probabilities : 2-D array
        Column ``i`` holds the scores for ``target_cols[i]``.
    target_cols : sequence of str
        Target names, in the column order of ``probabilities``.
    title : str
        Title of the figure.

    Returns
    -------
    list of float
        AUC for each target, in the order of ``target_cols``.
    """
    auc_values = []
    plt.figure(figsize=(10,8))
    for i, col in enumerate(target_cols):
        fpr, tpr, _ = roc_curve(y_true[col], probabilities[:, i])
        auc_value = auc(fpr, tpr)
        auc_values.append(auc_value)
        plt.plot(fpr, tpr, label='{}: {:0.5f}'.format('auc_'+col, auc_value))
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.title(title)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.legend(bbox_to_anchor=(1.01, 1), loc=2, borderaxespad=0.)
    plt.show()
    return auc_values


def positive_class_columns(per_target_probabilities):
    """Stack column 1 of each per-target array into one 2-D array (one column per target)."""
    return np.column_stack([proba[:, 1] for proba in per_target_probabilities])


# True test labels, taken from data_sets so the cell does not depend on a
# variable left over from an earlier kernel session
y_test = data_sets['y_test'+clean+os]

# Compute ROC-AUC values of each class for Count Vectorizer (BernoulliNB)
auc_bnb_countvec = plot_roc_curves(y_test, probabilities_bnb_countvec, target_cols,
                                   'ROC Curve for BernoulliNB with Count Vectorizer')

# Compute ROC-AUC values of each class for Count Vectorizer (GaussianNB)
auc_gnb_countvec = plot_roc_curves(y_test, probabilities_gnb_countvec, target_cols,
                                   'ROC Curve for GaussianNB with Count Vectorizer')

# Compute ROC-AUC values of each class for TfIdf Vectorizer (GaussianNB)
auc_gnb_tfidf = plot_roc_curves(y_test, probabilities_gnb_tfidf, target_cols,
                                'ROC Curve for GaussianNB with TfIdf Vectorizer')

# Compute ROC-AUC values of each class for TfIdf Vectorizer (LogisticRegression+l1)
auc_lr1_tfidf = plot_roc_curves(y_test, probabilities_lr1_tfidf, target_cols,
                                'ROC Curve for LogisticRegression+l1 with TfIdf Vectorizer')

# Compute ROC-AUC values of each class for TfIdf Vectorizer (LogisticRegression+l2)
auc_lr2_tfidf = plot_roc_curves(y_test, probabilities_lr2_tfidf, target_cols,
                                'ROC Curve for LogisticRegression+l2 with TfIdf Vectorizer')

# Compute ROC-AUC values of each class for Count Vectorizer (RandomForestClassifier).
# The random forest results are indexed per target, with the positive class in column 1.
auc_rf_countvec = plot_roc_curves(y_test, positive_class_columns(probabilities_rf_countvec), target_cols,
                                  'ROC Curve for Random Forest with Count Vectorizer')

# Compute ROC-AUC values of each class for TfIdf Vectorizer (RandomForestClassifier)
auc_rf_tfidf = plot_roc_curves(y_test, positive_class_columns(probabilities_rf_tfidf), target_cols,
                               'ROC Curve for Random Forest with TfIdf Vectorizer')

# # Compute ROC-AUC values of each class for Count Vectorizer (XGBClassifier)
# auc_xgb_countvec = plot_roc_curves(y_test, probabilities_xgb_countvec, target_cols,
#                                    'ROC Curve for XGBoost with Count Vectorizer')

# # Compute ROC-AUC values of each class for TfIdf Vectorizer (XGBClassifier)
# auc_xgb_tfidf = plot_roc_curves(y_test, probabilities_xgb_tfidf, target_cols,
#                                 'ROC Curve for XGBoost with TfIdf Vectorizer')

# # Compute ROC-AUC values of each class for TfIdf Vectorizer (SVC(linear kernel))
# auc_svc_tfidf = plot_roc_curves(y_test, probabilities_svc_tfidf, target_cols,
#                                 'ROC Curve for SVC with TfIdf Vectorizer')

In [None]:
# Report the ROC-AUC averaged over all target columns for each model
mean_auc_reports = [
    ('ROC-AUC for BernoulliNB with Count Vectorizer = {:.4f}', auc_bnb_countvec),
    ('ROC-AUC for GaussianNB with TfIdf Vectorizer = {:.4f}', auc_gnb_tfidf),
    ('ROC-AUC for GaussianNB with Count Vectorizer = {:.4f}', auc_gnb_countvec),
    ('ROC-AUC for RandomForest with Count Vectorizer = {:.4f}', auc_rf_countvec),
    ('ROC-AUC for RandomForest with TfIdf Vectorizer = {:.4f}', auc_rf_tfidf),
    ('ROC-AUC for LogisticRegression+l1 with TfIdf Vectorizer = {:.4f}', auc_lr1_tfidf),
    ('ROC-AUC for LogisticRegression+l2 with TfIdf Vectorizer = {:.4f}', auc_lr2_tfidf),
    # ('ROC-AUC for XGBoost with Count Vectorizer = {:.4f}', auc_xgb_countvec),
    # ('ROC-AUC for XGBoost with TfIdf Vectorizer = {:.4f}', auc_xgb_tfidf),
    # ('ROC-AUC for SVC with TfIdf Vectorizer = {:.4f}', auc_svc_tfidf),
]
for template, per_target_aucs in mean_auc_reports:
    print(template.format(np.mean(per_target_aucs)))

In [None]:
# Summary table: one row per model, one column per target, plus the row mean
auc_by_model = [
    ('bnb_countvec', auc_bnb_countvec),
    ('gnb_tfidfvec', auc_gnb_tfidf),
    ('gnb_countvec', auc_gnb_countvec),
    ('rf_countvec', auc_rf_countvec),
    ('rf_tfidfvec', auc_rf_tfidf),
    ('lr1_tfidfvec', auc_lr1_tfidf),
    ('lr2_tfidfvec', auc_lr2_tfidf),
    # ('xgb_countvec', auc_xgb_countvec),
    # ('xgb_tfidfvec', auc_xgb_tfidf),
    # ('svc_tfidfvec', auc_svc_tfidf),
]

aucs = pd.DataFrame(
    [values for _, values in auc_by_model],
    index=[name for name, _ in auc_by_model],
    columns=target_cols,
)
aucs['mean'] = aucs[target_cols].mean(axis=1)
aucs