In [2]:
import sys
sys.path.append("..")
import configparser
from data_providers import TextDataProvider
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from globals import ROOT_DIR
import os
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
import time
from utils import prepare_output_file
LABEL_MAPPING = {0: 'hateful', 1: 'abusive', 2: 'normal', 3: 'spam'}

In [3]:
config = configparser.ConfigParser()
config.read('../config.ini')
path_data = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_DATA'])
path_labels = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_LABELS'])

In [4]:
def get_confusion_matrix(y_true, preds, output, type_key, round_param=4):
    """
        the count of true negatives is 00
        false negatives is 10
        true positives is  11
        false positives is 01
    """
    scores = f1_score(y_true, preds, average=None)

    labels = set(y_true)
    MCM = multilabel_confusion_matrix(y_true, preds, labels=[i for i in range(len(labels))])
    confusion_matrices = {}
    confusion_matrix_map = {'true_negative': (0,0),
                            'false_negative': (1,0),
                            'true_positive': (1,1),
                            'false_positive': (0,1)}
    for label in labels:
        for key, value in confusion_matrix_map.items():
            row, col = value
            output[key + '_' + LABEL_MAPPING[label]] = np.around(MCM[label][row][col]/len(preds), round_param)

In [5]:
def get_f_scores(y_true, preds, output, type_key, round_param=4):
    f_score = f1_score(y_true, preds, average='weighted')  
    precision = precision_score(y_true, preds, average='weighted')  
    recall = recall_score(y_true, preds, average='weighted')  
    
    
    print("F Score {:.2f}".format(f_score))
    output['{}_f_score'.format(type_key)] = np.around(f_score, round_param)
    output['{}_precision'.format(type_key)] = np.around(precision, round_param)
    output['{}_recall'.format(type_key)] = np.around(recall, round_param)

    f1_scores = f1_score(y_true, preds, average=None)
    precision_scores = precision_score(y_true, preds, average=None)
    recall_scores = recall_score(y_true, preds, average=None)
    
    for i in range(len(f1_scores)):
        output[type_key + '_f_score_' + LABEL_MAPPING[i]] = np.around(f1_scores[i], round_param)
        output[type_key + '_precision_' + LABEL_MAPPING[i]] = np.around(recall_scores[i], round_param)
        output[type_key + '_recall_' + LABEL_MAPPING[i]] = np.around(recall_scores[i], round_param)

In [6]:
def results(model, type_key, x, y_true, output, round_param=4):
    # get accuracy 
    acc = model.score(x, y_true)
    print("Accuracy {:.2f}".format(acc))
    output['{}_acc'.format(type_key)] = np.around(acc, round_param)
    
    # get f score metrics
    preds = model.predict(x)     
    get_f_scores(y_true, preds, output, type_key)
   # get_confusion_matrix(y_true, preds, output, type_key)
    

In [7]:
def populate_missing_params(output):
    """
    Fills data with fields we are ignored for LR 
    """
    missing_params_class = [] #train_loss_class_hateful
    missing_params_overall = ['loss']
    for type_key in ['train', 'valid', 'test']:
        for item in missing_params_class:
            for label in range(4):
                output['{}_{}_class_{}'.format(type_key, item, LABEL_MAPPING[label])] = '-'                
        for item in missing_params_overall:
            output['{}_{}'.format(type_key, item)] = '-'
    

In [8]:
def output_to_csv(output, file_action_key='a+', experiment_name='logistic_regression_cv_baseline'):
    """
    Output results to .csv
    """
    output['title'] = experiment_name
    output['epoch'] = '-'
    output['learning_rate'] = '-'
    
    results_dir = os.path.join(ROOT_DIR, 'results/{}'.format(experiment_name))
    if not os.path.isdir(os.path.join(results_dir)):
        print("Directory added")
        os.mkdir(results_dir)
    prepare_output_file(filename=os.path.join(results_dir, 'results.csv'), output=[output], file_action_key=file_action_key)
    

In [9]:
import gensim

def process_word_embeddings(data):
    data_copy = {}
    for key, value in data.items():
        if 'x' in key:
            data_copy[key] = gensim.matutils.unitvec(np.array(value).mean(axis=1)).astype(np.float32)
        else:
            data_copy[key] = value
    return data_copy

In [15]:
import torch
"""
Experiments 

"""
experiment_seeds = [26, 27, 28]
for i, seed in enumerate(experiment_seeds):
    print("=== Experiment with seed {} running ===".format(seed))
    data_copy, data_map = TextDataProvider(path_data, path_labels, 1).generate_word_level_embeddings('bert', seed)
    
#     data_copy, data_map = TextDataProvider(path_data, path_labels, 1).generate_tdidf_embeddings(seed)
    
#     ## 
    x_train = [data_map[key]['embedded_tweet'] for key in data_copy['x_train']]
    x_valid = [data_map[key]['embedded_tweet'] for key in data_copy['x_valid']]
    x_test = [data_map[key]['embedded_tweet'] for key in data_copy['x_test']]
#     ##
    
    x_train = torch.Tensor(x_train)
    x_train = x_train.view(x_train.shape[0],-1)
    
    x_valid = torch.Tensor(x_valid)
    x_valid = x_valid.view(x_valid.shape[0],-1)
    
    x_test = torch.Tensor(x_test)
    x_test = x_test.view(x_test.shape[0],-1)
        
    print("=== Model Started Training ===")
    start = time.time()
    model = LogisticRegression(random_state=seed, solver='lbfgs', multi_class='multinomial')
    model = model.fit(x_train, data_copy['y_train'])
    
    print("=== Model Completed Training ({:2f} min) ===".format((time.time() - start) / 60))
    
    output = {}
    output['seed'] = seed
    type_key = ['train', 'valid', 'test']
    populate_missing_params(output) #so that we can add to same sheet as Neural Nets 
    for i, (x, y) in enumerate([(x_train, data_copy['y_train']), (x_valid, data_copy['y_valid']), (x_test, data_copy['y_test'])]):
        results(model, type_key[i], x, y, output)
        print('\n')
    file_action_key = 'w' if i == 0 else 'a+' 
    print(output)
    output_to_csv(output, file_action_key, experiment_name='logistic_regression_baseline_bert')

=== Experiment with seed 26 running ===
=== Extracting tweets from JSON ===
[Sizes] Training set: 64.00%, Validation set: 16.00%, Test set: 20.00%
Downloading Bert, Processed 1 / 11
Downloading Bert, Processed 2 / 11
Downloading Bert, Processed 3 / 11
Downloading Bert, Processed 4 / 11
Downloading Bert, Processed 5 / 11
Downloading Bert, Processed 6 / 11
Downloading Bert, Processed 7 / 11
Downloading Bert, Processed 8 / 11
Downloading Bert, Processed 9 / 11
Downloading Bert, Processed 10 / 11
Downloading Bert, Processed 11 / 11
Word embeddings generated
=== Model Started Training ===




=== Model Completed Training (10.009916 min) ===
Accuracy 0.93
F Score 0.93


Accuracy 0.72
F Score 0.72


Accuracy 0.71
F Score 0.71


{'seed': 26, 'train_loss': '-', 'valid_loss': '-', 'test_loss': '-', 'train_acc': 0.9312, 'train_f_score': 0.9308, 'train_precision': 0.9307, 'train_recall': 0.9312, 'train_f_score_hateful': 0.8763, 'train_precision_hateful': 0.8307, 'train_recall_hateful': 0.8307, 'train_f_score_abusive': 0.9508, 'train_precision_abusive': 0.9537, 'train_recall_abusive': 0.9537, 'train_f_score_normal': 0.9506, 'train_precision_normal': 0.9559, 'train_recall_normal': 0.9559, 'train_f_score_spam': 0.8269, 'train_precision_spam': 0.8155, 'train_recall_spam': 0.8155, 'valid_acc': 0.7206, 'valid_f_score': 0.7178, 'valid_precision': 0.7156, 'valid_recall': 0.7206, 'valid_f_score_hateful': 0.2245, 'valid_precision_hateful': 0.196, 'valid_recall_hateful': 0.196, 'valid_f_score_abusive': 0.7526, 'valid_precision_abusive': 0.7467, 'valid_recall_abusive': 0.7467, 'valid_f_score_

KeyboardInterrupt: 

In [None]:
x_train[0]

In [None]:
output_to_csv(output, file_action_key, experiment_name='lo')