In [1]:
import sys
sys.path.append("..")
import configparser
from data_providers import TextDataProvider
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from globals import ROOT_DIR
import os
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, multilabel_confusion_matrix
import time
from utils import prepare_output_file
# Integer class id -> class name. The names are embedded in the metric keys
# that the helper functions below write into the output dict.
LABEL_MAPPING = {0: 'hateful', 1: 'abusive', 2: 'normal', 3: 'spam'}

In [2]:
# Load project settings and build the data / label paths by joining the
# configured values onto ROOT_DIR.
config_file = '../config.ini'
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the config is missing.
if not config.read(config_file):
    raise FileNotFoundError(
        "Could not read config file: {}".format(os.path.abspath(config_file)))
path_data = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_DATA'])
path_labels = os.path.join(ROOT_DIR, config['DEFAULT']['PATH_LABELS'])

In [3]:
def get_confusion_matrix(y_true, preds, output, type_key, round_param=4):
    """
    Store per-class confusion-matrix rates in ``output``.

    For every class present in ``y_true`` a one-vs-rest 2x2 matrix is computed
    with ``multilabel_confusion_matrix``. Each cell is divided by ``len(preds)``
    and written to ``output`` under ``'<cell name>_<class name>'``.

    Cell positions (row, col) inside each 2x2 matrix:
        true negatives  (0, 0)
        false negatives (1, 0)
        true positives  (1, 1)
        false positives (0, 1)

    Args:
        y_true: ground-truth class ids (keys of LABEL_MAPPING).
        preds: predicted class ids.
        output: dict that is updated in place.
        type_key: kept for signature parity with get_f_scores; it is not part
            of the generated key names.
        round_param: number of decimals kept for each stored rate.

    NOTE(review): because ``type_key`` is not part of the key names, calling
    this for several splits with the same ``output`` dict overwrites the
    values of the previous split.
    """
    # Pass the labels explicitly (sorted) so that position k of the result is
    # known to belong to labels[k]; this also works when the ids present in
    # y_true are not exactly 0..n-1 (e.g. a class is missing from this split).
    labels = sorted(set(y_true))
    MCM = multilabel_confusion_matrix(y_true, preds, labels=labels)

    confusion_matrix_map = {'true_negative': (0, 0),
                            'false_negative': (1, 0),
                            'true_positive': (1, 1),
                            'false_positive': (0, 1)}
    n_samples = len(preds)
    for position, label in enumerate(labels):
        class_name = LABEL_MAPPING[label]
        for key, (row, col) in confusion_matrix_map.items():
            rate = MCM[position][row][col] / n_samples
            output[key + '_' + class_name] = np.around(rate, round_param)

In [4]:
def get_f_scores(y_true, preds, output, type_key, round_param=4):
    """
    Compute F1 / precision / recall and store them in ``output``.

    Writes the weighted F1 under ``'<type_key>_f_score'`` (and prints it), then
    the per-class F1, precision and recall under
    ``'<type_key>_f_score_<class>'``, ``'<type_key>_precision_<class>'`` and
    ``'<type_key>_recall_<class>'``.

    Args:
        y_true: ground-truth class ids (keys of LABEL_MAPPING).
        preds: predicted class ids.
        output: dict that is updated in place.
        type_key: prefix for the generated keys (the caller passes
            'train' / 'valid' / 'test').
        round_param: number of decimals kept for each stored score.
    """
    f_score = f1_score(y_true, preds, average='weighted')
    print("F Score {:.2f}".format(f_score))
    output['{}_f_score'.format(type_key)] = np.around(f_score, round_param)

    # Sorted union of the labels that occur in either array. Passing it
    # explicitly pins the order of the per-class score arrays, so each score
    # is stored under the name of its own class even when some class is absent.
    labels = np.union1d(y_true, preds)
    f1_scores = f1_score(y_true, preds, labels=labels, average=None)
    precision_scores = precision_score(y_true, preds, labels=labels, average=None)
    recall_scores = recall_score(y_true, preds, labels=labels, average=None)

    for position, label in enumerate(labels):
        class_name = LABEL_MAPPING[int(label)]
        output[type_key + '_f_score_' + class_name] = np.around(f1_scores[position], round_param)
        # Precision must come from precision_scores (it was previously filled
        # with the recall values).
        output[type_key + '_precision_' + class_name] = np.around(precision_scores[position], round_param)
        output[type_key + '_recall_' + class_name] = np.around(recall_scores[position], round_param)

In [5]:
def results(model, type_key, x, y_true, output, round_param=4):
    """
    Evaluate ``model`` on one data split and record the metrics in ``output``.

    Stores the accuracy under ``'<type_key>_acc'`` (and prints it), then
    delegates to get_f_scores for the F1 / precision / recall entries.

    Args:
        model: fitted estimator exposing ``score(x, y)`` and ``predict(x)``.
        type_key: prefix for the generated keys (e.g. 'train', 'valid', 'test').
        x: features of the split.
        y_true: ground-truth class ids of the split.
        output: dict that is updated in place.
        round_param: number of decimals kept for every stored metric.
    """
    # get accuracy
    acc = model.score(x, y_true)
    print("Accuracy {:.2f}".format(acc))
    output['{}_acc'.format(type_key)] = np.around(acc, round_param)

    # get f score metrics; forward round_param so that every metric of the
    # split is rounded consistently
    preds = model.predict(x)
    get_f_scores(y_true, preds, output, type_key, round_param)
    # Confusion-matrix export is currently disabled:
    # get_confusion_matrix(y_true, preds, output, type_key, round_param)
    

In [6]:
def populate_missing_params(output):
    """
    Insert '-' placeholders for fields that are not computed for LR.

    For each of the 'train', 'valid' and 'test' splits the key
    ``'<split>_loss'`` is set to '-'. ``output`` is modified in place and
    nothing is returned.
    """
    placeholder = '-'
    # Per-class fields to blank out; currently none. An entry such as 'loss'
    # would produce keys like train_loss_class_hateful.
    per_class_fields = ()
    overall_fields = ('loss',)
    class_ids = range(4)

    for split in ('train', 'valid', 'test'):
        for field in per_class_fields:
            output.update({
                '{}_{}_class_{}'.format(split, field, LABEL_MAPPING[class_id]): placeholder
                for class_id in class_ids
            })
        output.update(
            ('{}_{}'.format(split, field), placeholder) for field in overall_fields
        )
    

In [7]:
def output_to_csv(output, file_action_key='a+', experiment_name='logistic_regression_cv_baseline'):
    """
    Output results to .csv

    Adds the 'title', 'epoch' and 'learning_rate' entries to ``output`` (the
    latter two as '-' placeholders), makes sure the directory
    ``<ROOT_DIR>/results/<experiment_name>`` exists and hands the row to
    prepare_output_file, which targets ``results.csv`` inside that directory.

    Args:
        output: dict with the metrics of one run; modified in place.
        file_action_key: passed through to prepare_output_file unchanged
            (presumably the file open mode, e.g. 'w' or 'a+' - confirm there).
        experiment_name: stored as the row's 'title' and used as the name of
            the results sub-directory.
    """
    output['title'] = experiment_name
    output['epoch'] = '-'
    output['learning_rate'] = '-'

    results_dir = os.path.join(ROOT_DIR, 'results/{}'.format(experiment_name))
    if not os.path.isdir(results_dir):
        # makedirs also creates a missing parent 'results' folder (os.mkdir
        # would raise), and exist_ok tolerates the directory appearing between
        # the check and the call.
        os.makedirs(results_dir, exist_ok=True)
        print("Directory added")
    prepare_output_file(filename=os.path.join(results_dir, 'results.csv'), output=[output], file_action_key=file_action_key)
    

In [None]:

"""
Experiments 

"""

experiment_seeds = [26, 27, 28]

# Keys every exported row must contain, built in a fixed order:
# acc, loss, f_score, then f_score / precision / recall for each class, for the
# 'train', 'valid' and 'test_experiment' prefixes.
output_keys = []
for prefix in ['train', 'valid', 'test_experiment']:
    output_keys += ['{}_acc'.format(prefix),
                    '{}_loss'.format(prefix),
                    '{}_f_score'.format(prefix)]
    for class_id in sorted(LABEL_MAPPING):
        class_name = LABEL_MAPPING[class_id]
        output_keys += ['{}_f_score_{}'.format(prefix, class_name),
                        '{}_precision_{}'.format(prefix, class_name),
                        '{}_recall_{}'.format(prefix, class_name)]

for i, seed in enumerate(experiment_seeds):
    print("=== Experiment with seed {} running ===".format(seed))

    data = TextDataProvider(path_data, path_labels).generate_tdidf_embeddings(seed)

    print("=== Model Started Training ===")
    start = time.time()

    model = LogisticRegressionCV(Cs=10, random_state=seed, solver='lbfgs', multi_class='multinomial')
    model = model.fit(data['x_train'], data['y_train'])

    print("=== Model Completed Training ({:.2f} min) ===".format((time.time() - start) / 60))

    output = {}
    output['seed'] = seed
    populate_missing_params(output)  # so that we can add to same sheet as Neural Nets
    for type_key in ['train', 'valid', 'test']:
        print("=== Processing {} set ===".format(type_key))
        results(model, type_key, data['x_{}'.format(type_key)], data['y_{}'.format(type_key)], output)
        print('\n')

    # Fill in the keys that are still missing: 'test_experiment_*' entries are
    # copies of the matching 'test_*' entries, anything else becomes '-'.
    for item in output_keys:
        if item not in output:
            if 'test' in item:
                output[item] = output[item.replace('experiment_', '')]
            else:
                output[item] = '-'
    print(output)

    # Export inside the loop so that every seed gets its own row. The first
    # seed uses 'w' and later seeds 'a+' (presumably: start a fresh file, then
    # append - confirm in prepare_output_file).
    file_action_key = 'w' if i == 0 else 'a+'
    output_to_csv(output, file_action_key)
