In [67]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import random

In [2]:
def save_data(filename, data):
    """Pickle ``data`` to the file at ``filename``.

    Args:
        filename: Path of the file to write; it is opened in binary write
            mode, so an existing file is overwritten.
        data: Any picklable object (the notebook stores lists of labelled
            records).

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If ``data`` cannot be pickled.
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, "wb") as a_file:
        pickle.dump(data, a_file)
    

def load_data(filename):
    """Return the object unpickled from the file at ``filename``.

    Args:
        filename: Path of a pickle file; it is opened in binary read mode.

    Returns:
        The object stored in the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        pickle.UnpicklingError: If the content is not a valid pickle.
    """
    # NOTE: unpickling can execute arbitrary code, so only load files that
    # were produced by a trusted source.
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, "rb") as a_file:
        return pickle.load(a_file)

In [55]:
# Iteration index, kept as a string because it is spliced into directory and
# file names in the cells below.
iteration = '0'
# Directory prefix for this iteration's files (the trailing slash is included).
current_directory = 'iteration{}/'.format(iteration)

In [56]:
# Load this iteration's records; later cells unpack each one as
# (sentence, pred_label, label, score).
predictions_path = '{}iteration_{}.pkl'.format(current_directory, iteration)
data = load_data(predictions_path)

In [31]:
# Overall accuracy: the share of records whose predicted label equals the
# true label.
overall_accuracy = np.array([predicted == actual for (_, predicted, actual, _) in data])
correct_count = np.sum(overall_accuracy)
print('Overall Accuracy:', correct_count/len(data))

Overall Accuracy: 0.5092857142857142


In [32]:
# Split the records into one list per *predicted* label.
pred_positive = []
pred_negative = []
pred_neutral = []
# Dispatch table: predicted label -> the list that collects its records.
buckets_by_label = {
    'positive': pred_positive,
    'negative': pred_negative,
    'neutral': pred_neutral,
}
for (sentence, pred_label, label, score) in data:
    bucket = buckets_by_label.get(pred_label)
    # Records carrying any other predicted label are not stored anywhere.
    if bucket is not None:
        bucket.append((sentence, pred_label, label, score))

# Sanity check: prints True only when every record landed in one of the lists.
bucketed_total = sum(len(group) for group in (pred_positive, pred_negative, pred_neutral))
print('Check:', bucketed_total == len(data))

Check: True


In [33]:
# Order each predicted-label group from highest to lowest prediction
# confidence; the score is the fourth field of every record.
def _confidence(record):
    """Return the confidence score stored at index 3 of a record."""
    return record[3]

sorted_pred_positive = list(pred_positive)
sorted_pred_positive.sort(key=_confidence, reverse=True)

sorted_pred_negative = list(pred_negative)
sorted_pred_negative.sort(key=_confidence, reverse=True)

sorted_pred_neutral = list(pred_neutral)
sorted_pred_neutral.sort(key=_confidence, reverse=True)

In [34]:
# Size of the top-10% slice for each predicted class. Integer division is
# used, so a class with fewer than 10 predictions gets a size of 0.
n_top10_positive, n_top10_negative, n_top10_neutral = (
    len(group) // 10 for group in (pred_positive, pred_negative, pred_neutral)
)

In [57]:
# Keep the most confident positive predictions and save them under the
# current iteration's directory.
top_confidence_positive = sorted_pred_positive[:n_top10_positive]
positive_out_path = '{}predicted_positive_top10_conf_{}.pkl'.format(current_directory, iteration)
save_data(positive_out_path, top_confidence_positive)

In [58]:
# Keep the most confident negative predictions and save them under the
# current iteration's directory.
top_confidence_negative = sorted_pred_negative[:n_top10_negative]
negative_out_path = '{}predicted_negative_top10_conf_{}.pkl'.format(current_directory, iteration)
save_data(negative_out_path, top_confidence_negative)

In [59]:
### Saving neutral sentences chosen from a fixed confidence band.
# Unlike the positive/negative cells, neutral records are not simply sliced
# from the top of the ranking: only scores strictly between the two bounds
# below qualify, and at most n_top10_neutral records are kept.
NEUTRAL_SCORE_LOW = 0.8
NEUTRAL_SCORE_HIGH = 0.85

top_confidence_neutral = []
for (sentence, pred_label, label, score) in sorted_pred_neutral:
    # Check the cap *before* appending. The previous version compared with
    # `==` after the append, so with n_top10_neutral == 0 the length jumped
    # to 1, never matched the cap, and every in-band record was selected.
    if len(top_confidence_neutral) >= n_top10_neutral:
        break
    if NEUTRAL_SCORE_LOW < score < NEUTRAL_SCORE_HIGH and pred_label == 'neutral':
        top_confidence_neutral.append((sentence, pred_label, label, score))

# NOTE(review): this file name has no underscore before the iteration number,
# unlike the positive/negative files ('..._top10_conf_' + iteration). It is
# left unchanged because code outside this view may load it by this name.
save_data(current_directory + 'predicted_neutral_top10_conf'  + iteration + '.pkl', top_confidence_neutral)

In [61]:
# Create the directory that will hold the next iteration's files (the new
# training set without the sentences selected above, plus the fine-tune set).
new_directory = 'iteration' + str(int(iteration) + 1)
# exist_ok=True keeps this cell re-runnable: os.mkdir raised FileExistsError
# whenever the directory was already there from an earlier run.
os.makedirs(new_directory, exist_ok=True)

In [80]:
# Merge the selected records of all three classes into one fine-tuning set.
fine_tune_data = top_confidence_positive + top_confidence_negative + top_confidence_neutral
# Shuffle with a dedicated, seeded generator so the saved order is the same on
# every run; the module-level random state is left untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(fine_tune_data)
# Sanity check: shuffling must not add or drop records.
print('Check:', len(fine_tune_data) == len(top_confidence_positive) + len(top_confidence_negative) + len(top_confidence_neutral))

save_data(new_directory + '/fine_tune_'  + str(int(iteration) + 1) + '.pkl', fine_tune_data)

Check: True


In [76]:
### Gather the sentence text of every selected record, in the order
### positive, negative, neutral.
chosen_sentences = [
    picked_sentence
    for group in (top_confidence_positive, top_confidence_negative, top_confidence_neutral)
    for (picked_sentence, _, _, _) in group
]

In [77]:
# Load the training set of the current iteration; the next cell unpacks its
# entries as (tweet_id, tweet, label).
train_path = '{}processed_data_train_{}.pkl'.format(current_directory, iteration)
original_dataset = load_data(train_path)

In [64]:
# Build the next iteration's training set: every original example whose text
# was not selected for fine-tuning.
# Membership is tested against a set instead of the chosen_sentences list, so
# each lookup is O(1) on average rather than a linear scan per tweet.
# Assumes sentences/tweets are hashable (e.g. str) - TODO confirm.
chosen_lookup = set(chosen_sentences)
next_iteration_data = [
    (tweet_id, tweet, label)
    for (tweet_id, tweet, label) in original_dataset
    if tweet not in chosen_lookup
]

In [65]:
# Save the reduced training set inside the next iteration's directory.
next_train_path = '{}/processed_data_train_{}.pkl'.format(new_directory, int(iteration) + 1)
save_data(next_train_path, next_iteration_data)