# Imports and Constants

In [None]:
# Base name of the results CSV that this notebook reads and later writes back to.
lazy_copy = 'post_classification_random_sample_8-28'

import numpy as np
from datetime import datetime
import numpy as np
from math import floor
from utils import *
from functools import partial
import torch
from torch import nn
from math import floor, ceil
import pandas as pd
from collections import Counter
from tqdm import tqdm
from matplotlib import pyplot as plt
import random
import seaborn as sns

# Run-level constants shared by the cells below.
label_col = 'answer_vector'
seed = 42  # fixed; the alternative was a random draw, e.g. np.random.randint(0, 110)

# Timestamp of this run, truncated to the minute.
right_now = datetime.now().replace(second=0, microsecond=0)

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Lookup table that collapses the raw labels into the coarse classes used below.
_COARSE_LABELS = {
    'background': 'context',
    'future_work': 'context',
    'differences': 'context',
    'future work': 'context',
    'motivation': 'context',
    'similarities': 'context',
    'extends': 'extends',
    'uses': 'uses',
    '*': 'context',
}


def mapping(s):
    """Return the coarse label for `s`; values without an entry are returned unchanged."""
    return _COARSE_LABELS.get(s, s)


def vector_from_string(s):
    """Parse a bracketed, space-separated vector string into a float array.

    The first and last characters of `s` (the enclosing brackets) are dropped,
    the remainder is split on single spaces, each piece is stripped of
    surrounding whitespace, and empty pieces are ignored.
    """
    pieces = (chunk.strip() for chunk in s[1:-1].split(' '))
    return np.array([float(piece) for piece in pieces if piece])

def vector_from_string_bool(s):
    """Parse a bracketed, space-separated string of boolean-like tokens into a 0/1 array.

    The first and last characters of `s` (the enclosing brackets) are dropped,
    the remainder is split on single spaces, and each non-empty, stripped token
    is translated case-insensitively: 'true', '1' and '1.' become 1; 'false',
    '0' and '0.' become 0.

    Raises:
        ValueError: if a token is not one of the recognised spellings. An
            unknown token used to be turned into ``None`` silently, which
            produced an object-dtype array that only failed later, far from
            the offending input.
    """
    token_values = {'true': 1, 'false': 0, '1': 1, '0': 0, '1.': 1, '0.': 0}
    values = []
    for raw in s[1:-1].split(' '):
        token = raw.strip()
        if not token:
            continue
        try:
            values.append(token_values[token.lower()])
        except KeyError:
            raise ValueError(f"unrecognized boolean token {token!r} in {s!r}") from None
    return np.array(values)

# Load existing or default dataframe, query LLM

In [None]:
# Location of the results CSV for this run; any '/' in the run name is replaced by ':'.
# The replacement field uses double quotes: reusing the f-string's own quote
# character inside the braces is a SyntaxError before Python 3.12.
other_path = f'~/Desktop/2.FutureTech/uniform_sample/results/{lazy_copy.replace("/", ":")}.csv'
df = pd.read_csv(other_path) #update_labels(other_path, save = True)


# Sort by 'multisentence', renumber the rows, then shuffle all of them with the fixed seed.
df = df.sort_values(by = 'multisentence').reset_index(drop = True)
df = df.sample(frac=1, random_state = seed)
#df.rename(columns={'json_booleans':'json_response'}, inplace = True)
print(len(df))
df.head()

# Parse each stored answer string into a numeric 0/1 vector.
df['answer_vector'] = df['answer_vector'].apply(vector_from_string_bool)


In [None]:
# All answer vectors must share one length; that common length is the question count.
answer_lengths = df['answer_vector'].apply(len)
total_questions = answer_lengths.max()
assert total_questions == answer_lengths.min()


In [None]:
# Fraction of the (already shuffled) rows that goes to the training set.
tt_split = 2/3
split = floor(tt_split * len(df))
print(f"Test-train split: {split}, {len(df) - split}")

df_train = df.iloc[:split]
df_test = df.iloc[split:]


def label_mask(s):
    """Boolean mask over the current `df_train` selecting rows whose mapped 'alex' label equals `s`."""
    return df_train['alex'].apply(mapping) == s


# Coarse labels in sorted order, and how often each training label would have to
# be repeated to match the most frequent one (only used by the optional
# oversampling line below).
labels = sorted({mapping(item) for item in ['background', 'extends', 'uses']})
label_counts = [len(df_train[label_mask(label)]) for label in labels]
per_label_samples = max(label_counts)
per_label_multipliers = [ceil(per_label_samples/count) for count in label_counts]


# Enable the next line to oversample every label up to the largest one; leave it
# commented out to keep the original label distribution.
#df_train = pd.concat([df_train[label_mask(label)] for label, multiplier in zip(labels, per_label_multipliers) for _ in range(0, multiplier)])
print(f"Training size {len(df_train)}")

# Creating a model!

In [None]:
# Integer class index assigned to each coarse label.
_CLASS_INDEX = {'context': 0, 'uses': 1, 'extends': 2}


def _as_features(frame):
    """Stack a frame's answer vectors into a float32 tensor, one row per sample."""
    return torch.tensor(np.stack(frame['answer_vector']), dtype = torch.float32)


def _as_targets(frame):
    """Encode a frame's mapped 'alex' labels as a float32 column tensor of class indices."""
    encoded = frame['alex'].apply(mapping).apply(lambda label: _CLASS_INDEX.get(label))
    return torch.tensor(np.array(encoded), dtype = torch.float32).reshape(-1, 1)


X_train, y_train = _as_features(df_train), _as_targets(df_train)
X_test, y_test = _as_features(df_test), _as_targets(df_test)
print("Training shape:", X_train.shape, y_train.shape)
print("Testing shape:", X_test.shape, y_test.shape)

# Share of test rows whose class index is 0 ('context'); train_model compares the
# test accuracy against it to detect an "everything is background" classifier.
c = Counter(el.item() for el in y_test)
background_composition = c[0]/len(df_test)
print(f"Background composition: {background_composition}")


In [None]:
# Plot the output of question_frequency_distribution for class indices 0, 1 and 2,
# in the same order as the legend entries.
for _class_idx in (0, 1, 2):
    plt.plot(question_frequency_distribution(X_train, y_train, _class_idx))

# Tick every second question index.
plt.xticks(ticks = list(range(0, X_train.shape[1], 2)))

plt.legend(["Context", "Uses", "Extends"])

print(get_highest_relative_questions(X_train, y_train, target_index = 0, reference_index = 1, n = 10))

In [None]:
top_n_questions = 30
# The question lists are hard-coded below; the commented-out calls are presumably
# how they were originally derived.
#best_uses_questions = get_highest_relative_questions(X_train, y_train, target_index = 1, reference_index = 0, n = top_n_questions)
#best_extends_questions = get_highest_relative_questions(X_train, y_train, target_index = 2, reference_index = 1, n = top_n_questions)

best_uses_questions = [10, 0, 13, 11, 23, 20, 8, 28, 3, 18, 15, 1, 2, 21, 17, 22, 4, 9, 14, 5, 16, 25, 19, 6, 26, 29, 27, 24, 52, 12]
best_extends_questions = [50, 30, 40, 31, 41, 51, 38, 42, 35, 32, 52, 48, 54, 58, 53, 33, 34, 44, 36, 37, 47, 56, 43, 1, 8, 12, 18, 22, 57, 3]


def heuristic_model(X,
                    best_uses_questions = best_uses_questions,
                    best_extends_questions = best_extends_questions,
                    uses_threshold = 3/10,
                    extends_threshold = 3/10):
    """Classify each row of `X` as 0, 1 or 2 from the share of selected answers that are set.

    For every row, the answers at `best_uses_questions` are summed and divided
    by the number of those questions; the same is done for
    `best_extends_questions`. A row scores 0 unless its "uses" share is
    strictly above `uses_threshold`; otherwise it scores 2 when its "extends"
    share is strictly above `extends_threshold`, and 1 when it is not.

    Returns a 1-D NumPy array with one class index per row.
    """
    def _exceeds(question_ids, threshold):
        # Per-row share of the selected questions, compared with the threshold.
        share = X[:, question_ids].sum(axis = 1) / len(question_ids)
        return share > threshold

    uses = np.array([int(bool(hit)) for hit in _exceeds(best_uses_questions, uses_threshold)]).reshape(-1,)
    extends = np.array([1 + int(bool(hit)) for hit in _exceeds(best_extends_questions, extends_threshold)]).reshape(-1,)
    # 0 * anything stays 0; a "uses" row becomes 1 or 2 depending on the extends flag.
    return uses * extends

def get_acc(uses_t, extends_t, X = X_train, y = y_train):
    """Return the accuracy of `heuristic_model` on `(X, y)` for the given thresholds.

    `uses_t` and `extends_t` are forwarded as the model's two thresholds. `y`
    is a tensor of class indices; it is flattened before being compared with
    the predictions. The defaults are the `X_train` / `y_train` objects that
    existed when this function was defined.
    """
    predictions = heuristic_model(X, uses_threshold=uses_t, extends_threshold=extends_t)
    targets = y.cpu().detach().numpy().reshape(-1, )
    n_correct = (predictions == targets).sum()
    return (n_correct/len(X)).item()



In [None]:
# Display the training answers restricted to the columns listed in best_uses_questions.
X_train[:, best_uses_questions]

In [None]:
# Training accuracy of the heuristic model on a grid of thresholds 0.1, 0.2, ..., 1.9:
# rows vary the "uses" threshold, columns the "extends" threshold.
results = [
    [
        get_acc(uses_t = uses_step/10, extends_t = extends_step/10, X = X_train, y = y_train)
        for extends_step in range(1, 20)
    ]
    for uses_step in range(1, 20)
]
sns.heatmap(results)
plt.ylabel('Uses threshold')
plt.xlabel('Extends threshold')

In [None]:
# Build the classifier through the project helper, with one input per question
# and one output per coarse label.
model = generateModel(
            input_size = total_questions,
            classes = len(labels),
            dropout_rate=0.7,
            depth = 15,
            base = 1.1
)
# The tensors below are moved to `device`, so the model has to live there too;
# otherwise the forward pass fails on any machine where `device` is not 'cpu'.
model = model.to(device)


# NOTE(review): the weight tensor has 2 entries while `classes = len(labels)` is 3
# in this notebook. CrossEntropyLoss expects one weight per class, so training
# will fail until a third weight is chosen. Confirm the intended weights.
loss_fn = nn.CrossEntropyLoss(weight = torch.Tensor([1,5]).to(device))

X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

In [None]:
def train_model(model, verbose = False, iters = 1000, lr = 1e-4):
    """Train `model` on random 10% mini-batches of the notebook-level training set.

    Reads the notebook-level `X_train`, `y_train`, `X_test`, `y_test`,
    `loss_fn` and `background_composition`.

    Args:
        model: network to train; assumed to be a `torch.nn.Module`, since its
            `parameters()`, `train()` and `eval()` methods are used.
        verbose: when true, print the loss and accuracies every 100 epochs.
        iters: number of mini-batch steps to run.
        lr: learning rate used from epoch 0; it is divided by 10 again at each
            later horizon (epochs 500 and 1000).

    Returns:
        `(train_accuracies, test_accuracies)`: two lists with one entry per
        step, holding the accuracy on the sampled mini-batch and on the whole
        test set.
    """
    # Epoch 0 is itself a horizon and divides by 10, so pre-scale to start at `lr`.
    lr = lr * 10
    learning_rate_horizons = {0, 500, 1000}
    train_accuracies, test_accuracies = [], []

    batch_size = ceil(len(X_train) * .1)
    # CrossEntropyLoss takes class indices as a 1-D Long tensor; the labels are
    # stored as float32 columns, so flatten and cast them before use.
    test_targets = y_test.reshape(-1).long()

    for epoch in range(iters):
        if epoch in learning_rate_horizons:
            lr = lr / 10
            print(f"lr: {lr}")
            # A new optimizer is created at every horizon (this also resets Adam's state).
            optimizer = torch.optim.Adam(params = model.parameters(), lr = lr)

        sample = random.sample(range(len(X_train)), batch_size)
        X_sample = X_train[sample]
        y_sample = y_train[sample].reshape(-1).long()

        model.train()
        output = model(X_sample)
        loss = loss_fn(output, y_sample)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_accuracy = (output.argmax(dim = 1) == y_sample).sum().item()/len(y_sample)

        # Evaluate with dropout disabled and without building an autograd graph,
        # so the reported test accuracy is that of the deterministic model.
        model.eval()
        with torch.no_grad():
            test_predictions = model(X_test).argmax(dim = 1)
        test_accuracy = (test_predictions == test_targets).sum().item()/len(test_targets)

        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)

        if epoch % 100 == 0 and verbose:
            # An accuracy equal to the share of class 0 suggests a constant prediction.
            if np.abs(test_accuracy - background_composition) <= .0001:
                print("WARNING, YOU ARE CLASSIFYING EVERYTHING AS BACKGROUND")
            print(f"Epoch: {epoch}, train loss: {loss.item()}, train acc: {train_accuracy}, test ac {test_accuracy}")

    return train_accuracies, test_accuracies


#train_accuracies, test_accuracies = train_model(model, verbose=True, iters = 1000)

In [None]:
def run_model(model, v, threshold =  0.80):
    """Classify a single answer vector with `model` and return its label string.

    Args:
        model: callable taking a `(1, n)` float tensor. It may return a single
            class index (as `heuristic_model` does) or one score per class.
        v: answer vector (sequence, NumPy array or tensor), or None.
        threshold: minimum score the best class must reach when the model
            returns one score per class; below it the result is 'context'.
            Ignored when the model returns a single class index.

    Returns:
        'context', 'uses' or 'extends'; None when `v` is None or when a
        single-value output is not one of the indices 0, 1, 2.

    Previously the function returned right after calling the model, which made
    the threshold logic unreachable and raised on any multi-class output.
    """
    if v is None:
        return None
    index_to_label = {0: 'context', 1: 'uses', 2: 'extends'}

    # as_tensor accepts lists, NumPy arrays and tensors alike.
    ten = torch.as_tensor(v, dtype = torch.float32).to(device).reshape(1, -1)
    with torch.no_grad():
        rankings = model(ten)

    if isinstance(rankings, torch.Tensor):
        rankings = rankings.cpu().detach().numpy()
    rankings = np.asarray(rankings).reshape(-1, )

    # A single value is already the predicted class index.
    if rankings.size == 1:
        return index_to_label.get(rankings.item())

    # NOTE(review): the threshold is compared with the raw model output; if the
    # network emits logits rather than probabilities, a softmax is needed first.
    classification = int(rankings.argmax())
    if rankings[classification] >= threshold:
        return index_to_label.get(classification)

    return 'context'

# Label every row with the heuristic model, then score the held-out slice.
classify = partial(run_model, heuristic_model)
df['learned_classification'] = df['answer_vector'].apply(classify)
test_predictions = df['learned_classification'].iloc[split:]
fp, fn = get_fp_fn(y_true=y_test, y_pred=test_predictions, verbose=False)

# Write the frame, including the new column, back to the CSV it was loaded from.
df.to_csv(other_path, index = False)
print(f"False positive {fp}, False negative {fn}")

In [None]:
# Run summary.
# NOTE(review): in the notebook as shown, `test_accuracies` is only assigned by the
# commented-out `train_model(...)` call, so this line raises NameError unless
# that call has been run first.
print(f"Last test acc: {test_accuracies[-1]}")
# How often each label was predicted over the whole frame.
print(Counter(df['learned_classification']))
# hash_dataframe of the true test labels, then of the predicted test labels
# (both lines are printed with the same "Test labels hash" prefix).
print(f"Test labels hash: {hash_dataframe(list(df_test['alex']))}")
print(f"Test labels hash: {hash_dataframe(list(df['learned_classification'].iloc[split:]))}")


In [None]:
def pareto_plot(X, y, start = .5, end = 1, num = 10, verbose = False, test_or_train = "Test", model = None):
    """Plot false positives and false negatives for 'uses' across confidence thresholds.

    Args:
        X: 2-D collection of answer vectors, one row per sample.
        y: true labels, passed through to `get_fp_fn`.
        start, end, num: the thresholds are `num` geometrically spaced values
            from `start` to `end`.
        verbose: forwarded to `get_fp_fn`.
        test_or_train: word used in the plot title.
        model: classifier handed to `run_model`. Defaults to `heuristic_model`,
            the classifier the notebook uses for its predictions. The threshold
            only changes the result for models that return per-class scores.

    `run_model` was previously called without its `model` argument, which
    raised TypeError on the first row.
    """
    if model is None:
        model = heuristic_model

    fps, fns = [], []
    thresholds = np.geomspace(start, end, num = num)
    print(thresholds)

    for t in tqdm(thresholds):
        pred = np.array([run_model(model, row, threshold = t) for row in X])

        fp, fn = get_fp_fn(y, pred, positive_label = 'uses', positive_index=1, verbose = verbose)

        fps.append(fp)
        fns.append(fn)

    plt.title(f"{test_or_train} FN vs. FP")
    plt.plot(thresholds * 100, fps, 'xr-')
    plt.plot(thresholds * 100, fns, 'xb-')
    plt.legend(['False Positive', 'False Negative'])
    plt.xlabel("Confidence threshold %")
    
#pareto_plot(X_train, y_train, verbose = False, test_or_train = "Train")


In [None]:
# Draw the FN-vs-FP threshold sweep for the held-out test set.
pareto_plot(X_test, y_test, test_or_train = "Test")