In [None]:
import pandas as pd
import numpy as np
import sys

In [None]:
# Load the dataset CSV (path is relative to the notebook's working directory).
# Later cells read its 'proc_sent', 'comparison' and 'preferred' columns.
df = pd.read_csv('data/labeled_data/CPC_labeled_proc_dataset.csv')

In [None]:
# Make the project-local utilities importable.
# NOTE(review): this is a hard-coded absolute path into one user's home
# directory; the cell will not run on another machine unless it is adjusted.
sys.path.insert(1, '/home/ahaque2/code/utils')
from WordEmbedding import WordEmbedding
# Project-local embedding helper used by get_masked_embedding.
# The meaning of the constructor argument `1` is not visible here — TODO confirm.
we = WordEmbedding(1)

from transformers import AutoTokenizer, AutoModel
import torch

# Module-level tokenizer and model for the sentence-transformers BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

import spacy
# NOTE(review): `nlp` is not referenced in any other cell visible here.
nlp = spacy.load("en_core_web_lg")

def get_masked_embedding(texts):
    """Build one embedding row per text using the module-level `we` helper.

    Each text is stripped of surrounding whitespace and passed on its own, as a
    single-element list, to `we.get_word_embeddings`; the first entry of the
    result's `.tolist()` is kept as that text's row.

    Args:
        texts: Iterable of strings.

    Returns:
        `np.array` built from the collected rows, in input order.
    """
    rows = [
        we.get_word_embeddings([text.strip()]).tolist()[0]
        for text in texts
    ]
    return np.array(rows)

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model output; element 0 holds the token embeddings.
        attention_mask: Mask tensor; it is unsqueezed on the last axis and
            expanded to the token-embedding size, so positions whose mask is 0
            contribute nothing to the sum.

    Returns:
        The masked sum of token embeddings along dim 1 divided by the mask sum
        along dim 1 (clamped to at least 1e-9 so an all-zero mask cannot divide
        by zero).
    """
    embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    masked_total = torch.sum(embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return masked_total / mask_total

def get_bert_sent_emb(sentences, batch_size=None):
    """Encode sentences into mean-pooled BERT sentence embeddings.

    Uses the module-level `tokenizer` and `model` (loaded once from the
    'sentence-transformers/bert-base-nli-mean-tokens' checkpoint) instead of
    re-loading both from the HuggingFace Hub on every call.

    Args:
        sentences: A single string or a sequence of strings to encode.
        batch_size: Optional number of sentences per forward pass. ``None``
            (the default) encodes everything in a single batch. A positive
            integer splits the input into consecutive chunks, each padded to
            its own longest sentence, which bounds peak memory on large inputs.

    Returns:
        Tensor with one row per input sentence, as produced by `mean_pooling`.

    Raises:
        ValueError: If `sentences` is empty or `batch_size` is less than 1.
    """
    # A bare string is one sentence, not a sequence of characters.
    if isinstance(sentences, str):
        sentences = [sentences]
    else:
        sentences = list(sentences)

    if not sentences:
        raise ValueError("sentences must contain at least one sentence")
    if batch_size is None:
        batch_size = len(sentences)
    elif batch_size < 1:
        raise ValueError("batch_size must be a positive integer or None")

    pooled = []
    for start in range(0, len(sentences), batch_size):
        batch = sentences[start:start + batch_size]

        # Tokenize sentences
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        # Compute token embeddings (inference only, no gradients needed)
        with torch.no_grad():
            model_output = model(**encoded_input)

        # Perform pooling. In this case, attention-mask-aware mean pooling.
        pooled.append(mean_pooling(model_output, encoded_input['attention_mask']))

    return pooled[0] if len(pooled) == 1 else torch.cat(pooled, dim=0)

In [None]:
# Task selector: 'csi' uses the `comparison` column (cast to int) as labels;
# any other value (e.g. 'cpc') uses the `preferred` column.
task = 'cpc'
y_true = (
    df['comparison'].astype(int).tolist()
    if task == 'csi'
    else df['preferred'].tolist()
)

# Sentences and labels as NumPy arrays so they can be indexed by split indices.
input_text = df['proc_sent'].tolist()
X, y = np.array(input_text), np.array(y_true)

In [None]:
def sample_data():
    """Create a stratified 80/20 train/test split and save its indices.

    Reads the notebook-level `input_text` and `y_true`, splits them with
    `StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)` and
    writes the chosen indices to 'final_data/train_index.npy' and
    'final_data/test_index.npy'.

    Returns:
        Tuple `(X_train, X_test, y_train, y_test)` — note this order differs
        from the one returned by `get_data`.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    features = np.array(input_text)
    labels = np.array(y_true)

    splitter = StratifiedShuffleSplit(n_splits=2, test_size=0.2, random_state=0)
    # Two splits are generated, but only the last one is kept.
    # NOTE(review): n_splits=1 would yield a different (the first) split, so
    # changing it would not reproduce previously saved indices.
    train_index, test_index = list(splitter.split(features, labels))[-1]

    np.save('final_data/train_index.npy', train_index)
    np.save('final_data/test_index.npy', test_index)

    return (
        features[train_index],
        features[test_index],
        labels[train_index],
        labels[test_index],
    )

def get_data(task):
    """Slice the notebook-level `X` and `y` using previously saved split indices.

    Args:
        task: 'csi' loads the indices stored under 'final_data/comp/'; any
            other value loads those under 'final_data/pref/'.

    Returns:
        Tuple `(X_train, y_train, X_test, y_test)`, where the training part is
        the saved train indices followed by the saved validation indices.
    """
    if task == 'csi':
        train_path = 'final_data/comp/train_index.npy'
        test_path = 'final_data/comp/test_index.npy'
        val_path = 'final_data/comp/val_index.npy'
    else:
        train_path = 'final_data/pref/train_index.npy'
        test_path = 'final_data/pref/test_index.npy'
        val_path = 'final_data/pref/val_index.npy'

    train_part = np.load(train_path)
    test_index = np.load(test_path)
    val_part = np.load(val_path)

    # Validation examples are folded into the training set.
    fit_index = np.concatenate((train_part, val_part))

    return X[fit_index], y[fit_index], X[test_index], y[test_index]

# Load the saved split for the selected task. Passing `task` (rather than a
# hard-coded 'cpc') keeps the split indices in sync with the labels in `y`,
# which were chosen from the same `task` value.
X_train, y_train, X_test, y_test = get_data(task)
# Alternative: draw a fresh split (note the different return order).
#X_train, X_test, y_train, y_test = sample_data()

In [None]:
# Sanity check: shapes of the train/test text and label arrays.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Encode the train and test sentences with get_bert_sent_emb (one call per split).
train_sent_emb = get_bert_sent_emb(X_train.tolist())
test_sent_emb = get_bert_sent_emb(X_test.tolist())

In [None]:
# train_sent_emb = get_embedding_features(X_train)
# test_sent_emb = get_embedding_features(X_test)

In [None]:
#sent_emb = np.array(sent_emb)
# Sanity check: shapes of the train/test sentence embeddings.
train_sent_emb.shape, test_sent_emb.shape

In [None]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
# Alternative classifiers, kept commented out; uncomment exactly one
# assignment to swap the model used below.
#clf = AdaBoostClassifier(n_estimators=100)
#clf = RandomForestClassifier(n_estimators=100)
#clf = tree.DecisionTreeClassifier()
#clf = GaussianNB()
#clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
#clf = MLPClassifier(random_state=1, max_iter=300)
# Active classifier.
clf = NearestCentroid()

# Fit on the training sentence embeddings and their labels.
clf.fit(train_sent_emb, y_train)

In [None]:
# Predict labels for the test sentence embeddings.
y_pred = clf.predict(test_sent_emb)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision/recall/F1 report, followed by the confusion matrix
# (rows: true labels, columns: predicted labels).
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

### scikit-learn multi-class, multi-output classification

In [None]:
import numpy as np
from sklearn.datasets import make_multilabel_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier

def multiclass_multioutput_classifier(train_sent_emb, test_sent_emb, df, train_idx=None):
    """Jointly predict the `comparison` and `preferred` labels with k-NN.

    A `MultiOutputClassifier` wrapping `KNeighborsClassifier` is fitted on the
    training embeddings against both label columns at once, then applied to the
    test embeddings.

    Args:
        train_sent_emb: Training embeddings, one row per training example, in
            the same order as the training indices.
        test_sent_emb: Test embeddings to predict on.
        df: DataFrame providing the 'comparison' and 'preferred' columns.
        train_idx: Positional row indices of the training examples in `df`.
            When omitted, the notebook-level `train_index` is used, which
            matches the original behaviour.

    Returns:
        Tuple `(y_pred_comp, y_pred_pref)`: predicted 'comparison' labels and
        predicted 'preferred' labels for the test embeddings.
    """
    if train_idx is None:
        # Backward-compatible fallback to the notebook-level global.
        train_idx = train_index

    # Two target columns: column 0 is 'comparison', column 1 is 'preferred'.
    targets = df[['comparison', 'preferred']].to_numpy()

    clf = MultiOutputClassifier(KNeighborsClassifier()).fit(train_sent_emb, targets[train_idx])
    y_pred = clf.predict(test_sent_emb)

    # The ground-truth test labels were previously computed here but never
    # used; dropping them removes a hidden dependency on a global `test_index`.
    return y_pred[:, 0], y_pred[:, 1]

# Pick the saved index files for the selected task: 'csi' reads the comp/
# split, anything else (e.g. 'cpc') reads the pref/ split.
task = 'cpc'
index_files = (
    (
        'final_data/comp/train_index.npy',
        'final_data/comp/test_index.npy',
        'final_data/comp/val_index.npy',
    )
    if task == 'csi'
    else (
        'final_data/pref/train_index.npy',
        'final_data/pref/test_index.npy',
        'final_data/pref/val_index.npy',
    )
)
train_index, test_index, val_index = [np.load(path) for path in index_files]

# Validation examples are folded into the training set.
train_index = np.concatenate((train_index, val_index))

y_pred_comp, y_pred_pref = multiclass_multioutput_classifier(train_sent_emb, test_sent_emb, df)

from sklearn.metrics import classification_report, confusion_matrix

# Report each output separately: first 'comparison', then 'preferred'.
# `y_test` / `y_pred` are rebound on every pass and end up holding the
# 'preferred' values.
for label_column, y_pred in (('comparison', y_pred_comp), ('preferred', y_pred_pref)):
    y_test = df[label_column][test_index]
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))