# Building a RoBERTa-Classifier 


In this notebook we analyse our RoBERTa classifiers. The classifier consists of a fully connected layer on top of a RoBERTa model.

The implementation is based on:
- https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/
- https://huggingface.co/transformers/model_doc/bert.html
- https://huggingface.co/transformers/model_doc/roberta.html


In [None]:
# Mount Google Drive at /content/gdrive (uses the google.colab package)
from google.colab import drive
drive.mount('/content/gdrive')

## Loading all needed Libraries

In [None]:
! pip install transformers==3
! pip install tokenizers

In [None]:
import transformers
from transformers import RobertaModel, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import re
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix
import random
import pickle

- Get GPU

In [None]:
# Use the first CUDA GPU when one is available, otherwise run on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(f'Used device is {device}')

## Building RoBERTa Classifier Model
Here we use the classical RoBERTa model 'roberta-base'. This might be replaced by more specific pretrained models. 

### Bert specific processing

In [None]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
# NOTE(review): 'roberta-base' presumably uses a case-sensitive vocabulary, so capitalization of the
# input is kept rather than discarded — confirm.
PRE_TRAINED_MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

- Get understanding of the distribution of token sizes for maximal length used in BERT

### Build PyTorch Dataset and DataLoader

This section builds the basic functionality for the RoBERTa classifier, DataLoaders etc.

In [None]:
# Data Structure
class SentenceDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per item.

    Args:
        sents: Indexable collection of sentences; items are converted with ``str``.
        labels: Indexable collection of class labels, aligned with ``sents``.
        tokenizer: Callable tokenizer; it is called with the sentence and the keyword
            arguments shown in ``__getitem__`` and must return a mapping that contains
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every sentence is padded / truncated to.
    """

    def __init__(self, sents, labels, tokenizer, max_len):
        self.sents = sents
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sents)

    def __getitem__(self, item):
        """Return the sentence, its flattened token ids / attention mask and its label tensor."""
        sent = str(self.sents[item])
        label = self.labels[item]

        # Use the tokenizer passed to the constructor; the previous code silently relied on a
        # module-level variable named `tokenizer` and ignored the constructor argument.
        encoding = self.tokenizer(sent,
                                  truncation=True,
                                  add_special_tokens=True, # add the model's special start / end tokens
                                  return_token_type_ids=False,
                                  padding = 'max_length',
                                  max_length=self.max_len,
                                  return_attention_mask=True,
                                  return_tensors='pt')

        # The tokenizer returns tensors with a leading batch dimension; flatten() removes it
        return {
            'sent': sent,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
# Data Loader
def create_data_loader(sentences, labels, tokenizer, max_len, batch_size):
    """Wrap sentences and labels in a ``SentenceDataset`` and return a ``DataLoader`` over it.

    No shuffling is requested, so batches follow the order of ``sentences``.
    The loader is created with two worker processes.
    """
    dataset = SentenceDataset(sents=sentences, labels=labels, tokenizer=tokenizer, max_len=max_len)
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=2)
    return loader

### Actual Model

Two types of models with the same weights! First is the 'normal' one. The second additionally gives the [CLS]-vectors for clustering. 

In [None]:
class StyleClassifier(nn.Module):
    """Pretrained RoBERTa encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of the returned score vector).
        drop: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, n_classes, drop = 0.3):

        super(StyleClassifier, self).__init__()
        self.roberta = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=drop)
        self.out = nn.Linear(self.roberta.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class scores for a batch of token ids and attention masks."""
        encoder_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Take the second output by index instead of tuple-unpacking: indexing works for a plain
        # tuple and for the dict-like output objects of newer transformers releases, whereas
        # unpacking such an object yields its keys (strings) instead of tensors.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        return self.out(output)

class StyleClassifier_forCluster(nn.Module):
    """Same layers as ``StyleClassifier``, but ``forward`` also returns the pooled encoder output.

    The attribute names (``roberta``, ``drop``, ``out``) match ``StyleClassifier``, so the same
    saved state dict can be loaded into both classes.

    Args:
        n_classes: Number of output classes (size of the returned score vector).
        drop: Dropout probability applied to the pooled encoder output.
    """

    def __init__(self, n_classes, drop = 0.3):

        super(StyleClassifier_forCluster, self).__init__()
        self.roberta = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=drop)
        self.out = nn.Linear(self.roberta.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return ``(class scores, pooled encoder output)``; the pooled output is taken before dropout."""
        encoder_outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Take the second output by index instead of tuple-unpacking: indexing works for a plain
        # tuple and for the dict-like output objects of newer transformers releases, whereas
        # unpacking such an object yields its keys (strings) instead of tensors.
        pooled_output = encoder_outputs[1]
        output = self.drop(pooled_output)
        return self.out(output), pooled_output

### Define Helper functions

Some functionality used in evaluation

In [None]:
# This provides just a way to illustrate our confusion matrices in a nice and labeled way
def show_confusion_matrix(confusion_matrix, names = ['Scientific', 'Non-Scientifc'], save_path = None):
  """Draw a confusion matrix as an annotated heat map.

  Args:
    confusion_matrix: Square matrix of values (rows: true classes, columns: predicted classes,
      as labelled on the axes). Note that this parameter name shadows the imported sklearn
      function of the same name inside this function.
    names: Class names used for both the row and the column labels; not modified.
    save_path: If given, the figure is also written to this path.
  """
  # Plot the matrix that was passed in; the previous code read a global variable `cm` instead
  confusion_df = pd.DataFrame(confusion_matrix, index=names, columns=names)
  plt.figure(figsize=(5,5))
  sns.heatmap(confusion_df,annot=True,annot_kws={"size": 12},cbar=False, square=True,fmt='.2f')
  plt.ylabel(r'True categories',fontsize=14)
  plt.xlabel(r'Predicted categories',fontsize=14)
  plt.tick_params(labelsize=12)
  if save_path:
    plt.savefig(save_path)
  plt.show()

In [None]:
def get_predictions(model, data_loader):
  """Run ``model`` over ``data_loader`` and collect predictions, true labels and raw outputs.

  Args:
    model: Module called as ``model(input_ids=..., attention_mask=...)``; it must return one
      score vector per example.
    data_loader: Iterable of batches, each a dict with the keys ``input_ids``,
      ``attention_mask`` and ``label``.

  Returns:
    Tuple ``(predictions, real_values, prediction_probs)`` of CPU tensors: the index of the
    largest score per example, the labels from the loader, and the unmodified model outputs
    (no softmax is applied here).
  """
  # eval mode disables dropout
  model = model.eval()

  pred_batches, output_batches, label_batches = [], [], []
  with torch.no_grad():
    for batch in data_loader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["label"].to(device)

      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      # torch.max(..., dim=1) returns (values, positions); the positions are the predicted classes
      preds = torch.max(outputs, dim=1)[1]

      pred_batches.append(preds)
      output_batches.append(outputs)
      label_batches.append(labels)

  # Concatenate the per-batch tensors along the example dimension and move them to the CPU
  predictions = torch.cat(pred_batches).cpu()
  prediction_probs = torch.cat(output_batches).cpu()
  real_values = torch.cat(label_batches).cpu()
  return predictions, real_values, prediction_probs

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Compute accuracy and mean batch loss of ``model`` on ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of batches (dicts with ``input_ids``, ``attention_mask``, ``label``).
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Number of examples the correct-prediction count is divided by.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a double tensor, ``mean_loss`` the mean
        of the per-batch losses (``nan`` if the loader yields no batch).
    """
    model = model.eval()
    losses = []
    # Start from a tensor instead of the int 0, so that `.double()` below also works when the
    # loader yields no batch at all
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    with torch.no_grad():
        for d in data_loader:

            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["label"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # position of the largest score = predicted class
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, labels)
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    # np.mean([]) would emit a RuntimeWarning; return nan explicitly for an empty loader
    mean_loss = np.mean(losses) if losses else np.float64('nan')
    return correct_predictions.double() / n_examples, mean_loss

### Parameter choices
- Set hyper parameters and day of model to be loaded

In [None]:
import datetime

# Date that is part of the folder name of the saved model
day = datetime.date(2021, 6, 16)

# Hyper parameters; EPOCHS, BATCH_SIZE, lr and dropout are part of the folder name of the saved model
MAX_LEN = 100  # length (in tokens) every sentence is padded / truncated to
BATCH_SIZE = 32  # tunable hyper parameter
EPOCHS = 3
lr = 1e-5
dropout = 0.3

# Folder of the run; it holds the pickled data split and the model weights
run_dir = f'/models/{day}/RoBERTA_Epochs{EPOCHS}_Bs{BATCH_SIZE}_lr{lr}_drop{dropout}'
data_save_path = f'{run_dir}/model_specific_data.pkl'  # model specific data
model_save_path = f'{run_dir}/RoBERTa.bin'  # model parameters
#img_save_path = f'/content/gdrive/MyDrive/StyleClassifier/models/{day}/RoBERTA_Epochs{EPOCHS}_Bs{BATCH_SIZE}_lr{lr}_drop{dropout}/cm.png' # save confusion matrix, should include BERT and hyperparameters in name
#report_save_path = f'/content/gdrive/MyDrive/StyleClassifier/models/{day}/RoBERTA_Epochs{EPOCHS}_Bs{BATCH_SIZE}_lr{lr}_drop{dropout}/report.csv' # save classification report, should include BERT and hyperparameters in name

### Recreate data and LOAD model


This way we get the exact data partition used in training, ensuring that the test data is unseen by the model.


In [None]:
# Restore the pickled data split
with open(data_save_path, 'rb') as split_file:
  data_dict = pickle.load(split_file)

# Each split is a sequence of (sentence, label) pairs -> separate sentences and labels
(data_train, labels_train), (data_val, labels_val), (data_test, labels_test) = (
    tuple(zip(*data_dict[split])) for split in ('train', 'val', 'test')
)

# One DataLoader per split, all with the same tokenizer, sequence length and batch size
train_data_loader, val_data_loader, test_data_loader = (
    create_data_loader(sentences, labels, tokenizer, MAX_LEN, BATCH_SIZE)
    for sentences, labels in ((data_train, labels_train), (data_val, labels_val), (data_test, labels_test))
)

print(f'Number of training sentences: \t {len(data_train)}')
print(f'Number of validation sentences:  {len(data_val)}')
print(f'Number of test sentences: \t {len(data_test)} \n')

- LOAD MODEL FROM GIVEN PATH


In [None]:
# LOAD model
# Read the checkpoint once. map_location=device allows weights that were saved on a GPU to be
# loaded when the notebook runs on the CPU fallback.
state_dict = torch.load(model_save_path, map_location=device)

# Plain classifier (returns only the class scores)
model = StyleClassifier(2, drop = dropout)
model.load_state_dict(state_dict)
model = model.to(device)

# Same weights, but forward() additionally returns the pooled encoder output used for clustering
model_cluster = StyleClassifier_forCluster(2, drop = dropout)
model_cluster.load_state_dict(state_dict)
model_cluster = model_cluster.to(device)

## Standard Model Evaluation

Compare the values to those in the notebook used for training the classifier to ensure the loaded model is the same!

In [None]:
# Accuracy on the test split; the mean loss returned by eval_model is not used here
loss_fn = nn.CrossEntropyLoss().to(device)
test_acc, _ = eval_model(model, test_data_loader, loss_fn, device, len(data_test))
print(f'Test Accuracy: {test_acc.item()}')

- Calculate Scores 

In [None]:
# Classification report on the test split.
# Note: labels_test is rebound here to the label tensor returned by get_predictions.
labels_pred, labels_test, _ = get_predictions(model, test_data_loader)
report = classification_report(labels_test, labels_pred, output_dict=True)
# One row per entry of the report dict
report_df = pd.DataFrame(report).T

report_df

In [None]:
# Confusion matrix of the test predictions, normalized with normalize='true'
# (the stray `report_df` expression that used to open this cell had no effect and was removed)
cm = confusion_matrix(labels_test, labels_pred, normalize = 'true')
show_confusion_matrix(cm, names = ['Reports', 'Arxiv'], save_path=None)

# Analysing the classifier


In [None]:
# Do not truncate long cell contents (the sentences) when DataFrames are displayed
pd.set_option('display.max_colwidth', None)

In [None]:
# this path some csv's will be stored that can be used for further manual inspection of data
analysis_path = f'/models/{day}/analysis/'

! mkdir {analysis_path}

## Setup Data for Analysis
**Working with and setting up test data**

In [None]:
# Get predictions, true labels and the raw model outputs for the test split
labels_pred, labels_test, probs = get_predictions(model, test_data_loader)

# Turn the raw outputs into class probabilities. dim=1 normalizes over the classes of each
# sentence; calling softmax without `dim` relies on a deprecated implicit choice.
probs = F.softmax(probs, dim=1)

# Create DF for easier handling!
data = pd.DataFrame({
    'sent': list(data_test),
    'true_label': labels_test.tolist(),
    'pred_label': labels_pred.tolist(),
    'prob0': probs[:, 0].tolist(),
    'prob1': probs[:, 1].tolist(),
    # probability of the more likely class (the model has two classes)
    'prob_max': probs.max(dim=1)[0].tolist(),
})
data.head()

## Sentences in dependence of their prediction probability
In the next cells, the sentences which
  - the classifier is unsure about
  - the classifier wrongly predicted with high probability
  - the classifier correctly predicted with high probability

will be analyzed. To that end, some sentences are exported to a csv-file allowing better manual annotation 


In [None]:
# Sentences the classifier is unsure about: probability of class 0 strictly between 0.45 and 0.55
is_unsure = (data['prob0'] > 0.45) & (data['prob0'] < 0.55)
data_unsure = data.loc[is_unsure]
data_unsure.head(50)

In [None]:
# Confidently wrong predictions: Arxiv (true label 1) mistaken as Medium (predicted label 0)
# with a probability above 0.9
arxiv_as_medium = (data['true_label'] == 1) & (data['pred_label'] == 0) & (data['prob_max'] > 0.9)
data_sure = data.loc[arxiv_as_medium]

# Export for manual inspection
data_sure.to_csv(analysis_path + 'CSArxiv_as_Medium_90.csv')

In [None]:
# Confidently wrong predictions: Medium (true label 0) mistaken as Arxiv (predicted label 1)
# with a probability above 0.9
medium_as_arxiv = (data['true_label'] == 0) & (data['pred_label'] == 1) & (data['prob_max'] > 0.9)
data_sure = data.loc[medium_as_arxiv]

# Export for manual inspection
data_sure.to_csv(analysis_path + 'Medium_as_CSArxiv_90.csv')

In [None]:
# Sentences the classifier is correct about with high confidence (probability above 0.9) // Medium
cond = (data['pred_label'] == 0) & (data['true_label'] == 0) & (data['prob_max'] > 0.9)
# .copy(): the new column is added to an independent frame instead of a slice of `data`
# (avoids pandas' SettingWithCopyWarning)
data_sure_medium = data.loc[cond].copy()
# Number of tokens the tokenizer produces for each sentence
data_sure_medium['len'] = [len(tokenizer.tokenize(sent)) for sent in data_sure_medium['sent']]
data_sure_medium.head(50)

In [None]:
# Sentences the classifier is correct about with high confidence (probability above 0.9) // Arxiv
cond = (data['pred_label'] == 1) & (data['true_label'] == 1) & (data['prob_max'] > 0.9)
# .copy(): the new column is added to an independent frame instead of a slice of `data`
# (avoids pandas' SettingWithCopyWarning)
data_sure_arxiv = data.loc[cond].copy()
# Number of tokens the tokenizer produces for each sentence
data_sure_arxiv['len'] = [len(tokenizer.tokenize(sent)) for sent in data_sure_arxiv['sent']]
data_sure_arxiv.head(50)

## Rewriting sentences and checking their new classification

In [None]:
# Pairs of (original sentence, rewritten sentence). Some originals appear twice with different rewrites.
list_of_rewritten_sents = [
                           ('its vertices are enclosed by the dashed contour.', 'the vertices of this are surrounded by a shaded region.'),
                           ('however, it had never been empirically evaluated before this work.', 'no one has looked at this before in an empirical way.'),
                           ('let us now give a brief description of the concatenated coding scheme.', 'i will now decsribe the the concatenated coding scheme.'),
                           ('let us now give a brief description of the concatenated coding scheme.', 'This now decsribes the the concatenated coding scheme.'),
                           ('given the small difference in performance between the algebraic lattices, it is difficult to propose an optimal candidate.', 'it is very difficult to find an optimum candidate, because the differences are so tiny.'),
                           ('to lower bound the marginal joint replenishment cost we perform a similar analysis to the one in eq.', 'i try to lower bound the cost in the same way as before.'),
                           ('we perform multiple restarts of gradient ascend with random initializations for the nodes ( e ), ( f ), ( g ).', 'the start the optimization from different starting points for the nodes ( e ), ( f ), ( g ).'),
                           ('it is then possible to get an ordered ( from left to right ) overview of the articles on that topic.', 'you can get the overview of the articles on that topic from left to right.'),
                           ('it is then possible to get an ordered ( from left to right ) overview of the articles on that topic.', 'This way one can get the overview of the articles on that topic from left to right.'),
                           ('third, the bandwidths of the network connecting the nodes are limited.', 'also, there is some limitation of the bandwidths of the network.'),
                           ('thus, we obtain the following algorithm for computing the well - founded semantics.', 'We have built this algorithm for finding well - founded semantics.'),
                           ('is result implies that the mean packet delay and delay jitter can go to infinity even if the system is not saturated.', 'this shows cleary that the mean delay and delay jitter can gow super large also when the system is not saturated.'),
                           ('we perform the spectral analysis needed to prove our main result.', 'We do spectral analysis to show that our main result is true.'),
                           ('our focus on these two methods is due to their widespread adoption by the signal processing and statistics communities.', 'we look at these two methods, because they are often used in signal processing and statistics.'),
                           ('furthermore, the combination of friction effects and muscle-pose ambiguity leads to a strong hysteresis effect.', 'also, firction effects and and muscle-pose ambiguity together can result in to strong hysteresis effect.'),
                           ('this will cost large computations to obtain the accurate implicit surface.', 'getting an accurate implicit surface will take long computations.'),
                           ('comparison between the results show that the performance of egc is very close to the performance of oc receiver.', 'looking at the results we cane see that the performance of egc is almost the same as the performance of oc receiver.'),
                           ('the fast convergence of the derived formula is due to the identified mock-gaussian behavior.', 'our formula converges very fast because of the identified mock-gaussian behavior.'),
                           ('we therefore consider scores resulting from algorithms based on editorial changes to be less informative for this comparison.', 'because of this we think that scores from algorithms based on editorial changes are not really good for this comparison.'),
                           ('our work raises a number of questions which aim to further strengthen our understanding of the long term influence.', 'we look at some questions which are supposed to help understnd the long term effects.'),
                           ('of course, the feedback can be substantially reduced by exploiting channel correlations.', 'obviously, you need less feedback if taking into account that there is correlations between the channels.'),
                           ('the proposed optimum solution algorithm is numerically simulated to find the optimum departure region.', 'we simulate the optimum solution algorithm to find the best of the depature regions.'),
                           ('the proposed algorithm is implemented to estimate the wind power output considering practical wind data.', 'our algorithm is built to predict the of wind power using wind data.')

]

In [None]:
# Split the pairs into original and rewritten sentences
originals = [original for (original, _) in list_of_rewritten_sents]
rewritten = [rewrite for (_, rewrite) in list_of_rewritten_sents]
print(rewritten)

# The labels passed to the loader are placeholders (all 1); only the predictions are used below
rewritten_data_loader = create_data_loader(rewritten, [1 for _ in rewritten], tokenizer, MAX_LEN, BATCH_SIZE)
labels_rewritten, _, _ = get_predictions(model, rewritten_data_loader)
print(labels_rewritten)

# Count the rewritten sentences that are not predicted as class 1. Converting to int makes the
# message show a plain number instead of `tensor(n)`.
# NOTE(review): this presumes every original sentence is classified as 1 — confirm.
n_changed = int((labels_rewritten != 1).sum())
print(f'Out of {len(labels_rewritten)} rewritten sentences, {n_changed} were now classified differnetly')

## Quantitative differences between the classes

In [None]:
# Number of tokens the tokenizer produces for each sentence
data['token_length'] = data['sent'].map(lambda sent: len(tokenizer.tokenize(sent)))
data.head()

In [None]:
# Normalized histograms of the token lengths, one per true class
lengths_medium = data['token_length'][data['true_label'] == 0]
lengths_arxiv = data['token_length'][data['true_label'] == 1]

fig, ax = plt.subplots(1, 1, figsize=(18, 8))
ax.hist([lengths_medium, lengths_arxiv], label=['Medium', 'Arxiv'], bins=100, density=True)
ax.legend()
ax.set_title('Length of Tokens', fontsize=16)
plt.show()


## Looking for Unscientific Keywords

In [None]:
def keyword_odds(dataframe, keywords):
  """Compare how often sentences of the two classes contain at least one keyword.

  Sentences are lower-cased and split on whitespace; a sentence counts as a hit if any of its
  tokens equals one of the keywords. Keywords are lower-cased as well, so that e.g. 'I' can match.

  Args:
    dataframe: DataFrame with the columns 'sent' (sentence) and 'true_label' (1 or 0).
    keywords: Iterable of keywords.

  Returns:
    Proportion of label-1 sentences with a keyword divided by the proportion of label-0
    sentences with a keyword. ``inf`` if only label-1 sentences contain a keyword, ``nan`` if
    no sentence contains one or if one of the classes has no sentences.
  """
  wanted = {word.lower() for word in keywords}

  def proportion_with_keyword(label):
    # Select by the labels of the DataFrame that was passed in (the previous code used the
    # global `data` for the mask)
    sentences = dataframe['sent'][dataframe['true_label'] == label]
    if len(sentences) == 0:
      return float('nan')
    hits = sum(1 for sent in sentences if not wanted.isdisjoint(sent.lower().split()))
    return hits / len(sentences)

  keywords_sci = proportion_with_keyword(1)
  keywords_nonsci = proportion_with_keyword(0)

  print(f'Proportion of sentences with keywords in Arxiv: {keywords_sci}')
  print(f'Proportion of sentences with keywords in Medium: {keywords_nonsci}')

  # Avoid a ZeroDivisionError when no label-0 sentence contains a keyword
  if keywords_nonsci == 0:
    return float('nan') if keywords_sci == 0 else float('inf')
  return keywords_sci/keywords_nonsci


In [None]:
# Alternative keyword sets: ['I','you', 'we', 'should', 'obviously', 'us', 'our'] or ['et']
keywords = ['you']
keyword_odds(dataframe = data, keywords=keywords)

Preprocessing helps!

## Vocabulary differences

In [None]:
def compare_vocabs(corpus1, corpus2):
  """Print the vocabulary sizes of two corpora and the sizes of their union and intersection.

  A vocabulary is the set of whitespace-separated tokens of all sentences of a corpus
  (no lower-casing or punctuation handling is done here). Nothing is returned.

  Args:
    corpus1: Iterable of sentences (strings).
    corpus2: Iterable of sentences (strings).
  """
  vocab1 = {token for sentence in corpus1 for token in sentence.split()}
  print(f'# All words Corpus1: {len(vocab1)}')

  vocab2 = {token for sentence in corpus2 for token in sentence.split()}
  print(f'# All words Corpus2: {len(vocab2)}')

  print(f'# All words: {len(vocab1 | vocab2)}')
  print(f'# shared words: {len(vocab1 & vocab2)}')

In [None]:
# Sentences of the test data, split by their true label
is_sci = data['true_label'] == 1
is_nonsci = data['true_label'] == 0
sci_data = list(data['sent'][is_sci])
nonsci_data = list(data['sent'][is_nonsci])

# Corpus1 = Arxiv (label 1), Corpus2 = Medium (label 0)
print('Arxiv Medium')
compare_vocabs(corpus1=sci_data, corpus2=nonsci_data)


This does not take frequencies into account. Some shared words might even have noticeable frequency differences. The vocabulary also still contains punctuation.

## Clustering with t-SNE


In [None]:
def get_cluster_vectors(data_loader, classifier):
  """Collect the second output of ``classifier`` for every example of ``data_loader``.

  Args:
    data_loader: Iterable of batches (dicts with the keys ``input_ids`` and ``attention_mask``).
    classifier: Module called as ``classifier(input_ids=..., attention_mask=...)`` that returns
      a pair ``(scores, vectors)``, as ``StyleClassifier_forCluster`` does (there the second
      element is the pooled encoder output).

  Returns:
    numpy array with one row per example, in the order given by the loader.
  """
  # eval mode disables dropout
  classifier = classifier.eval()

  vector_batches = []
  with torch.no_grad():
    for d in data_loader:

      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)

      # Only the vectors are needed here; the class scores are discarded
      _, cls_out = classifier(input_ids=input_ids, attention_mask=attention_mask)
      vector_batches.append(cls_out)

  # Concatenate the batches along the example dimension
  return torch.cat(vector_batches).cpu().numpy()


def plot_clusters(vectors, labels, names = {1: 'Sci:', 0: 'NonSci'}, title = 't-SNE of CLS-vectors'):
  """Scatter-plot the first two columns of ``vectors``, one colour per label.

  Args:
    vectors: 2-D array with one row per example; columns 0 and 1 are used as x and y.
    labels: Label of every row of ``vectors`` (array-like, e.g. a pandas Series).
    names: Mapping from label value to the name shown in the legend; it must contain exactly
      as many entries as there are distinct labels. Not modified.
    title: Title of the plot.
  """
  # Plain array, so that the boolean masks below are positional
  labels = np.asarray(labels)
  labels_unique = sorted(set(labels.tolist()))
  assert len(labels_unique) == len(names)

  fig, ax = plt.subplots(1,1, figsize = (16, 5))
  for label in labels_unique:
    mask = labels == label
    # Look the legend name up by the label value itself; the previous code used the position in
    # the list of unique labels, which is only right when the labels happen to be 0, 1, ...
    ax.scatter(vectors[mask, 0], vectors[mask, 1], label = names[label], alpha=.8, edgecolors='none')

  ax.set_title(title, fontsize = 12)
  ax.legend()
  plt.show()

from sklearn.manifold import TSNE
def illustrate_cluster(dataframe, classifier, n_components = 2, names = {1: 'Sci:', 0: 'NonSci'}, title = 't-SNE of CLS-vectors'):
  """Embed the sentences of ``dataframe``, reduce the vectors with t-SNE and plot them.

  Args:
    dataframe: DataFrame with the columns 'sent' and 'true_label'.
    classifier: Model passed on to ``get_cluster_vectors``.
    n_components: Number of t-SNE dimensions (the plot shows the first two).
    names: Mapping from label value to legend name, passed on to ``plot_clusters``.
    title: Title of the plot.

  Returns:
    The t-SNE-transformed vectors, one row per sentence.
  """
  # Plain lists: the dataset accesses items by position 0..n-1, which the index labels of a
  # filtered DataFrame would not provide
  sents = dataframe['sent'].tolist()
  labels = dataframe['true_label'].tolist()
  data_loader = create_data_loader(sents, labels, tokenizer, MAX_LEN, BATCH_SIZE)

  # get CLS embeddings
  print('Calculating CLS embeddings!')
  CLS_vectors = get_cluster_vectors(data_loader = data_loader, classifier = classifier)

  # Data dimensionality reduction (fixed random_state for reproducible plots)
  print('Reducing data dimesionality for illustration. This might take a while!')
  tsne = TSNE(n_components = n_components , init='pca', random_state=42)
  CLS_vectors_fitted = tsne.fit_transform(CLS_vectors)

  plot_clusters(vectors = CLS_vectors_fitted, labels = dataframe['true_label'].to_numpy(), names = names, title = title)

  return CLS_vectors_fitted


def dislpay_sents_cluster(xmin, xmax, ymin, ymax, CLS_vecs, all_sents, labels, n_sents = 10):
  """Print a random sample of the sentences whose 2-D vectors lie inside a rectangle.

  Args:
    xmin, xmax, ymin, ymax: Bounds of the rectangle (inclusive) on columns 0 and 1 of ``CLS_vecs``.
    CLS_vecs: 2-D array with one row per sentence.
    all_sents: Sentences, indexable by row number.
    labels: Labels, indexable by row number; items must provide ``.item()``.
    n_sents: Maximal number of sentences to print; fewer are printed if the rectangle holds fewer.
  """
  filtered_idxs = [
      idx for idx in range(CLS_vecs.shape[0])
      if xmin <= CLS_vecs[idx, 0] <= xmax and ymin <= CLS_vecs[idx, 1] <= ymax
  ]

  # random.sample raises a ValueError when asked for more items than available -> cap the sample size
  chosen_idxs = random.sample(filtered_idxs, min(n_sents, len(filtered_idxs)))
  for idx in chosen_idxs:
    print(all_sents[idx], '\t |', labels[idx].item() , '\n')


In [None]:
# Plot the t-SNE projection of the test sentences; the projected vectors are kept for the region look-ups below
cls_vecs = illustrate_cluster(dataframe = data, classifier = model_cluster, n_components = 2, names = {1: 'Sci:', 0: 'Medium'}, title = 't-SNE of CLS-vectors')

1) Cluster on Bottom: possessive 's

In [None]:
# Region 1) cluster on the bottom: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = -30, xmax=-20, ymin = -40, ymax=-32, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

2) Small Cluster on top. -> Authors

In [None]:
# Region 2) small cluster on top: print up to 10 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = -40, xmax=-20, ymin = 38, ymax=50, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 10)

3) Small Cluster left -> farewell clauses / gratitude clauses

In [None]:
# Region 3) small cluster on the left: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = -60, xmax=-50, ymin = -5, ymax=15, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

4) I sentences, authors

In [None]:
# Region 4) "I" sentences / authors: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = 25, xmax= 30, ymin = -10, ymax=0, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

5) Special punctuation

In [None]:
# Region 5) special punctuation: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = 30, xmax=50, ymin = -30, ymax=-20, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

6) Extremely short, casual sentences

In [None]:
# Region 6) extremely short, casual sentences: print up to 10 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = 50, xmax=60, ymin = -22, ymax=-18, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 10)

7) Broken by tokenization / preprocessing

In [None]:
# Region 7) sentences broken by tokenization / preprocessing: print up to 10 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = 30, xmax=35, ymin = -25, ymax=-20, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 10)

8) Main Medium

In [None]:
# Region 8) main Medium area: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = 20, xmax=40, ymin = 20, ymax=40, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

9) Main Mixed

In [None]:
# Region 9) main mixed area: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = -20, xmax=20, ymin = -20, ymax=20, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

10) Main Sci

In [None]:
# Region 10) main Sci area: print up to 20 random sentences (with their true label) from this rectangle of the t-SNE plot
dislpay_sents_cluster(xmin = -40, xmax=-20, ymin = -20, ymax=40, CLS_vecs=cls_vecs, all_sents=data['sent'], labels=data['true_label'], n_sents = 20)

**Take away from clusters**
  - still strong content dependence

## Manual Testing

In this section we classify sentences manually. Comparing the annotator's accuracy with the model's performance, as well as the characteristics the annotator relies on for the decisions, gives important information about both the model and the data.

In [None]:
def test_manually(data, labels, tries = 100, rand_state = 42):
  testing_myself = 0

  data_ = [d for d in data]
  try:
    labels_ = [l.item() for l in list(labels)]
  except:
    labels_ = [l for l in list(labels)]

  query = random.Random(rand_state).sample(list(zip(data_,labels_)), tries)
  
  for i, (sent, label) in enumerate(query):
    print( f'{i}/{tries}' , sent, f'\t accuracy: { (testing_myself)/(i+1)}', '\n')
    a = input()
    a = int(a)
    if a == label:
      testing_myself +=1


  print(f'Accuracy: {testing_myself/tries:.2f}')
  return testing_myself/tries



In [None]:
# Label 100 test sentences by hand (sample seed 12); requires typing one integer label per sentence
my_acc = test_manually(data_test, labels_test, 100, 12)

Accuracy on 100 sentences: 75%
Main decision making criteria:
  - style
  - complexity of vocabulary / content
  

## Checking paraphrases

In this section we load the paraphrases created by our M1 model (T5-phase2) and see whether paraphrasing has thrown off the classifier

In [None]:
# Location of the pickled paraphrase DataFrame on the mounted Google Drive
data_path = '/content/gdrive/MyDrive/NLP_EvaluationMetrics/T5_paraphrases/'
filename_dataframe = 'data_paraphrased_F1_top75k.pkl'
# NOTE(review): unpickling can execute code stored in the file — only load files from a trusted source
paraphrase_data =  pd.read_pickle(data_path + filename_dataframe)

In [None]:
# Original sentences get the label 1, their paraphrases the label 0
originals = list(paraphrase_data['original'])
label_originals = [1 for _ in originals]
paraphrases = list(paraphrase_data['paraphrase'])
label_paraphrases = [0 for _ in paraphrases]

# One frame with all sentences: first the originals, then the paraphrases
paraphrase_data_for_cluster = pd.DataFrame({
    'sent': originals + paraphrases,
    'true_label': label_originals + label_paraphrases,
})

# Plain lists are passed to the loader, because the dataset accesses items by position
# (the `.head()` expression that used to sit mid-cell had no effect and was removed)
paraphrase_loader = create_data_loader(
    paraphrase_data_for_cluster['sent'].tolist(),
    paraphrase_data_for_cluster['true_label'].tolist(),
    tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
# Classification report for originals (label 1) vs. paraphrases (label 0)
labels_para_pred, labels_para_test, _ = get_predictions(model, paraphrase_loader)
report_para = classification_report(labels_para_test, labels_para_pred, output_dict=True)
# One row per entry of the report dict
report_para_df = pd.DataFrame(report_para).T

In [None]:
# Display the classification report of the paraphrase experiment
report_para_df

In [None]:
# Confusion matrix of the paraphrase experiment (normalize='true'), shown as a raw array
cm_para = confusion_matrix(labels_para_test, labels_para_pred, normalize = 'true')
#show_confusion_matrix(cm_para, names = ['Paraphrase', 'Orig. Arxiv'], save_path=None)
cm_para

## Reports

In this section we load our reports to see whether the classifier is suitable to use: ideally it should consider most of these to be non-scientific

In [None]:
# Text file with the report sentences on the mounted Google Drive
data_path_reports = '/content/gdrive/MyDrive/StyleClassifier/datasets/almost_scientific_reports/final_processed/clean_report.txt'

# Read the file into a list of lines (line breaks are still attached)
with open(data_path_reports, 'r') as report_file:
  data_reports = list(report_file)

data_reports

In [None]:
# Strip the line breaks and lower-case every report sentence
data_reports = [line.replace('\n', '').lower() for line in data_reports]
data_reports

In [None]:
# All report sentences get the label 0
labels_reports = [0] * len(data_reports)
report_loader = create_data_loader(
    data_reports,
    labels_reports,
    tokenizer,
    MAX_LEN,
    BATCH_SIZE,
)

In [None]:
# Classification report for the report sentences (all of which carry the label 0)
labels_reports_pred, labels_reports_test, _ = get_predictions(model, report_loader)
report_reports = classification_report(labels_reports_test, labels_reports_pred, output_dict=True)
# One row per entry of the report dict
report_reports_df = pd.DataFrame(report_reports).T
report_reports_df

In [None]:
# Confusion matrix of the report experiment (normalize='true'), shown as a raw array
cm_reports = confusion_matrix(labels_reports_test, labels_reports_pred, normalize = 'true')
# NOTE(review): the disabled plot call below still refers to cm_para and the paraphrase class names — adapt before enabling
#show_confusion_matrix(cm_para, names = ['Paraphrase', 'Orig. Arxiv'], save_path=None)
cm_reports

# Results/Takeaways
