In [0]:
%%capture
!pip install transformers

In [0]:
import pandas as pd
import numpy as np

import transformers
from transformers import DistilBertModel, DistilBertTokenizer, DistilBertConfig

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split

from tqdm.notebook import tqdm

import torch

In [3]:
sentneces_df = pd.read_csv('drive/My Drive/!UBS/DW/scored_sentences.csv', index_col=0)
print(sentneces_df.shape)
sentneces_df.head()

(147062, 3)


Unnamed: 0,sha,sentence,score
0,f056da9c64fbf00a4645ae326e8a4339d015d155,Metagenomic classification methods are based o...,0.0
1,f056da9c64fbf00a4645ae326e8a4339d015d155,A representative set of recent methods are des...,0.0
2,f056da9c64fbf00a4645ae326e8a4339d015d155,genus or species) across the entire tree of life.,0.0
3,f056da9c64fbf00a4645ae326e8a4339d015d155,They generally require reference databases tha...,0.0
4,f056da9c64fbf00a4645ae326e8a4339d015d155,The gap SIANN is designed to fill is when the ...,0.051282


In [4]:
# Filter out links
sentneces_df = sentneces_df[~sentneces_df['sentence'].str.contains('https://')]
print(sentneces_df.shape)

(138169, 3)


### Select 10 best sentences per article as important, oversample them x3 times and take 30 wrong sentences.

In [0]:
# Parameters
no_good_sent = 5
good_samples = 20
bad_samples = 20

X = []
y = []
articles = set(sentneces_df['sha'])
for article in articles:
    article_sentneces = sentneces_df[sentneces_df['sha'] == article]
    if len(article_sentneces) < 15: # Filter out short anomalies
        continue
    article_sentneces_sorted = article_sentneces.sort_values('score',  ascending=False)
    good_sentneces = article_sentneces_sorted[:no_good_sent].sample(good_samples, replace=True)['sentence'].values
    bad_sentneces = article_sentneces_sorted[no_good_sent:].sample(bad_samples, replace=True)['sentence'].values
    X.append(list(good_sentneces) + list(bad_sentneces))
    y.append(len(good_sentneces)*[1] + len(bad_sentneces)*[0])

# Flatten lists
X = [item for sublist in X for item in sublist]
y = [item for sublist in y for item in sublist]

### Create embeddings of sentences

In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
config = DistilBertConfig(output_hidden_states = True)
model = DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)
model.cuda();

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




In [7]:
tokenized = [tokenizer.encode(x, add_special_tokens=True) for x in X]
max_len = 0
for i in tokenized:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized])
print(padded.shape)
attention_mask = np.where(padded != 0, 1, 0)
print(attention_mask.shape)

(73520, 83)
(73520, 83)


In [8]:
# Split  
no_batches = 500
batched_padded = np.array_split(padded, no_batches)
batched_attention_mask = np.array_split(attention_mask, no_batches)
# Create vector of features that are CLS tokens
features = []
for tokens, mask in tqdm(list(zip(batched_padded, batched_attention_mask))):
    # Create tensors and run through the model
    X_tensor = torch.tensor(tokens).cuda()  
    attention_mask_tensor = torch.tensor(mask).cuda()

    last_hidden_states = model(X_tensor, attention_mask=attention_mask_tensor)
    cls_tokens = last_hidden_states[0][:,0,:].cpu().detach().numpy()
    features.append(cls_tokens)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [9]:
features_stacked = np.concatenate(features, axis=0)
features_stacked.shape

(73520, 768)

### Split for test and training

In [0]:
X_train, X_test, y_train, y_test = train_test_split(features_stacked, y, stratify=y, test_size=0.2, shuffle=True)

### Train Logistic Regression

In [0]:
clf = LogisticRegression(max_iter=1500)
dummy = DummyClassifier()

In [12]:
clf.fit(X_train, y_train)
dummy.fit(X_train, y_train);



In [0]:
pred = clf.predict(X_test)
dummy_pred = dummy.predict(X_test)

acc = accuracy_score(y_test, pred)
dummy_acc = accuracy_score(y_test, dummy_pred)

bacc = balanced_accuracy_score(y_test, pred)
dummy_bacc = balanced_accuracy_score(y_test, dummy_pred)

In [14]:
print('Acc: {:.2f}, dummy: {:.2f}'.format(acc, dummy_acc))
print('BAcc: {:.2f}, dummy: {:.2f}'.format(bacc, dummy_bacc))

Acc: 0.57, dummy: 0.50
BAcc: 0.57, dummy: 0.50
