In [2]:
!pip install transformers

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
from sklearn.metrics import precision_recall_fscore_support
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')



In [6]:
class BERT(object):
    """Sentence-feature extractor (DistilBERT/BERT) with a logistic-regression head.

    Intended call order, each step storing its result on the instance:
    ``bert_tokenise`` -> ``padding`` -> ``attention_mask`` -> ``bert_train``
    -> ``sigmoid_model``.
    """

    def __init__(self, x, y, distilBERT=True, fullBERT=False):
        """Store the data and load the pretrained tokenizer and model.

        Args:
            x: Sentences to embed. ``bert_tokenise`` calls ``x.apply(...)``,
                so this must offer that method (e.g. a pandas Series).
            y: Labels; ``sigmoid_model`` slices them with the same row
                positions as the extracted features.
            distilBERT: Use ``distilbert-base-uncased``. Takes precedence over
                ``fullBERT`` when both are truthy.
            fullBERT: Use ``bert-base-uncased``; only consulted when
                ``distilBERT`` is falsy.

        Raises:
            ValueError: If neither ``distilBERT`` nor ``fullBERT`` is set
                (there would be no model/tokenizer to load).
        """
        super(BERT, self).__init__()

        # Fail fast: without this check the attribute lookups below would
        # raise an opaque AttributeError on ``self.tokenizer_class``.
        if not (distilBERT or fullBERT):
            raise ValueError("Select a model: set distilBERT=True or fullBERT=True")

        self.labels = y
        self.data_x = x

        # Filled in by padding(), attention_mask() and bert_train().
        self.features = None
        self.padded = None
        self.attention_m = None
        self.last_hidden_states = None

        self.distilBERT = distilBERT
        self.fullBERT = fullBERT

        if distilBERT:
            # For DistilBERT:
            self.model_class, self.tokenizer_class, self.pretrained_weights = (
                ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
        else:
            # Full BERT (guaranteed requested here by the check above).
            self.model_class, self.tokenizer_class, self.pretrained_weights = (
                ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

        # Load pretrained model/tokenizer
        self.tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.model = self.model_class.from_pretrained(self.pretrained_weights)

    def bert_tokenise(self):
        """Tokenise every sentence in ``self.data_x`` into a list of token ids.

        Returns:
            The result of ``self.data_x.apply(...)``: one encoded id list per
            sentence, with the tokenizer's special tokens added.
        """
        return self.data_x.apply(
            lambda sentence: self.tokenizer.encode(sentence, add_special_tokens=True)
        )

    def padding(self, list_of_tokenized):
        """Right-pad every token list with 0 up to the longest list's length.

        Args:
            list_of_tokenized: Object with ``__len__`` and a ``.values``
                iterable of token-id lists (e.g. the Series returned by
                ``bert_tokenise``).

        Returns:
            ``self``; the padded matrix is stored in ``self.padded``.
        """
        print("starting to perform padding on the input length of: ", len(list_of_tokenized))
        token_rows = list_of_tokenized.values
        # default=0 keeps empty input from raising in max().
        max_len = max((len(row) for row in token_rows), default=0)
        self.padded = np.array([row + [0] * (max_len - len(row)) for row in token_rows])
        print("Padding over and we have padded array of shape: ", (self.padded).shape)
        return self

    def attention_mask(self):
        """Build a 0/1 mask marking non-padding positions of ``self.padded``.

        A position is masked out (0) wherever the padded id equals 0, the
        value ``padding`` fills with.

        Returns:
            ``self``; the mask is stored in ``self.attention_m``.

        Raises:
            RuntimeError: If ``padding`` has not been called yet.
        """
        # ``None != 0`` is True, so without this guard np.where would quietly
        # produce a meaningless 0-d array instead of failing.
        if self.padded is None:
            raise RuntimeError("call padding() before attention_mask()")
        self.attention_m = np.where(self.padded != 0, 1, 0)
        print("Done with attention mask. Shape of attention mask matrix is: ", self.attention_m.shape)
        return self

    def bert_train(self):
        """Run the pretrained model once (no gradients) and extract features.

        The whole padded matrix is passed to the model in a single forward
        call. ``self.features`` receives position 0 of the first output tensor
        for every row (the leading special token when the tokenizer adds
        special tokens); the raw model output is kept in
        ``self.last_hidden_states``.

        Returns:
            ``self``.

        Raises:
            RuntimeError: If ``padding`` / ``attention_mask`` were not run first.
        """
        if self.padded is None or self.attention_m is None:
            raise RuntimeError("call padding() and attention_mask() before bert_train()")

        input_ids = torch.tensor(self.padded)
        attention_mask = torch.tensor(self.attention_m)

        with torch.no_grad():
            last_hidden_states = self.model(input_ids, attention_mask=attention_mask)
        self.features = last_hidden_states[0][:, 0, :].numpy()
        self.last_hidden_states = last_hidden_states
        return self

    def sigmoid_model(self, params=[0.0001, 100, 200], split_index=4494):
        """Tune and evaluate a logistic-regression classifier on the features.

        Rows ``[:split_index]`` are the training set and rows
        ``[split_index:]`` the test set. ``C`` is chosen by ``GridSearchCV``
        over ``params``; accuracy and weighted precision/recall/F1 on the test
        rows are printed.

        Args:
            params: Candidate values for ``C``. The default list is never
                mutated; it is copied into the parameter grid.
            split_index: Row position separating train from test. Defaults to
                4494, the previously hard-coded value.

        Raises:
            RuntimeError: If ``bert_train`` has not produced features yet.
            ValueError: If labels and features differ in length, or if
                ``split_index`` would leave the train or test set empty.
        """
        if self.features is None:
            raise RuntimeError("call bert_train() before sigmoid_model()")

        # Plain array => slicing below is positional whatever index the
        # labels carried (e.g. duplicate labels after a concat).
        labels = np.asarray(self.labels)
        n_rows = len(self.features)
        if len(labels) != n_rows:
            raise ValueError(
                "features and labels differ in length: %d vs %d" % (n_rows, len(labels)))
        if not 0 < split_index < n_rows:
            raise ValueError(
                "split_index=%r leaves an empty train or test set for %d rows"
                % (split_index, n_rows))

        # split data
        train_features, test_features = self.features[:split_index], self.features[split_index:]
        train_labels, test_labels = labels[:split_index], labels[split_index:]

        # parameter_tuning
        parameters = {'C': list(params)}
        grid_search = GridSearchCV(LogisticRegression(), parameters)
        grid_search.fit(train_features, train_labels)

        print('best parameters: ', grid_search.best_params_)
        print('best scrores: ', grid_search.best_score_)

        # sigmoid_model: with the default refit=True, GridSearchCV has already
        # refitted the best C on the full training set, so reuse that model
        # instead of fitting an identical one again.
        lr_clf = grid_search.best_estimator_
        acc_test = lr_clf.score(test_features, test_labels)
        print("Accuracy Score on Test Data: ", acc_test*100)
        y_pred = lr_clf.predict(test_features)
        prf = precision_recall_fscore_support(test_labels, y_pred, average='weighted')
        print("Precision Score on Test Data: ", prf[0])
        print("Recall Score on Test Data: ", prf[1])
        print("F1-Score Score on Test Data: ", prf[2])
        

        

In [7]:
# comment this later and use your own data
# df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
df1 = pd.read_csv('processed_tweets.csv')
df2 = pd.read_csv('processed_tweets_val.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of df1 and df2 are both kept and repeat in the result.
df = pd.concat([df1, df2], ignore_index=True)

# batch_1 = df[:200]
# Work on a copy: `batch_1 = df` would only alias the frame, so the column
# assignment on batch_1 further down would silently modify df as well.
batch_1 = df.copy()

def sentence_truncate(x, max_chars=511):
    """Return ``x`` cut to at most ``max_chars`` leading items.

    For a string this caps the number of characters, not the number of
    tokens a tokenizer will later produce from it.
    NOTE(review): the 511 default looks chosen with a 512-token model limit in
    mind; a character cap does not strictly guarantee that once special
    tokens are added. Confirm against the tokenizer in use.

    Args:
        x: Sliceable value, e.g. a tweet string.
        max_chars: Maximum length to keep. Defaults to 511.

    Returns:
        ``x[:max_chars]``; inputs already short enough come back unchanged.

    Raises:
        ValueError: If ``max_chars`` is negative (a negative slice bound
            would drop characters from the end instead of capping length).
    """
    if max_chars < 0:
        raise ValueError("max_chars must be non-negative")
    # Slicing already clamps to len(x), so no explicit min() is needed.
    return x[:max_chars]
    
# Cap the length of every tweet before it reaches the tokenizer.
batch_1['tweets'] = batch_1['tweets'].map(sentence_truncate)

# Inputs (tweet text) and targets (labels) from our own data.
# For the header-less SST2 sample frame use columns 0 and 1 instead.
x = batch_1['tweets']
y = batch_1['label']

# Build the extractor and tokenise the tweets.
bert = BERT(x, y)
tokenized = bert.bert_tokenise()

# padding(), attention_mask() and bert_train() each return the instance,
# so the three steps run as one chain; `res` is therefore `bert` itself.
res = bert.padding(tokenized).attention_mask().bert_train()

# Grid-search C over the given candidates and print the test metrics.
bert.sigmoid_model(params=[0.1, 0.001, 0.0001])

starting to perform padding on the input length of:  200
Padding over and we have padded array of shape:  (200, 83)
Done with attention mask. Shape of attention mask matrix is:  (200, 83)
best parameters:  {'C': 0.1}
best scrores:  0.8935672514619881
Accuracy Score on Test Data:  88.67924528301887
Precision Score on Test Data:  0.8955347756321586
Recall Score on Test Data:  0.8867924528301887
F1-Score Score on Test Data:  0.8859791802212102
