In [1]:
# installing the Transforming library that contains the bert models 
!pip install transformers



In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

warnings.filterwarnings('ignore')

In [61]:
## Import the data set 
data_source_url = "https://raw.githubusercontent.com/kolaveridi/kaggle-Twitter-US-Airline-Sentiment-/master/Tweets.csv"
df = pd.read_csv(data_source_url, delimiter=',', header=None)

In [58]:
# Load the pretrain model 

model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [62]:
# Preparing the data set 
# TOKENIZAtION

df = df[:2000] #original dataset has over 14,000 inputs which effects peformance
tokenized = df[10].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [63]:
# Padding 
max_len =0 
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
np.array(padded).shape

(2000, 62)

In [36]:
# MASKING 
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(2000, 62)

In [37]:
# Run the model and the input 

input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

In [44]:
features = last_hidden_states[0][:,0,:].numpy()
labels = df.iloc[:,1]

In [64]:
# TRAIN TEST and SPLIT 
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

# Logistic regression
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [65]:
lr_clf.score(test_features, test_labels)

0.804