In [19]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.regularizers import l2
from tqdm import tqdm
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
df = pd.read_csv('OnionOrNot.csv')
batches = np.array_split(df,24)

In [37]:
# print(f'Number of records: {len(data)}')
# print(f'Number of Onion headlines: {len(data[data["label"] == 1])}')
# print(f'Number of r/NotTheOnion headlines: {len(data[data["label"] == 0])}')

Number of records: 2000
Number of Onion headlines: 746
Number of r/NotTheOnion headlines: 1254


In [3]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [4]:
tokenizeds = []
for batch in tqdm(batches):
    tokenizeds.append(batch['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True))))

100%|██████████| 24/24 [00:07<00:00,  3.11it/s]


In [5]:
paddeds = []
for tokenized in tokenizeds:
    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)
    
    paddeds.append(np.array([i + [0]*(max_len-len(i)) for i in tokenized.values]))

In [6]:
attention_masks = []
for padded in paddeds:
    attention_masks.append(np.where(padded != 0, 1, 0))

In [8]:
last_hidden_states = []
i = 0
for padded in tqdm(paddeds):
    input_ids = torch.tensor(padded).to(torch.int64)
    attention_mask = torch.tensor(attention_masks[i])
    
    with torch.no_grad():
        last_hidden_states.append(model(input_ids, attention_mask=attention_mask))

    i += 1
    

100%|██████████| 24/24 [11:51<00:00, 29.64s/it]


In [10]:
features = []
for last_hidden_state in last_hidden_states:
    features.append(last_hidden_state[0][:,0,:].numpy())

In [13]:
concated_features = np.concatenate(features)

In [15]:
labels = df['label']
labels

0        1
1        0
2        1
3        1
4        1
        ..
23995    1
23996    0
23997    1
23998    1
23999    0
Name: label, Length: 24000, dtype: int64

In [16]:
train_features, test_features, train_labels, test_labels = train_test_split(concated_features, labels, test_size=0.25)

In [18]:
parameters = {'C': np.linspace(0.0001, 100, 20)}
grid_search = GridSearchCV(LogisticRegression(), parameters)
grid_search.fit(train_features, train_labels)

print('best parameters: ', grid_search.best_params_)
print('best scores: ', grid_search.best_score_)

best parameters:  {'C': 5.263252631578947}
best scrores:  0.8253333333333334


In [30]:
lr_clf = LogisticRegression(C=2.5)
lr_clf.fit(train_features, train_labels)

# rf = RandomForestClassifier()
# rf.fit(train_features, train_labels)

# xg = XGBClassifier()
# xg.fit(train_features, train_labels)


print('lr train', lr_clf.score(train_features, train_labels))
print('lr test', lr_clf.score(test_features, test_labels))

# print('rf train', rf.score(train_features, train_labels))
# print('rf test', rf.score(test_features, test_labels))

# print('xg train', xg.score(train_features, train_labels))
# print('xg test', xg.score(test_features, test_labels))

lr train 0.8885
lr test 0.8638333333333333


In [26]:
model = Sequential()
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.001), input_dim=768))
model.add(Dense(units=64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [28]:
model.fit(train_features, train_labels, epochs=10, batch_size=8,
          validation_data=(test_features, test_labels))

Train on 18000 samples, validate on 6000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x2b9bc47b288>