In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import trange, tqdm
#-------------------------------
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report

In [5]:
import spacy
nlp = spacy.load('en_core_web_lg', disable = ["tagger", "parser", "ner"])

In [6]:
train_data = pd.read_csv("train_data.csv")
test_data = pd.read_csv("test_data.csv")

In [None]:
train_data.head()

In [None]:
test_data.head()

In [None]:
pipe = Pipeline([("vectorizer", TfidfVectorizer()), ("mnb", MultinomialNB())], verbose=True)
pipe.fit(train_data['review'], train_data['sentiment'])
predClass = pipe.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass, target_names = train_data['sentiment'].unique()))

In [None]:
pipe2 = Pipeline([("vectorizer", TfidfVectorizer()), ("logreg", SGDClassifier(loss = 'log', random_state=1234))], verbose=True)
pipe2.fit(train_data['review'], train_data['sentiment'])
predClass2 = pipe2.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass2, target_names = train_data['sentiment'].unique()))

In [None]:
from sklearn.svm import LinearSVC
pipe3 = Pipeline([("vectorizer", TfidfVectorizer()), ("SVM", LinearSVC(random_state=1234))], verbose=True)
pipe3.fit(train_data['review'], train_data['sentiment'])
predClass3 = pipe3.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass3, target_names = train_data['sentiment'].unique()))

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class MeanSentenceVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def tokenizer(self, sentence):
        doc = nlp(sentence)
        preprocessed = [token.text for token in doc]
        return preprocessed
    
    def transform(self, X):
        return np.array(
            [np.mean([nlp.vocab[word].vector for word in self.tokenizer(sentence)], axis=0) for sentence in tqdm(X)]
            )

In [None]:
pipe4 = Pipeline([("vectorizer", MeanSentenceVectorizer()), ("SVM", LinearSVC(random_state=1234))], verbose=True)
pipe4.fit(train_data['review'], train_data['sentiment'])
predClass4 = pipe4.predict(test_data['review'])

print(classification_report(test_data['sentiment'], predClass4, target_names = train_data['sentiment'].unique()))

# Using DistilBERT

In [7]:
from transformers import DistilBertTokenizer, TFDistilBertModel

In [8]:
import tensorflow as tf

In [9]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [10]:
###############################
# Slice the data for trials
r = int(1/2 * len(train_data))

tr_d = pd.concat([train_data[:r], train_data[12500:(12500+r)]])
te_d = pd.concat([test_data[:r], test_data[12500:(12500+r)]])
###############################

In [11]:
def embed_sentences(X, maxlen=512):
    # tokenize sentences
    tokenized = []
    for sentence in tqdm(X, desc='Tokenizing sentences'):
        token_vec = tokenizer(sentence, return_tensors="tf", truncation=True, padding='max_length', max_length=maxlen)['input_ids']
        tokenized.append(token_vec)
    tokenized = tf.convert_to_tensor(tf.squeeze(tokenized))

    # embedding sentences
    vecs = []
    batches = list(tf.split(tokenized, 5000))
    for batch in tqdm(batches, desc='Processing sentences'):
        lhs = model(batch).last_hidden_state[:,0,:]
        vecs.append(lhs)
    return np.concatenate(vecs)

In [12]:
X_train = embed_sentences(tr_d['review'])
np.save('X_train', X_train)

HBox(children=(HTML(value='Tokenizing sentences'), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(HTML(value='Processing sentences'), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [13]:
X_test = embed_sentences(te_d['review'])
np.save('X_test', X_test)

HBox(children=(HTML(value='Tokenizing sentences'), FloatProgress(value=0.0, max=25000.0), HTML(value='')))




HBox(children=(HTML(value='Processing sentences'), FloatProgress(value=0.0, max=5000.0), HTML(value='')))




In [15]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=1234).fit(X_train, tr_d['sentiment'])
preds = clf.predict(X_test)
print(classification_report(te_d['sentiment'], preds, target_names = tr_d['sentiment'].unique()))

              precision    recall  f1-score   support

    negative       0.85      0.85      0.85     12500
    positive       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



# Using tensorflow:

In [None]:
import tensorflow as tf

In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)

In [None]:
# preprocess Y
y_train = np.array([0 if sent == 'negative' else 1 for sent in tqdm(tr_d['sentiment'])])
y_test = np.array([0 if sent == 'negative' else 1 for sent in tqdm(te_d['sentiment'])])

In [None]:
# preprocess X
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import string
import re

def custom_standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    stripped_html = tf.strings.regex_replace(lowercase, "<br />", " ")
    return tf.strings.regex_replace(
        stripped_html, "[%s]" % re.escape(string.punctuation), ""
    )

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1234)

In [None]:
x_train = vectorize_layer(X_train)
x_val = vectorize_layer(X_val)

In [None]:
num_words = len(vectorize_layer.get_vocabulary())
embd_dim = 100

In [None]:
# build model
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.regularizers import l2

np.random.seed(seed=12345)

model = Sequential()

model.add(Embedding(num_words, embd_dim, input_length=x_train.shape[1]))
model.add(Dropout(0.5))
model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation="relu"))
model.add(Dense(1, activation="sigmoid"))


""" model.add(Dense(
    300, activation='relu', input_shape = (X_train.shape[1],), 
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model.add(Dropout(0.3))

model.add(
    Dense(100, activation='relu',
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid')) """


model.compile(loss = binary_crossentropy, optimizer = Adam(learning_rate=0.01), metrics=['accuracy'])

model.summary()

In [None]:
history = model.fit(x_train, y_train, batch_size = 125, epochs = 10, verbose = 1, validation_data = (x_val, y_val))

In [None]:
x_test = vectorize_layer(np.array(te_d['review']))
model.evaluate(x_test, y_test, batch_size = 125)

In [None]:
msv = MeanSentenceVectorizer()

In [None]:
# preprocess Y
y_train = np.array([0 if sent == 'negative' else 1 for sent in tqdm(tr_d['sentiment'])])
y_test = np.array([0 if sent == 'negative' else 1 for sent in tqdm(te_d['sentiment'])])
tr_d_msv = msv.transform(tr_d['review'])
te_d_msv = msv.transform(te_d['review'])

In [None]:
X_train, X_val, y_train, y_val = train_test_split(tr_d_msv, y_train, test_size=0.2, random_state=1234)

In [None]:
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Embedding, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.regularizers import l2

np.random.seed(seed=12345)

model2 = Sequential()

model2.add(Dense(x_train.shape[1], activation="relu", input_shape=(x_train.shape[1],),
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(x_train.shape[1], activation="relu",
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(300, activation="relu",
    kernel_regularizer=l2(1e-5),
    bias_regularizer=l2(1e-5),
    activity_regularizer=l2(1e-5)))
model2.add(Dense(1, activation="sigmoid"))

model2.compile(loss = binary_crossentropy, optimizer = Adam(learning_rate=0.01), metrics=['accuracy'])

model2.summary()

In [None]:
history = model2.fit(x_train, y_train, batch_size = 125, epochs = 50, verbose = 1, validation_data = (x_val, y_val))

In [None]:
X_test = te_d_msv
model2.evaluate(X_test, y_test, batch_size = 125)