In [1]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel

In [4]:
data_path = '/content/drive/MyDrive/classification/test.ft.txt'

# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which differs between machines.
# NOTE(review): assumes the file is UTF-8 encoded - confirm against the data source.
with open(data_path, encoding='utf-8') as f:
  # One list entry per line; each entry keeps its trailing newline.
  data = f.readlines()

In [5]:
# Peek at the first ten raw lines to see the on-disk format
# (each line starts with a `__label__N` prefix followed by the review text).
data[:10]

['__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 "__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too 

In [6]:
# Drop the first 10 characters of the first line (the `__label__N` prefix)
# to check what is left as review text.
data[0][10:]

' Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n'

In [7]:
# Work on the first 10,000 lines only. Every line is split by position:
# the character at index 9 is the label digit that follows `__label__`,
# and everything from index 10 onward is kept as the review text.
sentiments = [line[9] for line in data[:10000]]
reviews = [line[10:] for line in data[:10000]]

In [8]:
# Sanity check: both lists should contain the same number of entries.
len(sentiments), len(reviews)

(10000, 10000)

In [9]:
# Pair the two parallel lists up as named columns, then build the frame from them.
data_dic = dict(reviews=reviews, sentiment=sentiments)
df = pd.DataFrame(data=data_dic)

In [10]:
# Show the first rows to confirm the two columns were populated as expected.
df.head()

Unnamed: 0,reviews,sentiment
0,Great CD: My lovely Pat has one of the GREAT ...,2
1,One of the best game music soundtracks - for ...,2
2,Batteries died within a year ...: I bought th...,1
3,"works fine, but Maha Energy is better: Check ...",2
4,Great for the non-audiophile: Reviewed quite ...,2


In [11]:
# Count missing values per column.
df.isnull().sum()

reviews      0
sentiment    0
dtype: int64

In [12]:
# Translate the raw label characters into zero-based class indices.
mapp = {'1': 0, '2': 1}

In [13]:
# Derive the numeric labels from the raw `sentiments` list rather than from the
# column being overwritten. Mapping the column onto itself is not idempotent:
# on a second run the values are already 0/1, none of them match the '1'/'2'
# keys of `mapp`, and the whole column silently becomes NaN.
df['sentiment'] = pd.Series(sentiments, index=df.index).map(mapp)
df.head()

Unnamed: 0,reviews,sentiment
0,Great CD: My lovely Pat has one of the GREAT ...,1
1,One of the best game music soundtracks - for ...,1
2,Batteries died within a year ...: I bought th...,0
3,"works fine, but Maha Energy is better: Check ...",1
4,Great for the non-audiophile: Reviewed quite ...,1


In [14]:
# Check how many rows fall into each mapped class.
df['sentiment'].value_counts()

1    5125
0    4875
Name: sentiment, dtype: int64

In [15]:
# Load the tokenizer for the `distilbert-base-uncased` checkpoint
# (its files are downloaded the first time this runs).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [18]:
# Pick the review stored under index label 10 as a sample for trying out the tokenizer.
sample = df['reviews'][10]
sample

' Great book for travelling Europe: I currently live in Europe, and this is the book I recommend for my visitors. It covers many countries, colour pictures, and is a nice starter for before you go, and once you are there.\n'

In [19]:
# Tokenize the sample by calling the tokenizer directly; `encode_plus` is
# documented as deprecated in favour of `__call__`, which returns the same
# encoding for a single string.
# The text is padded/truncated to exactly 256 tokens and returned as PyTorch tensors.
encodings = tokenizer(
    sample,
    add_special_tokens = True,
    max_length = 256,
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [None]:
# Display the encoding produced for the sample review.
encodings

In [30]:
class Amazon_reviews(torch.utils.data.Dataset):
  """Dataset that tokenizes one review per item for sequence classification.

  Args:
    df: DataFrame with a 'reviews' column (text) and a 'sentiment' column
      (value convertible to int, used as the class index).
    maxlen: Length every review is padded/truncated to. Defaults to 256.
    tokenizer: Tokenizer to use. When omitted, the 'distilbert-base-uncased'
      tokenizer is loaded. Passing one in lets several datasets share a single
      tokenizer instead of each loading its own copy.

  Each item is a dict with:
    'input_ids': 1-D tensor of token ids with `maxlen` entries.
    'attention_mask': 1-D tensor with `maxlen` entries.
    'labels': scalar long tensor holding the class index.
  """

  def __init__(self, df, maxlen=256, tokenizer=None):
    self.df = df
    self.maxlen = maxlen
    if tokenizer is None:
      tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    self.tokenizer = tokenizer

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    # split()/join collapses every run of whitespace (including the trailing
    # newline) into single spaces before tokenizing.
    review = ' '.join(self.df['reviews'].iloc[index].split())
    sentiment = int(self.df['sentiment'].iloc[index])

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which produces the same encoding for a single string.
    encoding = self.tokenizer(
        review,
        add_special_tokens = True,
        max_length = self.maxlen,
        padding = 'max_length',
        truncation = True,
        return_attention_mask = True,
        return_tensors = 'pt'
    )

    # return_tensors='pt' adds a leading batch dimension of 1; flatten it away
    # so the DataLoader can stack items into (batch, maxlen).
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'labels': torch.tensor(sentiment, dtype=torch.long)
    }

In [31]:
# Hold out 30% of the rows, with a fixed seed so the split is reproducible.
# Note: `test_df` is the split that is later wrapped as the validation dataset
# and evaluated after every training epoch.
train_df, test_df = train_test_split(df,test_size = 0.3, random_state = 42)
len(train_df) , len(test_df)

(7000, 3000)

In [32]:
# Wrap each split in the Amazon_reviews dataset, which tokenizes a row when it is accessed.
train_dataset = Amazon_reviews(train_df)
valid_dataset = Amazon_reviews(test_df)

In [33]:
# Training batches are reshuffled on every pass over the data.
train_data_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Validation batches keep the dataset order.
valid_data_loader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False)

In [38]:
# Pull a single batch from the training loader and show the shape of its labels.
batch = next(iter(train_data_loader), None)
if batch is not None:
  print(batch['labels'].shape)

torch.Size([32])


In [39]:
class SentimentClassifier(nn.Module):
  """DistilBERT encoder followed by a two-layer classification head.

  The first-token vector of each of the last four hidden layers is
  concatenated (4 x 768 = 3072 features) and passed through
  dropout -> Linear(3072, 512) -> ReLU -> dropout -> Linear(512, 2).

  `forward` returns raw, unnormalized logits with one row per input sequence
  and one column per class, which is what `nn.CrossEntropyLoss` expects.
  """

  def __init__(self):
    super(SentimentClassifier, self).__init__()
    self.distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
    self.drop0 = nn.Dropout(0.25)
    self.linear1 = nn.Linear(3072, 512)
    self.relu1 = nn.ReLU()
    self.drop1 = nn.Dropout(0.25)
    self.linear2 = nn.Linear(512, 2)

  def forward(self, input_ids, attention_mask):
    """Return class logits for a batch of tokenized sequences.

    Args:
      input_ids: token id tensor, one row per sequence.
      attention_mask: mask tensor with the same layout as `input_ids`.
    """
    outputs = self.distilbert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        output_hidden_states=True,
        return_dict=True,
    )
    # hidden_states holds the embedding output followed by every transformer
    # layer; take the last four layers and keep position 0 (the first token)
    # of each. Indexing the last four *positions* of the final layer instead
    # would mostly read padding, because inputs are padded to a fixed length.
    hidden_states = outputs.hidden_states
    pooled_output = torch.cat([layer[:, 0] for layer in hidden_states[-4:]], dim=-1)
    x = self.drop0(pooled_output)
    x = self.relu1(self.linear1(x))
    x = self.drop1(x)
    # No activation on the output layer: clamping logits with ReLU removes
    # negative scores and can zero the gradient for both classes at once.
    return self.linear2(x)

In [40]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SentimentClassifier()
model.to(device)

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


SentimentClassifier(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Lin

In [41]:
criterion = nn.CrossEntropyLoss().to(device)
# Fine-tuning a pretrained transformer needs a small step size: 1e-3 is large
# enough to wipe out the pretrained weights within a few updates. Values in
# the 2e-5 to 5e-5 range are the usual choice for BERT-style models.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
epochs = 5

In [43]:
for epoch in range(epochs):
  # TRAIN
  model.train()
  # The description is the same for the whole epoch, so set it once up front
  # rather than on every batch.
  train_loop = tqdm(train_data_loader, desc=f"Training Epoch: {epoch}")
  for batch in train_loop:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['labels'].to(device)
    output = model(input_ids, attention_mask)
    loss = criterion(output, labels)
    loss.backward()
    # In-place variant; the un-suffixed `clip_grad_norm` is deprecated.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    train_loop.set_postfix(loss=loss.item())

  # VALIDATION
  model.eval()
  valid_loop = tqdm(valid_data_loader, desc=f"Validation Epoch: {epoch}")
  total_loss, total_correct, total_seen = 0.0, 0, 0
  # No gradients are needed for evaluation; disabling them avoids building
  # the autograd graph and cuts memory use.
  with torch.no_grad():
    for batch in valid_loop:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['labels'].to(device)
      output = model(input_ids, attention_mask)
      loss = criterion(output, labels)

      # Track running totals so the progress bar reports the mean loss and
      # accuracy over the validation set seen so far, not just the last batch.
      batch_size = labels.size(0)
      total_loss += loss.item() * batch_size
      total_correct += (output.argmax(dim=1) == labels).sum().item()
      total_seen += batch_size
      valid_loop.set_postfix(loss=total_loss / total_seen, acc=total_correct / total_seen)

  0%|          | 0/219 [00:00<?, ?it/s]

  del sys.path[0]


  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

  0%|          | 0/219 [00:00<?, ?it/s]

  0%|          | 0/94 [00:00<?, ?it/s]

In [58]:
test_sample = test_df['reviews'].iloc[99]
original_label = test_df['sentiment'].iloc[99]

print(test_sample)
print(original_label)

# Call the tokenizer directly; `encode_plus` is deprecated in favour of `__call__`.
encodings = tokenizer(
    test_sample,
    add_special_tokens=True,
    max_length=256,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# Make sure dropout is disabled regardless of which cell ran last.
model.eval()
# Send the inputs to wherever the model already lives instead of moving the
# model to the CPU: relocating it here would break a later re-run of the
# training cell, which places its batches on `device`.
model_device = next(model.parameters()).device
with torch.no_grad():
  preds = model(encodings['input_ids'].to(model_device), encodings['attention_mask'].to(model_device))
  # Pick the highest-scoring class along the class dimension.
  preds = torch.argmax(preds, dim=1)
  output = preds.item()
  # Print the class index in the same 0/1 encoding as `original_label`, so the
  # two printed numbers are directly comparable.
  print(output)

 Now this is fun....: Besides thrills,laughter,easy readings,great conversations and a bit of mystery and a lot of suspense,you will get a fine dose of humour.McBain at his funny best.

1
1
