In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir("drive/MyDrive/5153 Final Project")

In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup
import seaborn as sns
import numpy as np

## 1. Data Loading

In [4]:
#train = pd.read_csv('/kaggle/input/review/train.csv')
#test = pd.read_csv('/kaggle/input/review/test.csv')
df = pd.read_csv('Netflix_preprocessed_data_updated.csv',index_col=0)
df.rename(columns={col: col.replace(' ', '_') for col in df.columns}, inplace=True)

In [5]:
df.shape

(13298, 2091)

In [6]:
df.columns

Index(['series_movie', 'Director', 'Writer', 'Actors', 'View_rating',
       'IMDb_score', 'Awards_received', 'Awards_nominated_for', 'Boxoffice',
       'Summary',
       ...
       'Genre_History', 'Genre_Biography', 'Genre_Crime', 'Genre_Reality-tv',
       'Genre_Family', 'Genre_Short', 'Genre_Drama', 'Genre_Comedy',
       'Genre_Sci-fi', 'Genre_Game-show'],
      dtype='object', length=2091)

In [7]:
threshold = 7
df['Quality'] = (df['IMDb_score'] >= threshold ).astype(int)
data = df[['Summary','Quality']]
data

Unnamed: 0,Summary,Quality
0,A med student with a supernatural gift tries t...,1
1,"When nerdy Johanna moves to London, things get...",0
2,"Trapped in a frozen car during a blizzard, a p...",0
3,"Upon moving into a new place, a 20-something r...",0
4,Inspired by her moms rebellious past and a con...,0
...,...,...
15422,In an idyllic port town on Australias west coa...,1
15428,"In his third show, Daniël Arends argues that g...",1
15432,Madagascar goes wild with holiday spirit in th...,0
15433,Join your DreamWorks friends for these four ho...,0


##  BERT without Fine-Tune

#### Load pre-trained model

In [8]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [9]:
import torch
from transformers import AutoTokenizer, AutoModel, Trainer
from torch import cuda
# Load tokenizer and model, create trainer
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at siebert/sentiment-roberta-large-english were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Tokenization-Padding-Masking

In [10]:
tokenized = data['Summary'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
#test_true = test['Text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(13298, 67)

### Batch Inference

To save memory, 10 reviews are fed into the BERT model each time.

In [11]:
feature_list = []
with torch.no_grad():
    for batch_idx in range(0,padded.shape[0],10):
        #BERT check 10 sample each time.
        input_ids = torch.tensor(padded[batch_idx:batch_idx+10]) 
        used_attention_mask = torch.tensor(attention_mask[batch_idx:batch_idx+10])
        input_ids = input_ids.to(device)
        used_attention_mask = used_attention_mask.to(device)
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        #Get the embeddings for the [CLS] tag (position is 0)
        features = last_hidden_states[0][:,0,:].cpu().numpy()
        feature_list.append(features)


It should be noted that although the `[CLS]` acts as an "aggregate representation" for classification tasks, this is not the best choice for a high quality sentence embedding vector. [According to](https://github.com/google-research/bert/issues/164) BERT author Jacob Devlin: "*I'm not sure what these vectors are, since BERT does not generate meaningful sentence vectors. It seems that this is is doing average pooling over the word tokens to get a sentence vector, but we never suggested that this will generate meaningful sentence representations*."

(However, the [CLS] token does become meaningful if the model has been fine-tuned, where the last hidden layer of this token is used as the "sentence vector" for sequence classification.)

feature_list = []
with torch.no_grad():
    for batch_idx in range(0,padded.shape[0],10):
        #BERT check 10 sample each time.
        input_ids = torch.tensor(padded[batch_idx:batch_idx+10]) 
        used_attention_mask = torch.tensor(attention_mask[batch_idx:batch_idx+10])
        input_ids = input_ids.to(device)
        used_attention_mask = used_attention_mask.to(device)
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        #Get the embeddings for the [CLS] tag (position is 0)
        features = last_hidden_states[0][:,0,:].cpu().numpy()
        feature_list.append(features)

![picture](https://github.com/rz0718/colab_imgs/blob/main/imgs/bert_output_sentence.png?raw=true)

In [12]:
# preprare features
features = np.vstack(feature_list)
features.shape

(13298, 1024)

In [13]:
features

array([[-0.24156815,  0.58229893, -0.0614216 , ..., -0.16272336,
        -1.7242047 ,  1.5694544 ],
       [ 0.05462715,  0.56971216, -0.23192409, ...,  0.73967713,
        -1.237085  ,  0.8269837 ],
       [-0.00517806, -0.15224561, -0.5683238 , ...,  0.12241815,
        -0.16752039,  0.1711729 ],
       ...,
       [-0.04387463, -0.13038342, -0.5586646 , ...,  0.14641233,
         0.13954699, -0.3478933 ],
       [-0.05249314, -0.13384838, -0.55634886, ...,  0.12108692,
         0.12491636, -0.31232637],
       [-0.0508308 , -0.13649632, -0.5565323 , ...,  0.1084983 ,
         0.1052921 , -0.26366627]], dtype=float32)

In [14]:
save_features = pd.DataFrame(features)
save_features.to_csv('summary_features.csv')