In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import os

In [2]:
from torch import cuda
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = 'cuda' if cuda.is_available() else 'cpu'
cuda.is_available()

True

In [3]:
df_data = pd.read_csv('kaggle_movie_train.csv')

In [4]:
df_data.head(5)

Unnamed: 0,id,text,genre
0,0,"eady dead, maybe even wishing he was. INT. 2ND...",thriller
1,2,"t, summa cum laude and all. And I'm about to l...",comedy
2,3,"up Come, I have a surprise.... She takes him ...",drama
3,4,ded by the two detectives. INT. JEFF'S APARTME...,thriller
4,5,"nd dismounts, just as the other children reach...",drama


In [5]:
df_data.drop(columns= 'id', inplace= True)

In [6]:
df_data['text']

0        eady dead, maybe even wishing he was. INT. 2ND...
1        t, summa cum laude and all. And I'm about to l...
2         up Come, I have a surprise.... She takes him ...
3        ded by the two detectives. INT. JEFF'S APARTME...
4        nd dismounts, just as the other children reach...
                               ...                        
22574    n in the world to decide what I'm going to do ...
22575    shards. BOJO LAZ! Laz pushes Deke back through...
22576    OTTIE You've got a thing about Ernie's, haven'...
22577    ....with marked skill and dexterity . LANA wry...
22578    rd walks off down the hallway, leaving his pos...
Name: text, Length: 22579, dtype: object

In [7]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')


STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string

        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text


    return text
df_data['text'] = df_data['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bidle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
df_data['text']

0        eady dead, maybe even wishing was. int. 2nd fl...
1        t, summa cum laude all. i'm launch brand new m...
2        come, surprise.... takes hand leads hallway. s...
3        ded two detectives. int. jeff's apartment nigh...
4        nd dismounts, children reach him... throw arms...
                               ...                        
22574    n world decide i'm going that's me. think thin...
22575    shards. bojo laz! laz pushes deke back joint s...
22576    ottie got thing ernie's, you? judy well, all, ...
22577    ....with marked skill dexterity . lana wryly s...
22578    rd walks hallway, leaving post. end hallway se...
Name: text, Length: 22579, dtype: object

In [9]:
from sklearn import preprocessing 
label_encoder = preprocessing.LabelEncoder() 
  
df_data['genre']= label_encoder.fit_transform(df_data['genre']) 
  
df_data['genre'].unique() 

array([8, 2, 3, 0, 7, 5, 6, 4, 1])

In [10]:
df_data['genre'] = pd.get_dummies(df_data['genre'].values).values.tolist()
wordcounter = df_data['text'].apply(lambda x: x.count(' '))
print("Max number of words per plot: ", int(wordcounter.max()))
print("Avarage number of words per plot: ", int(wordcounter.mean()))

Max number of words per plot:  154
Avarage number of words per plot:  109


In [11]:
train_size = 0.2
val_size = 0.2
df_train = df_data.sample(frac=train_size,random_state=200)
df_test = df_data.drop(df_train.index)
df_test = df_test.sample(frac=val_size,random_state=200).reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
print(df_train)
print(df_test)

                                                   text  \
0     s. ichabod's dream young ichabod's kitchen nig...   
1     normal build head shaved. face, scalp, hands c...   
2     open. suddenly yuri's face appears window, cup...   
3     panting pappas! move car! mason move car, plea...   
4     tting dog. man want give biscuit? girl nods. m...   
...                                                 ...   
4511  mm mm... massive hand slaps mouth. crusted eye...   
4512  . basil exposition vanessa's one top agents. a...   
4513  get spend it, son. jabez eagerly really think ...   
4514  's house night power lines whack angerily door...   
4515  . i'm thrilled darcy consented... 142.05 scene...   

                            genre  
0     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
1     [0, 0, 0, 1, 0, 0, 0, 0, 0]  
2     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
3     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
4     [0, 0, 0, 1, 0, 0, 0, 0, 0]  
...                           ...  
4511  [0, 0, 0, 1, 0, 0, 0, 0, 0]  
451

In [12]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 250
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 10
EPOCHS = 5
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [13]:
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.genre
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [19]:
training_set = CustomDataset(df_train, tokenizer, MAX_LEN)
testing_set = CustomDataset(df_test, tokenizer, MAX_LEN)
print(training_set.data)

                                                   text  \
0     s. ichabod's dream young ichabod's kitchen nig...   
1     normal build head shaved. face, scalp, hands c...   
2     open. suddenly yuri's face appears window, cup...   
3     panting pappas! move car! mason move car, plea...   
4     tting dog. man want give biscuit? girl nods. m...   
...                                                 ...   
4511  mm mm... massive hand slaps mouth. crusted eye...   
4512  . basil exposition vanessa's one top agents. a...   
4513  get spend it, son. jabez eagerly really think ...   
4514  's house night power lines whack angerily door...   
4515  . i'm thrilled darcy consented... 142.05 scene...   

                            genre  
0     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
1     [0, 0, 0, 1, 0, 0, 0, 0, 0]  
2     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
3     [0, 0, 0, 0, 0, 0, 0, 0, 1]  
4     [0, 0, 0, 1, 0, 0, 0, 0, 0]  
...                           ...  
4511  [0, 0, 0, 1, 0, 0, 0, 0, 0]  
451

In [15]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [16]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 9)
    
    def forward(self, ids, mask, token_type_ids):
        _, output_1= self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids,return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device)

KeyboardInterrupt: 

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [None]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
def train(epoch):
    model.train()
    for it,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if it%100==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch: {epoch} , Loss:  {loss.item()}')

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def validation(epoch):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
            loss = loss_fn(outputs, targets)
            loss.backward()
    return fin_outputs, fin_targets

In [None]:
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Epoch: {epoch} Accuracy Score = {accuracy}")
    print(f"Epoch: {epoch} F1 Score (Micro) = {f1_score_micro}")
    print(f"Epoch: {epoch} F1 Score (Macro) = {f1_score_macro}")