In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns 
import sklearn 
import nltk 
import re 
import string
import unicodedata
import os 
import warnings
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize , sent_tokenize 
from bs4 import BeautifulSoup 
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer 

warnings.filterwarnings('ignore')


### read data 
path  = 'E:\projects\sentiment analysis\IMDB_data\IMDB Dataset.csv'
data  = pd.read_csv(path)

In [2]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
### prepare preprocessing functions 

#now time for preprocessing 

#let's do some steps 

#1. remove HTML 
#2. remove squer prackets 
#3. remove special characters 
#4. remove stopwords 
#5. stemming 

# finally collect all functions in one preprocessing function 


def remove_html(text):
    soup = BeautifulSoup(text , 'html.parser')
    return soup.get_text()

def remove_squer_prackets(text):
    return re.sub('\[[^]]*\]','',text)

def remove_special_char(text):
    return re.sub('[^a-zA-Z0-9\s]','' , text)

def stemming(text):
    stem = nltk.porter.PorterStemmer()
    text = ' '.join([stem.stem(word) for word in text.split()])
    return text 

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtering = [word for word in tokens if word.lower() not in stopwords]
    return ' '.join(filtering)

# collect
def preprocessing(text):
    docs = remove_html(text)
    docs = remove_squer_prackets(docs)
    docs = remove_special_char(docs)
    docs = stemming(docs)
    docs = remove_stopwords(docs)
    return docs


In [4]:
tokenizer = ToktokTokenizer()
stopwords = nltk.corpus.stopwords.words('english')


processed_data =  data['review'].apply(preprocessing)


In [5]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [6]:
#load Tokenizer and Model 
from transformers import AutoTokenizer , AutoModelForSequenceClassification
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
print('Loading BERT Model...')
Model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 2,   
    output_attentions = False,
    output_hidden_states = False,
)
                                        

Loading BERT tokenizer...
Loading BERT Model...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [7]:
#get max length  of sentences 

max_len = 0 
for sent in processed_data:
    tok_sent = tokenizer.encode(sent , add_special_tokens =True)
    get_max = max(max_len , len(tok_sent) )
    max_len = get_max 
    
print(f'max length: {max_len}')

Token indices sequence length is longer than the specified maximum sequence length for this model (587 > 512). Running this sequence through the model will result in indexing errors


max length: 2072


In [19]:
#now let's do our preprocessing to get it into model 
# we need to 
# 1.input_ids 
# 2.attention_mask 
# 3.Labels -----> it's the result
import torch 
from torch.utils.data import Dataset , DataLoader

data['sentiment'] = data['sentiment'].replace(['positive','negative'], [1,0])

labels=data['sentiment'].tolist()
input_ids = []
attention_mask=[]


for sent in processed_data:
    token_text = tokenizer.encode_plus(sent , max_length=512,
                                       add_special_tokens =True ,
                                       pad_to_max_length=True , 
                                       return_tensors = 'pt',
                                       return_attention_mask=True)
    input_ids.append(token_text['input_ids'])
    attention_mask.append(token_text['attention_mask'])

input_ids = torch.cat(input_ids ,dim=0)
attention_mask= torch.cat(attention_mask ,dim=0)
labels= torch.tensor(labels)


In [20]:
from torch.utils.data import TensorDataset , random_split


dataset =  TensorDataset(input_ids , attention_mask ,labels)

train_size = int(0.9*len(dataset)) 
val_size   = len(dataset) - train_size 

train_data ,val_data = random_split(dataset , [train_size , val_size])


In [21]:
len(train_data) , len(val_data)

(45000, 5000)

In [22]:
# create dataloader for train and val data 

from torch.utils.data import DataLoader , RandomSampler ,SequentialSampler 
batch_size = 32
train_loader = DataLoader(train_data , sampler = RandomSampler(train_data) , batch_size=batch_size)


val_loader   = DataLoader(val_data , sampler = SequentialSampler(val_data) , batch_size = batch_size)

In [23]:
## preparing for train 


from transformers import AdamW , get_linear_schedule_with_warmup


epochs = 4 
optimizer= AdamW(Model.parameters() ,lr=2e-5 , eps=1e-8 )
total_steps = epochs * len(train_loader)


scheduler = get_linear_schedule_with_warmup(optimizer , 
                                            num_warmup_steps=0,
                                           num_training_steps=total_steps)


In [24]:
# chck if cdua is available
import torch
if torch.cuda.is_available():
    device =torch.device('cuda')
    print(f'there are {torch.cuda.device_count()} GPU available')
    print(f'we will use{torch.cuda.get_device_name(0)}')
                         
else:
    print('there are no GPU is available, just cpu')


there are no GPU is available, just cpu


In [25]:
# helper functions 

# first to calculate accuracy in validation loop 

def flat_accuracy(preds ,labels):
    preds_arg = np.argmax(preds , axis=1).flatten()
    labels_flat = labels.flatten()
    return sum(preds_arg == labels_flat) / len(labels_flat)
    


# second to calculate time responding 
import time 
import datetime


def formate_time(giv_time):
    round_time = int(round(giv_time))
    return str(datetime.timedelta(seconds=round_time))


In [None]:
# let's  train the model 
import numpy as np 
import random 


device = 'cuda' if torch.cuda.is_available() else 'cpu'


epochs = 1
seed = 42 

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

Model.to(device)

training_state= []
total_time = time.time()

total_t0 =time.time()
for epoch in range(epochs):
    print(f'========epoch {epoch+1}/{epochs}===========')
    print(f'trianing loop....')
    
    t0 = time.time()
    total_train_loss = 0 
    Model.train()
    for step, train in enumerate(train_loader):
        
        if step%40 ==0 and not step ==0:
            timing = format_time(time.time()- t0)
            print(f'batch:\t{step} of:\t{len(train_loader)}\t:{timing}')
            
            
            
            
            
        input_ids = train[0].to(device)
        attention_mask= train[1].to(device)
        labels= train[2].to(device)
        
        Model.zero_grad()
        output = Model(input_ids,
                      attention_mask=attention_mask ,
                      labels=labels ,
                      return_dict=True)
        loss   = output.loss
        logits = output.logits
        total_train_loss += loss.item()
        loss.back()
        
        torch.nn.utils.clip_grad_norm(Model.parameters() , 1.0)
        optimizer.step()
        scheduler.step()
    average_train_loss = total_train_loss / len(train_loader)
    total_train_time   = format_time(time.time()-t0)
    
    print(f'\n')
    print(f'average train loss: {average_train_loss}\t total train time: {total_train_time}')
    
    ###############################
    # validation loop 
    ###############################
    
    print(f'\nvalidation loop....')
    
    total_val_accuracy=0
    total_val_loss=0
    t0=time.time()
    Model.eval()
    
    for val in val_loader:
        val_ids  = val[0].to(device)
        val_mask = val[1].to(device)
        val_label= val[2].to(device)
        
        with torch.no_grad():
            val_out = Model(val_ids ,
                           attention_mask=val_mask,
                           labels = val_labels ,
                           return_dict=True)
        val_loss  = val_out.loss  
        val_logits= val_out.logits
        
        total_val_loss+= loss.item()
        val_logits = val_logits.detach().cpu().numpy()
        val_labels = val_label.to('cpu').numpy()
        
        total_val_accuracy += flat_accuracy(val_logits , val_labels)
        
    average_val_accuracy = total_val_accuracy / len(val_loader)
    total_val_time = format_time(time.time() - t0)
    
    print("\n")
    print(f'average validation accuracy:{average_val_accuracy}\t total validation time:{total_val_time}')
    
    training_state.append({'train loss': average_train_loss ,
                          'train time': total_train_time ,
                          'validation accuracy':total_val_accuracy ,
                          'validation time':total_val_time})

    
print("\n")
print(f'trianing complet!')
print(f'total training took: { foramt_time(time.time()- total_t0)} . hh/mm/ss')

trianing loop....
