In [None]:
!pip install -q transformers 

In [None]:
import torch
import torch.nn as nn 
import numpy as np
from torch.utils.data import DataLoader , Dataset
import pandas as pd 
from tqdm import tqdm 
from transformers import BertModel , BertTokenizer, AdamW ,  get_linear_schedule_with_warmup , set_seed
from pylab import rcParams
import seaborn as sns 
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split , KFold , StratifiedKFold
import random 
import numpy as np

In [None]:
def seed_all(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch. When CUDA is
    available, the CUDA generators are seeded as well and cuDNN is switched to
    deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed = 42
seed_all(seed)

In [None]:
class Config:
    """Hyper-parameters, file paths and the shared tokenizer for the notebook."""

    # --- optimisation ---
    NB_EPOCHS = 5
    LR = 3e-5
    EPS = 1e-8        # forwarded to AdamW as `eps`

    # --- batching / sequence length ---
    MAX_LEN = 110     # every tweet is padded or truncated to this many tokens
    N_SPLITS = 4
    TRAIN_BS = 60
    VALID_BS = 40

    # --- pretrained checkpoint ---
    MODEL_NAME = 'bert-base-cased'

    # --- files ---
    TRAIN_FILE = '../input/gvbclean/Mytrain.csv'
    TEST_FILE = '../input/gvbclean/Test.csv'
    SUB_FILE = '../input/genderbasedviolence/SampleSubmission.csv'

    # Derived from MODEL_NAME so the tokenizer and the encoder cannot drift
    # apart when the checkpoint name is changed.
    TOKENIZER = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Default (width, height) for every figure drawn below.
rcParams["figure.figsize"] = (12, 8)

In [None]:
# Load the labelled training set and preview its first rows.
df =  pd.read_csv(Config.TRAIN_FILE)
df.head()

In [None]:
# Class balance of the target column. The vector is passed as `x=` because
# newer seaborn releases no longer accept it positionally.
sns.countplot(x=df["type"]);

In [None]:
# Class names, in the order used for the integer label ids and for the
# prediction columns built later in the notebook.
labels_ord = ['Harmful_Traditional_practice','Physical_violence', 'economic_violence', 'emotional_violence','sexual_violence']

# Encode against the explicit category list so label ids always follow
# `labels_ord`, instead of relying on the order pandas would infer from
# whichever classes happen to be present in the file.
df['label'] = pd.Categorical(df['type'], categories=labels_ord).codes
# pd.Categorical gives code -1 to any value outside `categories`.
assert (df['label'] >= 0).all(), "unexpected value in df['type']"

# One-hot copy of the label: one column per class, even for a class that is
# absent from this file.
Y = to_categorical(df['label'], num_classes=len(labels_ord))

for i, class_name in enumerate(labels_ord):
    df[class_name] = Y[:, i]

In [None]:
# Check the integer `label` column and the one-hot class columns just added.
df.head(3)

### Measure the token length of each tweet to choose a maximum sequence length

In [None]:
# Reuse the tokenizer already loaded in Config instead of loading the same
# pretrained vocabulary a second time.
tokenizer = Config.TOKENIZER

In [None]:
# Encoded length of every tweet. `truncation=True` makes the 512-token cap
# explicit; `max_length` on its own only produces a tokenizer warning.
# `str()` mirrors GBVDataset, so a non-string cell cannot break the loop.
token_lens = [
    len(tokenizer.encode(str(txt), max_length=512, truncation=True))
    for txt in df["tweet"]
]

In [None]:
# `sns.distplot` is deprecated; a density-scaled histogram with a KDE overlay
# is its replacement.
sns.histplot(token_lens, kde=True, stat="density");

In [None]:
class GBVDataset(Dataset):
    """Torch dataset that tokenizes tweets on the fly for BERT.

    Args:
        tweets: indexable collection of raw tweet texts.
        targets: indexable collection of integer class ids aligned with
            `tweets`; may be None when `is_test` is True.
        is_test: when True, items carry no 'targets' entry.

    Raises:
        ValueError: if `targets` is None while `is_test` is False.
    """

    def __init__(self, tweets, targets=None, is_test=False):
        if not is_test and targets is None:
            # Fail at construction time rather than on the first __getitem__.
            raise ValueError("targets are required unless is_test=True")
        self.tweets = tweets
        self.targets = targets
        self.is_test = is_test
        self.tokenizer = Config.TOKENIZER
        self.max_len = Config.MAX_LEN

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return the encoded tweet at `idx` as a dict of long tensors.

        Keys: 'ids', 'mask', 'token_type', plus 'targets' unless `is_test`.
        """
        # Collapse every run of whitespace (newlines, tabs, ...) to one space.
        tweet = ' '.join(str(self.tweets[idx]).split())

        # The encoding is a plain local: a `global` here would leak per-item
        # state into the notebook namespace. `padding="max_length"` already
        # pads to `max_length`, so the deprecated `pad_to_max_length` flag is
        # not passed.
        encoded = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
        )

        item = {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
            'token_type': torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }
        if not self.is_test:
            item['targets'] = torch.tensor(self.targets[idx], dtype=torch.long)
        return item

In [None]:
def GBVDataloader(df, batch_size, is_test=False, shuffle=False):
    """Wrap a dataframe in a DataLoader over GBVDataset.

    Args:
        df: dataframe with a 'tweet' column and, unless `is_test`, a 'label'
            column.
        batch_size: number of samples per batch.
        is_test: build an unlabelled dataset; the 'label' column is not read,
            so frames without it are accepted.
        shuffle: forwarded to DataLoader. Defaults to False, so existing
            callers keep iterating in row order.

    Returns:
        A torch DataLoader yielding the dicts produced by GBVDataset.
    """
    targets = None if is_test else df["label"].values
    dataset = GBVDataset(df["tweet"].values, targets, is_test)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Small loader (4 tweets per batch) used only to inspect one batch below.
dataloader = GBVDataloader(df, batch_size=4)

In [None]:
# Pull the first batch to sanity-check what the dataset produces.
data = next(iter(dataloader))

In [None]:
# Show the batch: a dict with 'ids', 'mask', 'token_type' and 'targets'.
data

In [1]:
"""
class GBVClassifier(nn.Module):
    def __init__(self , n_classes):
        super(GBVClassifier,self).__init__()
        self.bert = BertModel.from_pretrained(Config.MODEL_NAME , return_dict=False)
        self.drop1=  nn.Dropout(p=0.7)
        self.drop2=  nn.Dropout(p=0.8)
        self.out = nn.Linear(768,n_classes)
        

    def forward(self , input_ids, attention_mask ,token_type_ids):
        
        _,sortie = self.bert(input_ids,attention_mask,token_type_ids)

        output1 = self.drop1(sortie)
        output1 = self.out(output1)
        
        output2 = self.drop2(sortie)
        output2 = self.out(output2)
        
        output= output1.add(output2)/2
     
        
        return output 
"""

In [2]:
class GBVClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head.

    Args:
        n_classes: number of output logits.
    """

    def __init__(self, n_classes):
        super().__init__()
        # return_dict=False makes the encoder return a plain tuple.
        self.bert = BertModel.from_pretrained(Config.MODEL_NAME, return_dict=False)
        self.drop = nn.Dropout(p=0.8)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so pointing Config.MODEL_NAME at a BERT of another width works.
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class logits, one row per input sequence."""
        # Keyword arguments do not depend on BertModel's positional order.
        _, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        return self.out(self.drop(pooled))

In [None]:
# One output logit per class in `labels_ord`; parameters are moved to `device`.
model = GBVClassifier(n_classes=len(labels_ord))
model.to(device)

In [None]:
def loss_fn(outputs, labels):
    """Cross-entropy between the logits `outputs` and integer class `labels`."""
    criterion = nn.CrossEntropyLoss()
    return criterion(outputs, labels)

def yield_optimizer(model):
    """Build an AdamW optimizer with two parameter groups.

    Parameters whose name contains "bias", "LayerNorm.bias" or
    "LayerNorm.weight" get no weight decay; all others use 0.001. The learning
    rate and epsilon come from Config.
    """
    excluded = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def skips_decay(param_name):
        return any(tag in param_name for tag in excluded)

    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        (undecayed if skips_decay(name) else decayed).append(param)

    groups = [
        {"params": decayed, "weight_decay": 0.001},
        {"params": undecayed, "weight_decay": 0.0},
    ]
    return AdamW(groups, lr=Config.LR, eps=Config.EPS)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Run one optimisation pass over `data_loader`.

    For each batch: forward pass, loss, backward pass, optimizer step,
    scheduler step, then gradients are cleared.

    Returns:
        (accuracy, mean_loss): correct predictions divided by `n_examples`,
        and the mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    n_correct = 0

    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        input_ids = batch['ids'].to(device)
        token_type_ids = batch['token_type'].to(device)
        attention_mask = batch['mask'].to(device)
        targets = batch['targets'].to(device)

        logits = model(input_ids, attention_mask, token_type_ids)
        predicted = torch.max(logits, dim=1).indices

        loss = loss_fn(logits, targets)
        n_correct += (predicted == targets).sum()
        batch_losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return n_correct.double() / n_examples, np.mean(batch_losses)

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score `model` on `data_loader` without computing gradients.

    Returns:
        (accuracy, mean_loss): correct predictions divided by `n_examples`,
        and the mean of the per-batch losses.
    """
    model.eval()
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
            input_ids = batch['ids'].to(device)
            token_type_ids = batch['token_type'].to(device)
            attention_mask = batch['mask'].to(device)
            targets = batch['targets'].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            predicted = torch.max(logits, dim=1).indices

            batch_loss = loss_fn(logits, targets)
            n_correct += (predicted == targets).sum()
            batch_losses.append(batch_loss.item())

    return n_correct.double() / n_examples, np.mean(batch_losses)


In [None]:
def train(model, df, epochs):
    """Fine-tune `model` on stratified folds of `df`, checkpointing the best.

    In every epoch each of the Config.N_SPLITS stratified folds is used in
    turn: the model is trained on the fold's training rows and then scored on
    its held-out rows. Whenever the validation accuracy beats the best value
    seen so far, the weights are written to 'best_model.bin'.

    Args:
        model: the classifier to optimise (updated in place).
        df: dataframe with 'tweet' and 'label' columns.
        epochs: number of passes over the full set of folds.

    NOTE(review): one model instance is trained across all folds, so each
    fold's validation rows are training rows of the other folds. The printed
    validation scores are therefore optimistic.
    """
    # `random_state` only has an effect with shuffle=True, and recent
    # scikit-learn raises ValueError when a seed is combined with
    # shuffle=False. The fixed seed gives the same folds on every run.
    kf = StratifiedKFold(n_splits=Config.N_SPLITS, shuffle=True, random_state=seed)

    # The folds do not change between epochs, so build their loaders once.
    folds = []
    for train_idx, valid_idx in kf.split(df, df["label"]):
        folds.append((
            GBVDataloader(df.iloc[train_idx], Config.TRAIN_BS),
            GBVDataloader(df.iloc[valid_idx], Config.VALID_BS),
            len(train_idx),
            len(valid_idx),
        ))

    # One optimizer and one linear-decay schedule for the whole run. The
    # scheduler is stepped once per training batch, so its horizon is the
    # total number of training batches over all folds and epochs. len(loader)
    # is already a batch count; dividing it by the batch size again would
    # drive the learning rate to zero after a handful of steps.
    nb_train_steps = epochs * sum(len(train_loader) for train_loader, _, _, _ in folds)
    optimizer = yield_optimizer(model)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=nb_train_steps,
    )

    best_accuracy = 0

    for epoch in range(epochs):

        print(f'Epoch {epoch + 1}')

        for train_loader, valid_loader, n_train, n_valid in folds:

            train_acc, train_loss = train_epoch(
                model, train_loader, loss_fn, optimizer, device, scheduler, n_train
            )
            print(f"Train accuracy {train_acc} ,Train Loss {train_loss}")

            val_acc, val_loss = eval_model(model, valid_loader, loss_fn, device, n_valid)
            print(f"Validation accuracy {val_acc} , Validation loss {val_loss}")

            if val_acc > best_accuracy:
                torch.save(model.state_dict(), 'best_model.bin')
                best_accuracy = val_acc
                print(f"Best accuracy {best_accuracy}")

In [None]:
# Fine-tune for 4 epochs; the best checkpoint is written to 'best_model.bin'.
train(model, df, epochs=4)

In [None]:
# Load the test set that the submission is built from.
test = pd.read_csv(Config.TEST_FILE)
test

In [None]:
# Keep only the text column; it is the only one the test dataloader reads.
test = test[['tweet']]
test

In [None]:
# Rebuild the architecture and restore the best checkpoint saved by train().
# `map_location` lets a checkpoint saved on a GPU load on a CPU-only machine.
model = GBVClassifier(len(labels_ord))
model.load_state_dict(torch.load('./best_model.bin', map_location=device))
model = model.to(device)

In [None]:
def TweetTestDataloader(df, batch_size, is_test=True):
    """DataLoader over the 'tweet' column of `df`, with no targets, in row order."""
    tweets = df["tweet"].values
    return DataLoader(GBVDataset(tweets, None, is_test), batch_size, shuffle=False)

In [None]:
# One tweet per batch, in the row order of the test file.
testdataloader = TweetTestDataloader(test, batch_size=1)

In [None]:
# Filled by get_predictions(): one output tensor per test batch.
proba= []

In [None]:
def get_predictions(model, df_test):
    """Score every tweet of `df_test` and append class probabilities to `proba`.

    The loader uses batch size 1, so each tensor appended to the module-level
    `proba` list has shape (1, n_classes). Nothing is returned; results are
    read from `proba`. Calling the function again appends to the entries that
    are already there.

    Args:
        model: trained classifier, already on `device`.
        df_test: dataframe with a 'tweet' column.
    """
    model = model.eval()
    data_loader = TweetTestDataloader(df_test, 1)

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["ids"].to(device)
            attention_mask = d["mask"].to(device)
            token_type_ids = d["token_type"].to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)

            # Store probabilities, not raw logits: the per-class scaling done
            # later multiplies these values, which only favours a class when
            # they are non-negative (a negative logit would be pushed down).
            # `.cpu()` releases the GPU memory held by each result.
            proba.append(torch.softmax(outputs, dim=1).cpu())
      
   

In [None]:
# Populate `proba` with the model output for every test tweet.
get_predictions(model=model, df_test=test)

In [None]:
# Will hold one flat list of per-class scores for each entry of `proba`.
tab=[]

In [None]:
# Flatten each output tensor into a plain Python list of per-class scores.
tab.extend(scores.flatten().tolist() for scores in proba)

In [None]:
# One row per test tweet, one score column per class name in `labels_ord`.
predict_pd = pd.DataFrame(tab, columns=labels_ord)

In [None]:
 predict_pd

In [None]:
#predict_pd["economic_violence"] = predict_pd["economic_violence"]*3.999 
#predict_pd["emotional_violence"] = predict_pd["emotional_violence"]*3.999 
#predict_pd["Harmful_Traditional_practice"] = predict_pd["Harmful_Traditional_practice"]*3.75 

In [None]:
# Up-weight the 'sexual_violence' score before the arg-max below.
# NOTE(review): 2.35 is an unexplained hand-tuned factor, and re-running this
# cell compounds it because the column is scaled in place.
predict_pd["sexual_violence"] *= 2.35

In [None]:
# Up-weight the 'Physical_violence' score before the arg-max below.
# NOTE(review): 2.75 is an unexplained hand-tuned factor, and re-running this
# cell compounds it because the column is scaled in place.
predict_pd["Physical_violence"] *= 2.75

In [None]:
# For every row, the name of the class column holding the highest score.
get_prediction = predict_pd.idxmax(axis="columns")

In [None]:
#myprediction=[]

In [None]:
#for i in sortie:
 #   myprediction.append(labels_ord[i.item()])

In [None]:
# Read the submission template from the path declared in Config rather than
# repeating the literal here.
sub = pd.read_csv(Config.SUB_FILE)

In [None]:
# Preview the submission template before filling it in.
sub.head()

In [None]:
# Assign by position and fail loudly on a length mismatch; a Series assignment
# would silently align on the index and fill the gaps with NaN.
# NOTE(review): assumes the template rows are in the same order as the test
# file - confirm.
assert len(get_prediction) == len(sub), "prediction count does not match the submission template"
sub["type"] = get_prediction.to_numpy()

In [None]:
# Final look at the filled-in submission frame.
sub

In [None]:
# Write the submission file without the dataframe index column.
sub.to_csv("MA-FAAA10.csv",index=False)