In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
# Load the labelled training data and preview the first rows.
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
# Load the unlabelled test data (no `target` column) and preview the first rows.
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [4]:
# Class balance of the training labels.
df_train.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [5]:
# Keep only the model input and the label. `.copy()` makes this an independent
# frame: a `set_type` column is added and written through `.loc` later on, and
# doing that on a slice of the original frame can raise SettingWithCopyWarning.
df_train = df_train[['text','target']].copy()
df_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
from sklearn.model_selection import train_test_split
# Split the row *indices* (not the text) 80/20, stratified on the label, so the
# resulting index arrays can be used to tag rows of df_train afterwards.
xtrain, xval, ytrain, yval = train_test_split(
    df_train.index.values,
    df_train['target'].values,
    test_size=0.2,
    random_state=15,
    stratify=df_train['target'].values,
)
print(len(xtrain), len(xval))

6090 1523


In [7]:
# Placeholder split label for every row. A plain scalar is broadcast to all
# rows; multiplying the string by the row count would instead build a single
# 'nilnilnil...' string (3 * n_rows characters) and store that in every cell.
df_train['set_type'] = 'nil'
# Tag each row with the split its index was assigned to by train_test_split.
df_train.loc[xtrain, 'set_type'] = 'train'
df_train.loc[xval, 'set_type'] = 'val'
df_train.head(10)

Unnamed: 0,text,target,set_type
0,Our Deeds are the Reason of this #earthquake M...,1,val
1,Forest fire near La Ronge Sask. Canada,1,val
2,All residents asked to 'shelter in place' are ...,1,train
3,"13,000 people receive #wildfires evacuation or...",1,val
4,Just got sent this photo from Ruby #Alaska as ...,1,train
5,#RockyFire Update => California Hwy. 20 closed...,1,train
6,#flood #disaster Heavy rain causes flash flood...,1,train
7,I'm on top of the hill and I can see a fire in...,1,train
8,There's an emergency evacuation happening now ...,1,train
9,I'm afraid that the tornado is coming to our a...,1,train


In [8]:
# Row counts per (target, set_type) pair, to check that the stratified split
# kept the class proportions in both the train and val subsets.
df_train.groupby(['target', 'set_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
target,set_type,Unnamed: 2_level_1
0,train,3473
0,val,869
1,train,2617
1,val,654


In [9]:
# Only the text column is fed to the model; drop everything else.
df_test = df_test.loc[:, ['text']]
df_test.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [10]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint loaded for the
# classifier below; input text is lower-cased before tokenization.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [11]:
def _encode_texts(texts):
    """Tokenize a sequence of raw strings with the shared tokenizer settings.

    All three splits must be encoded identically (special tokens added,
    padded to 256 tokens, returned as PyTorch tensors), so the options are
    defined once here instead of being repeated per split.

    Parameters
    ----------
    texts : array-like of str
        The raw texts to encode.

    Returns
    -------
    The tokenizer's batch encoding; this notebook reads its 'input_ids' and
    'attention_mask' entries.
    """
    # NOTE(review): `return_attention_masks` and `pad_to_max_length` are kept
    # exactly as the notebook was written; these kwarg names are
    # version-specific in transformers - confirm against the installed release.
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        return_attention_masks=True,
        pad_to_max_length=True,
        max_length=256,
        return_tensors='pt'
    )


# Select each split once so texts and labels are guaranteed to come from the
# same rows in the same order.
train_rows = df_train[df_train.set_type == 'train']
val_rows = df_train[df_train.set_type == 'val']

encoded_train = _encode_texts(train_rows.text.values)
encoded_val = _encode_texts(val_rows.text.values)
encoded_test = _encode_texts(df_test.text.values)

input_ids_train = encoded_train['input_ids']
attention_masks_train = encoded_train['attention_mask']
labels_train = torch.tensor(train_rows.target.values)

input_ids_val = encoded_val['input_ids']
attention_masks_val = encoded_val['attention_mask']
labels_val = torch.tensor(val_rows.target.values)

# The test split has no labels.
input_ids_test = encoded_test['input_ids']
attention_masks_test = encoded_test['attention_mask']

In [12]:
# Bundle the encoded tensors so that indexing a dataset yields one sample as
# (input_ids, attention_mask, label); the test set has no label tensor.
dataset_test = TensorDataset(input_ids_test, attention_masks_test)
dataset_val = TensorDataset(
    input_ids_val,
    attention_masks_val,
    labels_val,
)
dataset_train = TensorDataset(
    input_ids_train,
    attention_masks_train,
    labels_train,
)

In [13]:
from transformers import BertForSequenceClassification
# Seed torch *before* building the model. `from_pretrained` restores the
# pretrained encoder, but the 2-way classification head has no pretrained
# weights and is initialized randomly, so without a seed here every run would
# start from different head weights. Same value as `seed_val` in the seeding
# cell further down, which re-seeds all RNGs before training.
torch.manual_seed(10)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                     num_labels = 2,              # binary target (0 / 1)
                                     output_attentions = False,   # only loss/logits are used
                                     output_hidden_states = False
                                     )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
# Training batches are drawn in random order. Validation and test batches keep
# dataset order, so the concatenated predictions line up with the source rows.
dataloader_train = DataLoader(dataset_train, batch_size=32, sampler=RandomSampler(dataset_train))
dataloader_val = DataLoader(dataset_val, batch_size=32, sampler=SequentialSampler(dataset_val))
dataloader_test = DataLoader(dataset_test, batch_size=32, sampler=SequentialSampler(dataset_test))

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup
epochs = 4

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) * (number of epochs) steps, with no warm-up phase.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

In [16]:
from sklearn.metrics import f1_score
def f1_score_func(preds, labels):
    """Return the weighted-average F1 score of the arg-max predictions.

    Parameters
    ----------
    preds : np.ndarray
        Per-class scores; the predicted class is the arg-max along axis 1.
    labels : np.ndarray
        True class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return f1_score(labels.flatten(), predicted_classes, average='weighted')
def accuracy_per_class(preds, labels):
    """Print, for each class present in `labels`, correct predictions / class size.

    Parameters
    ----------
    preds : np.ndarray
        Per-class scores; the predicted class is the arg-max along axis 1.
    labels : np.ndarray
        True class ids; flattened before comparison.

    Returns
    -------
    None. Two lines are printed per class: the class id and the
    "correct/total" count.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()
    for label in np.unique(actual):
        in_class = actual == label
        # Among samples whose true class is `label`, count those predicted as `label`.
        n_correct = int(np.sum(predicted[in_class] == label))
        n_total = int(np.sum(in_class))
        print(f'Class: {label}')
        print(f'Accuracy: {n_correct}/{n_total}')

In [17]:
import random

seed_val = 10
# Seed every RNG used by this notebook: Python, NumPy, torch (CPU) and all CUDA devices.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model.to(device)
print(device)


cuda


In [18]:
def evaluate(dataloader_val):
    """Run the global `model` over `dataloader_val` without gradient tracking.

    Each batch is indexed as (input_ids, attention_mask, labels) and moved to
    the global `device` before the forward pass.

    Returns
    -------
    tuple
        (loss averaged over the number of batches,
         logits of all batches concatenated along axis 0,
         labels of all batches concatenated along axis 0),
        the last two as NumPy arrays.
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in dataloader_val:
        moved = [tensor.to(device) for tensor in batch]

        with torch.no_grad():
            outputs = model(input_ids=moved[0],
                            attention_mask=moved[1],
                            labels=moved[2])

        # With labels supplied, the outputs are indexed as (loss, logits, ...).
        running_loss += outputs[0].item()
        logit_chunks.append(outputs[1].detach().cpu().numpy())
        label_chunks.append(moved[2].cpu().numpy())

    mean_loss = running_loss / len(dataloader_val)

    return (
        mean_loss,
        np.concatenate(logit_chunks, axis=0),
        np.concatenate(label_chunks, axis=0),
    )


In [19]:
from tqdm.notebook import tqdm
# Fine-tune for `epochs` passes over the training data and report the
# validation loss / weighted F1 after each pass.
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    training_loss=0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False
                       )
    for batch in progress_bar:
        # Clear gradients left over from the previous step.
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask':batch[1],
            'labels':batch[2]
        }
        outputs = model(**inputs)
        # With labels supplied, the first output is the loss.
        loss=outputs[0]
        batch_loss = loss.item()
        training_loss+= batch_loss
        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        # Show the batch loss as-is. `len(batch)` is the number of tensors in
        # the batch tuple (3), not the number of samples, and the model's loss
        # is already averaged over the batch, so dividing by it only produced
        # a misleading number in the progress bar.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})
    
    
    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = training_loss/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')
    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation Loss: {val_loss}')
    tqdm.write(f'F1 score: {val_f1}')
    
        

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=191.0, style=ProgressStyle(description_widt…


Epoch 1
Training loss: 0.47584084331677223
Validation Loss: 0.4440040613214175
F1 score: 0.8143610248918433


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=191.0, style=ProgressStyle(description_widt…


Epoch 2
Training loss: 0.3642551671928136
Validation Loss: 0.39436648072053987
F1 score: 0.8331319639790897


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=191.0, style=ProgressStyle(description_widt…


Epoch 3
Training loss: 0.319942449243905
Validation Loss: 0.4063481790944934
F1 score: 0.8333947498048607


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=191.0, style=ProgressStyle(description_widt…


Epoch 4
Training loss: 0.2832077828611379
Validation Loss: 0.4362399308010936
F1 score: 0.8309228368762385



In [20]:
# Re-run evaluation on the validation set with the final weights; the average
# loss is discarded, only the logits and true labels are kept.
_, predictions, true_vals = evaluate(dataloader_val)

In [21]:
# Per-class count of correct validation predictions.
accuracy_per_class(predictions, true_vals)

Class: 0
Accuracy: 750/869
Class: 1
Accuracy: 516/654


In [22]:
# Number of test batches (the test DataLoader uses batch_size=32).
len(dataloader_test)

102

In [23]:
# Predict a class for every test sample, batch by batch, in dataset order.
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader_test:
        # Test batches carry only (input_ids, attention_mask) - there are no labels.
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # No labels were passed, so the first output is used as the logits.
        batch_logits = outputs[0].detach().cpu().numpy()
        predictions.append(batch_logits.argmax(axis=1))

    

In [24]:
from itertools import chain
# Flatten the per-batch prediction arrays into one flat list, preserving order.
prediction = [*chain.from_iterable(predictions)]


In [25]:
# Load the sample submission; its `id` column is kept and `target` is overwritten below.
sub= pd.read_csv('../input/nlp-getting-started/sample_submission.csv')
sub.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [26]:
# Replace the placeholder targets with the model's test predictions (assigned by position).
sub['target']=prediction

In [27]:
# Preview the submission after the targets were filled in.
sub.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [28]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)