In [None]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

# Print the full path of every file available under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv',
)
test = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/test.csv',
)

In [None]:
# Show the column index, then display the first five rows as the cell output.
print(train.keys())
train.head(n=5)

In [None]:
# Replace every missing value in the training frame with an empty string.
# Rebinding (rather than inplace=True) keeps the cell safe to re-run.
train = train.fillna(value='')

In [None]:
# List the distinct values of the two categorical metadata columns.
for _label, _column in (('Locations: ', 'location'), ('Keywords: ', 'keyword')):
    print(_label, np.unique(train[_column].to_numpy()), '\n')

In [None]:
# Character length of each training text.
text_lenghts = list(map(len, train['text']))
print('max len: ', max(text_lenghts))

# Histogram of the lengths, using one bin per two characters of the longest text.
plt.hist(text_lenghts, bins=max(text_lenghts) // 2)
plt.show()

In [None]:
# Fraction of training rows that fall into each distinct target value.
target_count = np.unique(train['target'].to_numpy(), return_counts=True)[1]
target_count / len(train['target'])

In [None]:
# Positional cut-off: the first 80% of rows go to training, the rest to validation.
split = int(0.8 * len(train['text']))

X_train = train['text'].iloc[:split]
X_valid = train['text'].iloc[split:]

y_train = train['target'].iloc[:split]
y_valid = train['target'].iloc[split:]

# Texts of the test set.
X_test = test['text']

In [None]:
# Fraction of each distinct target value within the training part of the split.
target_count = np.unique(y_train.to_numpy(), return_counts=True)[1]
target_count / len(y_train)

In [None]:
def preprocess(X_batch):
    """Clean a batch of raw text strings and split them into padded tokens.

    The steps are applied in order: strip URLs, replace every character
    outside ``a-zA-Z0-9,.!?#:'`` with a space, collapse runs of the listed
    punctuation marks, put spaces around ``? . ! ,``, squeeze repeated
    spaces, lowercase, wrap with ``<start>`` / ``<end>`` markers and split
    on whitespace.

    Args:
        X_batch: strings accepted by ``tf.strings.regex_replace``.
            Presumably a 1-D batch of texts, since the split result is
            converted with ``.to_tensor`` -- TODO confirm against callers.

    Returns:
        The result of ``to_tensor`` on the split tokens, with missing
        positions filled with ``b"<pad>"``.
    """
    # Remove URLs. A raw string is required here: "\/" and "\S" are not valid
    # Python escape sequences and trigger a warning in a plain literal.
    X_batch = tf.strings.regex_replace(X_batch, r"(http|https)?:\/\/(\S+)", "")

    # Replace everything except letters, digits and a few punctuation marks
    # with a space.
    X_batch = tf.strings.regex_replace(X_batch, "[^a-zA-Z0-9,.!?#:']", " ")

    # Replace a run of these punctuation marks with a single character: the
    # replacement is group 2, i.e. the last character matched in the run.
    X_batch = tf.strings.regex_replace(X_batch, r"(([.?#@+]){1,})", r"\2")

    # Create a space between a word and the punctuation, then squeeze the
    # repeated spaces this may have produced.
    X_batch = tf.strings.regex_replace(X_batch, r"([?.!,])", r" \1 ")
    X_batch = tf.strings.regex_replace(X_batch, r'[" "]+', " ")

    # Lowercase.
    X_batch = tf.strings.lower(X_batch)

    # Add sequence boundary markers and split on whitespace.
    X_batch = '<start> ' + X_batch + ' <end>'
    X_batch = tf.strings.split(X_batch)

    return X_batch.to_tensor(default_value=b"<pad>")

In [None]:
from transformers import BertTokenizer
# Load the pretrained tokenizer for 'bert-large-uncased', with lowercasing enabled.
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased',
    do_lower_case=True,
)

In [None]:
def bert_encode(data, maximum_length):
    """Tokenize every text in ``data.text`` with the module-level ``tokenizer``.

    Args:
        data: object exposing a ``text`` attribute that iterates over the
            texts to encode (the notebook passes pandas DataFrames).
        maximum_length: forwarded to the tokenizer as ``max_length``; together
            with ``pad_to_max_length=True`` it asks for fixed-length output.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays built from
        the per-text ``'input_ids'`` and ``'attention_mask'`` entries, in the
        same order as ``data.text``.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values themselves instead of ``data.text[i]``: integer
    # label lookup fails (or picks the wrong row) when the frame does not
    # have a default 0..n-1 index, e.g. after filtering or shuffling.
    for text in data.text:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            pad_to_max_length=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

In [None]:
# Encode both datasets to token ids and attention masks with a maximum length of 60.
train_input_ids, train_attention_masks = bert_encode(data=train, maximum_length=60)
test_input_ids, test_attention_masks = bert_encode(data=test, maximum_length=60)

In [None]:
from transformers import TFBertModel
# Load the pretrained 'bert-large-uncased' weights as a TensorFlow model.
bert_model = TFBertModel.from_pretrained(
    'bert-large-uncased',
)

from tensorflow.keras.optimizers import Adam
def create_model(bert_model, max_length=60):
    """Build and compile a binary classifier on top of ``bert_model``.

    The network takes token ids and attention masks, feeds them through
    ``bert_model``, and passes the second element of its output through
    Dense(32, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        bert_model: callable model invoked as
            ``bert_model([input_ids, attention_masks])``; element ``[1]`` of
            its output is used (presumably BERT's pooled output -- confirm
            against the installed transformers version).
        max_length: length of both integer input sequences. Defaults to 60,
            the value previously hard-coded here.

    Returns:
        A compiled Keras model with inputs ``[input_ids, attention_masks]``
        and one sigmoid output, using binary cross-entropy loss and the
        accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(max_length,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,), dtype='int32')

    output = bert_model([input_ids, attention_masks])
    output = output[1]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)

    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    # ``learning_rate`` is the supported argument name; ``lr`` is a deprecated
    # alias that newer Keras releases reject.
    model.compile(Adam(learning_rate=6e-6), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Build the compiled classifier on top of the loaded BERT model.
model = create_model(bert_model=bert_model)

In [None]:
# Train for two epochs in batches of 10, holding out 20% of the data for validation.
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=train['target'],
    validation_split=0.2,
    epochs=2,
    batch_size=10,
)

In [None]:
# Run the trained model on the encoded test set. The original cell read `pred`
# before anything assigned it, which raises NameError on a fresh kernel.
pred = model.predict([test_input_ids, test_attention_masks])

# Round the sigmoid outputs to the nearest integer label and flatten to 1-D so
# the result can be used directly as a single column.
pred = np.rint(pred).astype(np.int16).ravel()

In [None]:
# Load the sample submission as a template. The absolute path matches the
# train/test CSV paths used when loading the data, so this cell does not
# depend on the current working directory.
submit = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
submit.head()

In [None]:
# Store the rounded predictions in the submission frame. The original cell
# referenced `result`, a name that is never defined; the predictions are in
# `pred`. Flatten so a column-shaped array is accepted as one column.
submit['target'] = np.asarray(pred).ravel()

In [None]:
# Write the submission file to the working directory without the index column.
submit.to_csv(path_or_buf='submission.csv', index=False)