In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Importing and checking data

In [None]:
# Read the three competition files: training set, test set and the submission template.
train_df = pd.read_csv('/kaggle/input/nlpgettingstarted/train.csv')
test_df = pd.read_csv('/kaggle/input/nlpgettingstarted/test.csv')
sample_sub_df = pd.read_csv('/kaggle/input/nlpgettingstarted/sample_submission.csv')

# Peek at the first rows. 'keyword' and 'location' are sparsely filled,
# so both columns are left out for now.
train_df.head()
train_clean_df = train_df.drop(columns=['keyword', 'location'])

In [None]:
# Same treatment for the test set: peek at it, then drop the two sparse columns.
test_df.head()
test_clean_df = test_df.drop(columns=['keyword', 'location'])

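Data cleaning: after cleaning, the text should contain only plain words. URLs, HTML tags, bracketed spans, punctuation and other special characters (@, #, &, ...) and words containing digits are removed, followed by English stop words. The same cleaning is applied to both the training and the test data.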

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import string

def clean_text(text):
    """Normalise a raw text string before tokenisation.

    Steps, in order: lowercase, drop ``[...]`` spans, drop URLs
    (``http(s)://...`` and ``www....``), drop ``<...>`` tags, strip every
    character in ``string.punctuation``, turn newlines into spaces, and
    drop every word that contains a digit.

    Parameters
    ----------
    text : str
        The raw text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removed spans are
        not collapsed here.
    """
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than nothing, so the last word of
    # one line is not fused with the first word of the next.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# English stop words from NLTK, kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    """Tokenise ``text`` with NLTK and return it without English stop words.

    The surviving tokens are joined back into one space-separated string.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Run both cleaning steps (regex clean-up, then stop-word removal) over the
# text column of the training and the test frame.
train_clean_df['text'] = train_clean_df['text'].apply(clean_text).apply(remove_stopwords)
test_clean_df['text'] = test_clean_df['text'].apply(clean_text).apply(remove_stopwords)

In [None]:
# Select the rows labelled 0. This is not the last expression of the cell,
# so the result is not displayed.
train_clean_df.loc[train_clean_df['target'] == 0]

# "saving" the cleaned dataset by copying; from here on train_df / test_df
# refer to the cleaned data, not the raw CSV contents.
train_df, test_df = train_clean_df.copy(), test_clean_df.copy()

# Class balance of the training labels.
train_df['target'].value_counts()

Loading pretrained BERT model 

Several pretrained BERT checkpoints are available. The uncased variant is used because the text is lowercased during cleaning, so letter case carries no information here, and the large variant is chosen for better accuracy.

In [None]:
import transformers

from transformers import AutoTokenizer,TFBertModel
# Tokenizer and model must come from the same checkpoint, so the name is
# written once and shared by both calls.
bert_checkpoint = 'bert-large-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
bert = TFBertModel.from_pretrained(bert_checkpoint)

In [None]:
# Inspect sequence lengths to pick `max_length` for the BERT tokenizer.

# Length in whitespace-separated words.
word_lengths = [len(text.split()) for text in train_df.text]
print("max text length (words):", max(word_lengths))

# `max_length` limits BERT tokens, not words: the tokenizer may split one word
# into several sub-word pieces and adds special tokens, so the token count is
# the number that matters when choosing the limit.
token_lengths = [
    len(ids)
    for ids in tokenizer(train_df.text.tolist(), add_special_tokens=True)['input_ids']
]
print("max text length (BERT tokens):", max(token_lengths))


In [None]:
# creating input train data:
# tokenizing and creating a tensor shape input
#
# padding='max_length' pads every sequence to exactly `max_length` tokens.
# padding=True would only pad to the longest sequence in this call, which can
# be shorter than the fixed width the model's Input layers expect (23).
x_train = tokenizer(
    text=train_df.text.tolist(),
    add_special_tokens=True,
    max_length=23,
    truncation=True,  # texts longer than max_length are cut off
    padding='max_length',  # every text is padded to the same fixed length
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

x_train

In [None]:
# Shape of the token-id tensor returned by the tokenizer; expected to be
# (number of training texts, padded sequence length) — confirm on the output.
x_train['input_ids'].shape

In [None]:
# Training labels: the 'target' column as a plain array, displayed for a quick check.
y_train = train_df['target'].values
y_train

Building a model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy,BinaryCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy,BinaryAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.utils import plot_model

In [None]:
# Fixed sequence length of the model inputs; the tokenizer calls use the same value.
max_len = 23

# Symbolic inputs, named after the tokenizer output keys they are fed with.
input_ids = Input(name="input_ids", shape=(max_len,), dtype=tf.int32)
input_mask = Input(name="attention_mask", shape=(max_len,), dtype=tf.int32)

# Run the pretrained BERT model on the inputs and take element [1] of its
# output as the text embedding (presumably the pooled output — confirm
# against the TFBertModel documentation).
bert_outputs = bert(input_ids, attention_mask=input_mask)
embeddings = bert_outputs[1]

# Classification head: dropout -> 128 -> dropout -> 32 -> 1 (sigmoid).
out = tf.keras.layers.Dropout(rate=0.2)(embeddings)
out = Dense(units=128, activation='relu')(out)
out = tf.keras.layers.Dropout(rate=0.1)(out)
out = Dense(units=32, activation='relu')(out)

# A single sigmoid unit for the binary target.
y = Dense(units=1, activation='sigmoid')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
# Layer index 2 is presumably the BERT layer (after the two Input layers) —
# verify with model.summary(). Marking it trainable fine-tunes its weights.
model.layers[2].trainable = True

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Adam configured for fine-tuning: small learning rate, learning-rate decay
# and gradient-norm clipping.
optimizer = Adam(
    learning_rate=7e-06,  # base value: https://huggingface.co/bert-large-uncased
    epsilon=2e-08,
    decay=0.01,
    clipnorm=1.0)

# Set loss and metrics.
# The model ends in a single sigmoid unit, so binary cross-entropy is used.
loss = BinaryCrossentropy()
# Explicit list: a trailing comma here used to create an accidental 1-tuple.
metric = [BinaryAccuracy('accuracy')]

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metric)

In [None]:
# Draw the model graph; show_shapes=True asks for shape information in the diagram.
plot_model(model, show_shapes = True)

In [None]:
# List the GPU devices visible to TensorFlow, as a check before the training run below.
tf.config.experimental.list_physical_devices('GPU')

Training the model

In [None]:
# Feed the tokenizer output to the model by input name.
train_inputs = {
    'input_ids': x_train['input_ids'],
    'attention_mask': x_train['attention_mask'],
}

# Train for 10 epochs in batches of 32, with validation_split=0.1 holding
# out a tenth of the data for validation.
train_history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
)

Testing:

In [None]:
# Display the cleaned test frame before tokenizing it.
test_df

In [None]:
# test input tokenizer:
#
# Same settings as for the training data. padding='max_length' pads every
# sequence to exactly `max_length` tokens; padding=True would only pad to the
# longest test sequence, which can be shorter than the fixed width the
# model's Input layers expect (23).
x_test = tokenizer(
    text=test_df.text.tolist(),
    add_special_tokens=True,
    max_length=23,
    truncation=True,  # texts longer than max_length are cut off
    padding='max_length',  # every text is padded to the same fixed length
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True)

In [None]:
# Predict on the tokenized test texts, passing the tensors by input name.
test_inputs = {key: x_test[key] for key in ('input_ids', 'attention_mask')}
predicted = model.predict(test_inputs)

In [None]:
# Turn the sigmoid outputs into hard labels: 1 above the 0.5 threshold, else 0.
above_threshold = predicted > 0.5
y_predicted = np.where(above_threshold, 1, 0)

In [None]:
# Flatten the predictions to a 1-D vector. reshape(-1) infers the length from
# the data, so this works for any number of test rows instead of exactly 3263.
y_predicted = y_predicted.reshape(-1)

In [None]:
# Fill the submission template with the test ids and the predicted labels.
sample_sub_df['id'] = test_df['id']
sample_sub_df['target'] = y_predicted

In [None]:
# Write the submission file into the working directory, without the index column.
sample_sub_df.to_csv('submission.csv', index=False)