In [2]:
import pandas as pd
import spacy
pd.set_option('display.max_columns', None)
nlp = spacy.load("en_core_web_sm")
from sklearn import feature_extraction, linear_model,  model_selection, preprocessing
from spacy.lang.en.stop_words import STOP_WORDS
import re # regular expression
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier
from gensim.models import Word2Vec
import numpy as np
from transformers import pipeline
import tensorflow as tf
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tensorflow import keras
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras import layers

In [3]:
# Read the labelled training tweets and the unlabelled test tweets from disk.
df_train, df_test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

### Utils

In [4]:
def remove_ambiguous_labels(df):
    """Return a copy of ``df`` whose ``target`` is hand-corrected for known tweets.

    Every row whose ``text`` exactly equals one of the tweets listed below gets
    the associated label written into ``target``; all other rows are untouched.

    The input frame is NOT modified: the corrections are applied to a copy, so
    no temporary column leaks into the caller's DataFrame and the cell that
    calls this function can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``text`` column (tweet text) and a ``target`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns as ``df`` and corrected ``target``.
    """
    # Exact tweet text -> corrected label.
    corrected_labels = {
        'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit': 0,
        'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife': 0,
        'To fight bioterrorism sir.': 0,
        '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4': 1,
        'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring': 1,
        '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption': 0,
        'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!': 0,
        'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE': 1,
        'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG': 1,
        "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...": 0,
        "wowo--=== 12000 Nigerian refugees repatriated from Cameroon": 0,
        "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam": 0,
        "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!": 0,
        "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'": 0,
        "Caution: breathing may be hazardous to your health.": 1,
        "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????": 0,
        "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect": 0,
        "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time": 0,
    }

    # Work on a copy so the caller's frame keeps its original labels/columns.
    relabeled = df.copy()
    for tweet, label in corrected_labels.items():
        relabeled.loc[relabeled['text'] == tweet, 'target'] = label
    return relabeled
def remove_stop_words(text):
    """Return ``text`` with stop-word and punctuation tokens removed.

    ``text`` is tokenised with the module-level ``nlp`` pipeline; tokens flagged
    ``is_stop`` or ``is_punct`` are dropped and the surviving token texts are
    joined with single spaces.
    """
    kept = [tok.text for tok in nlp(text) if not (tok.is_stop or tok.is_punct)]
    return " ".join(kept)

def lemmatized_string(text):
    """Return the lemma of every token in ``text``, joined by single spaces.

    Tokenisation and lemmatisation come from the module-level ``nlp`` pipeline.
    """
    return " ".join([token.lemma_ for token in nlp(text)])

def remove_URL(text):
    """Delete every ``http://``, ``https://`` or ``www.`` link from ``text``.

    A link runs up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

def remove_html(text):
    """Strip anything that looks like an HTML tag (``<`` ... first ``>``) from ``text``."""
    return re.sub(r'<.*?>', r'', text)

def remove_emoji(text):
    """Remove every character that falls in the code-point ranges listed below.

    NOTE(review): the last range (U+24C2 .. U+1F251) is very wide and also
    covers non-emoji scripts such as CJK ideographs, so those characters are
    stripped as well.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F"   # emoticons
        u"\U0001F300-\U0001F5FF"   # symbols & pictographs
        u"\U0001F680-\U0001F6FF"   # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"   # regional indicators (flags)
        u"\U00002702-\U000027B0"   # dingbats
        u"\U000024C2-\U0001F251"   # enclosed characters and everything up to U+1F251
    )
    return re.sub("[" + code_point_ranges + "]+", r'', text, flags=re.UNICODE)

def remove_punct(text):
    """Delete every ASCII punctuation character (``string.punctuation``) from ``text``."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


def vectorize(sentence):
    """Embed ``sentence`` as the mean of its words' Word2Vec vectors.

    The sentence is split on whitespace; words missing from the vocabulary of
    the module-level ``w2v_model`` are ignored.

    Returns a 1-D array of length ``w2v_model.wv.vector_size``. When none of
    the words is in the vocabulary (including the empty sentence) a zero
    vector of that same length is returned, so every sentence maps to a vector
    of identical shape whatever ``vector_size`` the model was trained with.
    """
    keyed_vectors = w2v_model.wv
    word_vectors = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not word_vectors:
        # Match the model's dimensionality instead of assuming 100.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(word_vectors).mean(axis=0)
def extract_label(text):
    """Map a classification result dict to 0/1.

    Returns 1 when ``text.get('label')`` equals ``'LABEL_1'`` and 0 for any
    other value (including a missing ``'label'`` key).
    """
    return 1 if text.get('label') == 'LABEL_1' else 0

## Text Preprocessing

We are doing the following pre-processing:
1. Some tweets appear more than once with contradictory labels (disaster = true in one copy, disaster = false in another). These ambiguous tweets are relabelled by hand so that every copy carries the same target.
2. Drop the `keyword`, `location` & `id` columns because they are not used as features.
3. Remove URL, HTML, punctuation & emoji
4. Remove stop words - obtained from spacy
5. Lemmatize the String

In [5]:
# Apply the hand-made label corrections, then keep only the text and target.
df_train = remove_ambiguous_labels(df_train)
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df_train = df_train.drop(columns=['id', 'keyword', 'location'], errors='ignore')
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [6]:
# Preview the first rows of the training frame after relabelling and column drop.
df_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Strip URLs, HTML tags, emoji and punctuation, in that order: URLs must be
# removed before punctuation, otherwise the URL pattern no longer matches.
# The test set receives exactly the same cleaning as the training set so the
# features built from both frames come from identically processed text.
for frame in (df_train, df_test):
    frame['text'] = (
        frame['text']
        .apply(remove_URL)
        .apply(remove_html)
        .apply(remove_emoji)
        .apply(remove_punct)
    )

In [8]:
# Add a stop-word-free version of the text to both the train and test frames.
for frame in (df_train, df_test):
    frame['text_without_stop_words'] = frame['text'].apply(remove_stop_words)

In [9]:
# Lemmatize the stop-word-free text of both frames into a new column.
for frame in (df_train, df_test):
    frame['text_with_lemmatization'] = frame['text_without_stop_words'].apply(lemmatized_string)

In [10]:
# Preview the cleaned text next to its stop-word-free and lemmatized versions.
df_train.head()

Unnamed: 0,text,target,text_without_stop_words,text_with_lemmatization
0,Our Deeds are the Reason of this earthquake Ma...,1,Deeds Reason earthquake ALLAH Forgive,deed Reason earthquake ALLAH Forgive
1,Forest fire near La Ronge Sask Canada,1,Forest fire near La Ronge Sask Canada,forest fire near La Ronge Sask Canada
2,All residents asked to shelter in place are be...,1,residents asked shelter place notified officer...,resident ask shelter place notify officer evac...
3,13000 people receive wildfires evacuation orde...,1,13000 people receive wildfires evacuation orde...,13000 people receive wildfire evacuation order...
4,Just got sent this photo from Ruby Alaska as s...,1,got sent photo Ruby Alaska smoke wildfires pou...,got send photo Ruby Alaska smoke wildfire pour...


## Vectorize

Experimented with two ways to vectorize - 
1. scikit-learn's CountVectorizer
2. scikit-learn's TfidfVectorizer

In [39]:

# Alternative that was tried: bag-of-words counts over unigrams and bigrams.
#count_vectorizer = feature_extraction.text.CountVectorizer(ngram_range=(1,2))
#train_vectors = count_vectorizer.fit_transform(df_train["text_with_lemmatization"])
#test_vectors = count_vectorizer.transform(df_test["text_with_lemmatization"])
# Current choice: TF-IDF with default settings. The vocabulary and IDF weights
# are learned from the training text only; the test text is merely transformed.
tfIdf_vectorizer =  TfidfVectorizer()
train_vectors = tfIdf_vectorizer.fit_transform(df_train["text_with_lemmatization"])
test_vectors = tfIdf_vectorizer.transform(df_test["text_with_lemmatization"])


In [40]:
# Display the TF-IDF features as a dense matrix.
# NOTE(review): todense() materialises every cell of the sparse matrix, which
# may be memory-hungry for a large vocabulary -- consider inspecting
# train_vectors.shape or a small row slice instead.
train_vectors.todense().view()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Train & Validate after Vectorization

In [41]:
# Classifiers compared on the TF-IDF features:
#   linear_model.RidgeClassifier(), RandomForestClassifier(), BaggingClassifier()
# MultinomialNB was the best performing of them.
clf = MultinomialNB()

# 3-fold cross-validated F1 on the training set.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=df_train["target"],
    scoring="f1",
    cv=3,
)

scores

array([0.63113006, 0.61248761, 0.67786561])

### Tokenization - Submission

In [None]:
# Refit the chosen classifier on every training row, label the test set and
# write the predictions into the sample-submission layout.
clf.fit(X=train_vectors, y=df_train["target"])
test_predictions = clf.predict(test_vectors)

sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission["target"] = test_predictions
sample_submission.to_csv("data/submission.csv", index=False)

## Embedding - Word2Vec

In [39]:
# Whitespace-tokenise every lemmatized training tweet and train Word2Vec on it.
sentences = list(map(str.split, df_train['text_with_lemmatization']))
w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=5,
    workers=4,
)

In [122]:
#w2v_model.wv.index_to_key

In [123]:
#w2v_model.wv.most_similar('Massacre')

In [116]:
# One mean Word2Vec vector per tweet, stacked into a 2-D array for each split.
x_train, x_test = (
    np.array(list(map(vectorize, frame['text_with_lemmatization'])))
    for frame in (df_train, df_test)
)

### Train & Evaluate after Embedding with Word2Vec

In [124]:
# Other classifiers considered here: MultinomialNB(), BaggingClassifier().
clf = linear_model.RidgeClassifier()

# 3-fold cross-validated F1 on the averaged Word2Vec sentence vectors.
scores = model_selection.cross_val_score(
    estimator=clf,
    X=x_train,
    y=df_train["target"],
    scoring="f1",
    cv=3,
)

scores

array([0.04468275, 0.05938865, 0.17171717])

### Word2Vec - Submission

In [33]:
# Fit and predict on the Word2Vec sentence embeddings (x_train / x_test).
# The TF-IDF matrices (train_vectors / test_vectors) belong to the previous
# section; using them here would submit a TF-IDF model under the Word2Vec
# heading. Fit and predict are kept in one cell so they always use the same
# feature space.
clf.fit(x_train, df_train["target"])

sample_submission = pd.read_csv("data/sample_submission.csv")
sample_submission["target"] = clf.predict(x_test)
sample_submission.to_csv("data/submission.csv", index=False)

## Hugging Face with Tokenizer

In [25]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from tensorboard.plugins.hparams import api as hp

# Alias the preprocessed training frame. The cross-validation loop below reads
# its 'text_with_lemmatization' column as input and its 'target' column as label.
data = df_train  # same object as df_train, not a copy

# Declare the hyperparameters (and their allowed ranges) and the metric name
# for the TensorBoard HParams plugin.
HP_LEARNING_RATE = hp.HParam("learning_rate", hp.RealInterval(1e-5, 1e-3))
HP_NUM_EPOCHS = hp.HParam("num_epochs", hp.IntInterval(3, 10))
METRIC_ACCURACY = 'accuracy'

# Write the experiment-level HParams configuration under logs/hparam_tuning.
with tf.summary.create_file_writer("logs/hparam_tuning").as_default():
    hp.hparams_config(
        hparams=[HP_LEARNING_RATE, HP_NUM_EPOCHS],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
    )

# Define a function to build and compile the model
def build_model(hparams):
    """Load the pretrained sequence classifier and compile it for fine-tuning.

    Parameters
    ----------
    hparams : dict
        Must map ``HP_LEARNING_RATE`` to the Adam learning rate to use.

    Returns
    -------
    The compiled ``TFAutoModelForSequenceClassification`` model, freshly loaded
    from the ``hkayesh/twitter-disaster-nlp`` checkpoint on every call.

    The tokenizer is deliberately not loaded here: it was never returned to the
    caller, so loading it on every call was wasted work.
    """
    model_name = "hkayesh/twitter-disaster-nlp"
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    # from_logits=True because the loss is fed the model's raw logits.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hparams[HP_LEARNING_RATE]),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy'],
    )

    return model

# Define cross-validation using Stratified K-Fold
num_splits = 5  # Adjust the number of splits as needed
skf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)
accuracies = []

hparams = {
    HP_LEARNING_RATE: 1e-4,  # Adjust your learning rate here
    HP_NUM_EPOCHS: 5,        # Adjust the number of epochs here
}

# The loop below (and the submission cell after it) needs a module-level
# tokenizer. build_model() does not expose one, so without this line the cell
# fails with NameError on a fresh kernel. The tokenizer is the same for every
# fold, so it is loaded once, from the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained("hkayesh/twitter-disaster-nlp")

for fold, (train_index, val_index) in enumerate(skf.split(data['text_with_lemmatization'], data['target'])):
    train_data, val_data = data.iloc[train_index], data.iloc[val_index]

    # Start every fold from the pretrained checkpoint so folds stay independent.
    model = build_model(hparams)

    # Tokenize the text data
    train_inputs = tokenizer(list(train_data['text_with_lemmatization']), return_tensors="tf", padding=True, truncation=True, max_length=128)
    val_inputs = tokenizer(list(val_data['text_with_lemmatization']), return_tensors="tf", padding=True, truncation=True, max_length=128)

    # Prepare the labels
    train_labels = np.array(train_data['target'])
    val_labels = np.array(val_data['target'])

    # Define TensorBoard callbacks for visualization (one log directory per fold)
    log_dir = "logs/fit/" + f"fold_{fold}"
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Train the model
    history = model.fit(
        train_inputs.data,
        train_labels,
        validation_data=(val_inputs.data, val_labels),
        epochs=hparams[HP_NUM_EPOCHS],
        batch_size=32,
        callbacks=[tensorboard_callback]
    )

    # Evaluate the model on the validation set: the class with the largest logit wins
    val_predictions = model.predict(val_inputs.data)
    val_predicted_classes = np.argmax(val_predictions.logits, axis=1)
    val_accuracy = accuracy_score(val_labels, val_predicted_classes)
    accuracies.append(val_accuracy)

    print(f"Fold {fold + 1} - Validation Accuracy: {val_accuracy:.4f}")

# Calculate and report mean accuracy across all folds
mean_accuracy = np.mean(accuracies)
print(f"Mean Accuracy across all folds: {mean_accuracy:.4f}")


Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_139']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 1 - Validation Accuracy: 0.7945


Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_159']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 2 - Validation Accuracy: 0.7997


Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_179']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 3 - Validation Accuracy: 0.8083


Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_199']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 4 - Validation Accuracy: 0.8233


Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_219']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Fold 5 - Validation Accuracy: 0.7884
Mean Accuracy across all folds: 0.8028


### Hugging Face with Tokenizer - Submission

In [26]:
# NOTE(review): `tokenizer` and `model` are not defined in this cell --
# presumably they are whatever the cross-validation cell left behind, which
# would make `model` the network trained on the last fold only. Confirm.
sample_submission = pd.read_csv("data/sample_submission.csv")
# Tokenize the test tweets with the same padding/truncation settings as in training.
unseen_inputs = tokenizer(list(df_test['text_with_lemmatization']), return_tensors="tf", padding=True, truncation=True, max_length=128)

# Make predictions on the unseen data
unseen_predictions = model.predict(unseen_inputs.data)
# Pick, for every tweet, the class index with the largest logit.
unseen_predicted_classes = np.argmax(unseen_predictions.logits, axis=1)

sample_submission["target"] = unseen_predicted_classes



In [27]:
# Write the predictions to data/submission.csv without the index column.
sample_submission.to_csv("data/submission.csv", index=False)

## Hugging Face with Pre-trained Model

In [42]:
# Build a text-classification pipeline around the published checkpoint; it is
# used as-is below, with no fine-tuning in this notebook.
pipe = pipeline("text-classification", model="hkayesh/twitter-disaster-nlp")

Some layers from the model checkpoint at hkayesh/twitter-disaster-nlp were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at hkayesh/twitter-disaster-nlp and are newly initialized: ['dropout_59']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Hugging Face with Pre-trained Model - Submission


In [25]:
sample_submission = pd.read_csv("data/sample_submission.csv")

# The pipeline returns a list per input; keep its first (only) result dict.
results = [pipe(text)[0] for text in df_test['text_with_lemmatization']]

sample_submission['target'] = results

In [36]:

# Turn each stored pipeline result dict into a 0/1 label, then write the file.
sample_submission['target'] = [
    extract_label(result) for result in sample_submission['target']
]
sample_submission.to_csv("data/submission.csv", index=False)

## Transformer

In [11]:
class PositionalEmbedding(layers.Layer):
    """Keras layer that sums a learned token embedding and a learned position embedding.

    Constructor arguments:
        sequence_length: number of rows in the position-embedding table.
        input_dim: number of rows in the token-embedding table.
        output_dim: size of each embedding vector (shared by both tables).
    """

    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        # Embedding table looked up by token id.
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        # Embedding table looked up by position index.
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        # Kept so get_config() can report the constructor arguments.
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        """Return token embeddings of ``inputs`` plus the embeddings of positions 0..length-1."""
        # The length of the last axis of the input determines how many positions to embed.
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        """Return a boolean mask that is True wherever ``inputs`` is not 0."""
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config
    
class TransformerEncoder(layers.Layer):
    """Keras layer: self-attention followed by a two-layer dense projection.

    Each of the two sub-blocks is wrapped in a residual connection and a
    LayerNormalization.

    Constructor arguments:
        embed_dim: ``key_dim`` of the attention heads and width of the final Dense layer.
        dense_dim: width of the hidden (ReLU) Dense layer.
        num_heads: number of attention heads.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        # Feed-forward block: widen to dense_dim with ReLU, then project to embed_dim.
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Apply attention + residual + norm, then dense projection + residual + norm."""
        if mask is not None:
            # Insert an axis at position 1 before handing the mask to the attention layer.
            mask = mask[:, tf.newaxis, :]
        # Self-attention: the same tensor is passed as both query and value.
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [12]:
# Vocabulary cap and fixed output length for the integer-encoding layer.
max_length = 600
max_tokens = 20000

text_vectorization = layers.TextVectorization(
    output_mode="int",
    output_sequence_length=max_length,
    max_tokens=max_tokens,
)

# Learn the vocabulary from the lemmatized training tweets only.
text_vectorization.adapt(df_train['text_with_lemmatization'].to_numpy())

In [13]:
# The embedding tables must be sized to what text_vectorization emits, so both
# values are taken from its settings instead of repeating the numbers here.
vocab_size = max_tokens          # rows of the token-embedding table
sequence_length = max_length     # rows of the position-embedding table
embed_dim = 256
num_heads = 2
dense_dim = 32

# Raw strings in -> integer token ids -> token+position embeddings ->
# one encoder block -> max over the sequence axis -> dropout -> sigmoid output.
inputs = keras.Input(shape=(1,), dtype=tf.string)
x = text_vectorization(inputs)
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(x)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.summary()

# validation_split=0.3 asks Keras to hold out 30% of the rows for validation.
model.fit(x=df_train['text_with_lemmatization'], y=df_train['target'], validation_split=0.3, epochs=20)

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1)]               0         
                                                                 
 text_vectorization (TextVe  (None, 600)               0         
 ctorization)                                                    
                                                                 
 positional_embedding (Posi  (None, 600, 256)          5273600   
 tionalEmbedding)                                                
                                                                 
 transformer_encoder (Trans  (None, 600, 256)          543776    
 formerEncoder)                                                  
                                                                 
 global_max_pooling1d (Glob  (None, 256)               0         
 alMaxPooling1D)                                             

INFO:tensorflow:Assets written to: model/full_transformer_encoder.x/assets


Epoch 2/20


INFO:tensorflow:Assets written to: model/full_transformer_encoder.x/assets


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


OSError: No file or directory found at full_transformer_encoder.x

### Transformer - Submission

In [18]:
sample_submission = pd.read_csv("data/sample_submission.csv")
threshold = 0.5

# NOTE(review): assumes `model` is the sigmoid transformer built above, whose
# predict() yields one probability per tweet -- confirm no other cell has
# rebound `model` since then.
# Flatten the predictions to 1-D and threshold them in a single vectorised
# step instead of a row-wise apply over a temporary column.
probabilities = model.predict(df_test['text_with_lemmatization']).ravel()
sample_submission["target"] = (probabilities > threshold).astype(int)
sample_submission.to_csv("data/submission.csv", index=False)

