# **Setup**

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.metrics import accuracy_score, roc_auc_score
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [2]:
import os , random , tensorflow as tf
# Function to seed everything
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)


SEED = 0

In [3]:
# Detect hardware, return appropriate distribution strategy
try:
    ############################################################################################################

    ########################################### " SEED HERE " #################################################
    
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    seed_everything(seed=SEED)
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
    ############################################################################################################

except ValueError:
    tpu = None

if tpu:
    ############################################################################################################

    ########################################### " SEED HERE " #################################################
    
    seed_everything(seed=SEED)
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
    ############################################################################################################
else:
    ############################################################################################################

    ########################################### " SEED HERE " #################################################

    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    seed_everything(seed=SEED)
    strategy = tf.distribute.get_strategy()

    ############################################################################################################
print("REPLICAS: ", strategy.num_replicas_in_sync)

REPLICAS:  1


# **Load Model Weights** : 


---

*   Those weights are generated the notebook ***transformers-baseline*** 
*   It will take lot of time to train and  get the model weights , so i've uploaded the weights into my drive . 
*   here is the link for the Model Weights : **[transformer weights fold1](https://drive.google.com/drive/folders/1nQDE9dUZ7UbwZXGuYXasvVfwZ4HkPlal?usp=sharing)**
* To speed the process you can **add a shortcut** to your drive and run the following code 




In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
!cp  "/content/drive/MyDrive/Code Review /Transformers_weights/training_folds_1.ckpt.data-00000-of-00001" .
!cp  "/content/drive/MyDrive/Code Review /Transformers_weights/training_folds_1.ckpt.index" .

-  **Load Data**

In [7]:
train = pd.read_csv('/content/drive/MyDrive/Code Review /Train.csv')
test = pd.read_csv('/content/drive/MyDrive/Code Review /Test.csv')

In [8]:
train.head()

Unnamed: 0,ID,Text,Label
0,ID_AASHwXxg,Mwangonde: Khansala wachinyamata Akamati achi...,POLITICS
1,ID_AGoFySzn,MCP siidakhutire ndi kalembera Chipani cha Ma...,POLITICS
2,ID_AGrrkBGP,Bungwe la MANEPO Lapempha Boma Liganizire Anth...,HEALTH
3,ID_AIJeigeG,Ndale zogawanitsa miyambo zanyanya Si zachile...,POLITICS
4,ID_APMprMbV,Nanga wapolisi ataphofomoka? Masiku ano sichi...,LAW/ORDER


# **Process - Configurations**

In [9]:
LB = LabelEncoder()
train['Label'] = LB.fit_transform(train['Label'])

In [10]:
############################################################################################################

########################################### " SEED HERE " #################################################

seed_everything(seed=SEED)
AUTO = tf.data.experimental.AUTOTUNE
# Configuration
EPOCHS = 30
N_LABELS = train['Label'].unique().shape[0]
BATCH_SIZE = 32

In [11]:
############################################################################################################

########################################### " SEED HERE " #################################################

seed_everything(seed=SEED)
df = pd.concat((train, test))
dataset = tf.data.Dataset.from_tensor_slices(df['Text'].values)

In [12]:
############################################################################################################

########################################### " SEED HERE " ##################################################

seed_everything(seed=SEED)
vocab_size = 100000
maxlen = 200
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=vocab_size, output_sequence_length=maxlen)
encoder.adapt(dataset)

In [13]:
%%time 
def reformat(x, y):
    return x, tf.cast(y, tf.float32)

def df_to_dataset(data, labels, data_type='Train'):
    x_token = data['Text'].values
    if data_type=='Train':
        y_label = labels.values
        dataset = (tf.data.Dataset
                    .from_tensor_slices((x_token, y_label))
                    .repeat()
                    .shuffle(2048)
                    .batch(BATCH_SIZE)
                    .prefetch(AUTO))
        dataset = dataset.map(reformat)
    elif data_type=='Val':
        y_label = labels.values
        dataset = (  tf.data.Dataset
                    .from_tensor_slices((x_token, y_label))
                    .batch(BATCH_SIZE)
                    .cache()
                    .prefetch(AUTO)
                     ) 
        dataset =dataset.map(reformat)
    else:
        dataset = (tf.data.Dataset
                    .from_tensor_slices(x_token)
                    .batch(BATCH_SIZE)
                    )
    return dataset

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 8.58 µs


In [14]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions
    

class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.6):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization()
        self.layernorm2 = layers.LayerNormalization()
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def BERTModel(embed_dim = 130, num_heads = 6,  ff_dim = 32):
    ########################################### " SEED HERE " ###########################################
    seed_everything(seed=SEED)
    inputs = layers.Input(shape=(),dtype=tf.string)
    x = encoder(inputs)
    embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
    x = embedding_layer(x)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    x = transformer_block(x)
    x = layers.GlobalAveragePooling1D()(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(256, activation = "relu")(x) 
    x = tf.keras.layers.Dropout(0.15)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(128, activation = "sigmoid")(x) 
    x = tf.keras.layers.Dropout(0.4)(x) 
    outputs = layers.Dense(N_LABELS, activation="sigmoid")(x)
    model = keras.Model(inputs=inputs, outputs=outputs)     
    return model 


def build_classifier():
    ########################################### " SEED HERE " ###########################################
    seed_everything(seed=SEED)
    model = BERTModel()
    checkpoint_path = "training_folds_1.ckpt"
    model.load_weights(checkpoint_path)
    # Freeze layers execpt last threee layers
    for layer in model.layers[:-3]: 
        layer.trainable=False      
    # Define Loss
    losses = tf.keras.losses.CategoricalCrossentropy( from_logits=True)
    # compile all
    model.compile(tf.keras.optimizers.Adam(1e-7), loss=losses, metrics=["accuracy"])    
    return model

def get_model():
    with strategy.scope():
        ########################################### " SEED HERE " ###########################################
        seed_everything(seed=SEED)
        model = build_classifier()
    return model

In [15]:
LABEL = 'Label'
N_LABELS = 20

In [18]:
n_splits = 5

kf = StratifiedKFold(n_splits=n_splits, random_state=47, shuffle=True)

y_oof = np.zeros([train.shape[0], N_LABELS])
y_test = np.zeros([test.shape[0], N_LABELS])

########################################### " SEED HERE " ###########################################
seed_everything(seed=SEED)
test_ds = df_to_dataset(test,labels=None,data_type='Test')

i = 0
metrics = list()

y_train = pd.get_dummies(train['Label'])

for tr_idx, val_idx in kf.split(train[['Text']], train['Label']):
    ########################################### " SEED HERE " ###########################################
    seed_everything(seed=SEED)
    df_tr = train.iloc[tr_idx, :]
    df_vl = train.iloc[val_idx, :]
    
    tr_ds = df_to_dataset(df_tr,y_train.iloc[tr_idx, :], data_type='Train')
    vl_ds = df_to_dataset(df_vl, y_train.iloc[val_idx, :],data_type='Val')
        
    
    model =  get_model()
    
    # Include the epoch in the file name (uses `str.format`)
    checkpoint_path = f"training/training_folds_{i}.ckpt"
    checkpoint_dir = os.path.dirname(checkpoint_path)
    
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)
 
    # Train the model  
    n_steps = df_tr.shape[0] // BATCH_SIZE
    train_history = model.fit(
        tr_ds,
        steps_per_epoch=n_steps,
        validation_data = vl_ds,
        epochs=15, callbacks=[model_checkpoint_callback]
    )
    
    # The model weights (that are considered the best) are loaded into the model.
    model.load_weights( checkpoint_path)
    y_pred = model.predict(vl_ds.map(lambda x,y:x))
    y_oof[val_idx, :] = y_pred
    y_vl = train['Label'].iloc[val_idx]   
    metric = accuracy_score(y_vl, np.argmax(y_pred, 1))
    print("fold #{} val_loss: {}".format(i, metric))


    
    i += 1
    y_test += model.predict(test_ds) / n_splits
    metrics.append(metric)


metrics = np.array(metrics).mean()
print(f'Full accuracy {metrics}')  # 

Epoch 1/15


  '"`categorical_crossentropy` received `from_logits=True`, but '


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fold #0 val_loss: 0.6666666666666666
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fold #1 val_loss: 0.7212543554006968
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fold #2 val_loss: 0.6933797909407665
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fold #3 val_loss: 0.6794425087108014
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
fold #4 

In [19]:
tmp = pd.DataFrame(y_oof)
tmp.to_csv(f'DNN_train.csv', index=False)

tmp_test = pd.DataFrame(y_test)
tmp_test.to_csv(f'DNN_test.csv', index=False)

In [21]:
!cp /content/DNN_test.csv '/content/drive/MyDrive/Code Review '
!cp /content/DNN_train.csv '/content/drive/MyDrive/Code Review '

# *transformers-baseline-ckpt.ipynb* Notebook io :
###  - **inputs:** Train.csv, Test.csv , **[transformer weights fold1](https://drive.google.com/drive/folders/1nQDE9dUZ7UbwZXGuYXasvVfwZ4HkPlal?usp=sharing)**
###  - **outputs:** DNN_train.csv, DNN_test.csv

**----------------------------------------------------------------------------------------------------------------**

# outputs will be used in **Distilation Notebook**