# SETUP

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer, TFBertModel
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
#nltk.download('punkt')
#!pip install contractions
import copy 
#nltk.download('stopwords')
#nltk.download('omw-1.4')
from textblob import TextBlob
import spacy
from spacy import displacy
from tensorflow.keras.utils import plot_model
from tensorflow.keras import layers
from tensorflow.keras import regularizers

import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas()

from sklearn import model_selection
from sklearn import metrics

2022-11-14 05:09:29.997976: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 05:09:29.999108: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 05:09:30.000172: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 05:09:30.000953: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-14 05:09:30.001714: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from S

In [2]:
# Local path of the pretrained BERT checkpoint; used for both the tokenizer
# here and the TFBertModel backbone in build_reg_model().
BASE_MODEL = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
# Module-level spaCy pipeline; get_processing_hash() reads this global.
nlp = spacy.load("en_core_web_sm")
# Fixed width of the BERT inputs and of every per-token feature vector.
maxlen = 100
# Number of sentence-pair similarity scores kept per essay.
maxsent = 10

In [3]:
# Essays to score; the preview below shows the text_id and full_text columns.
df_trial = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
df_trial.head()

Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


# HELPER FUNCTIONS

In [4]:
def clean(df, t_ids):
    """Add a normalised ``clean_text`` column to ``df`` in place.

    For every id in ``t_ids`` the ``full_text`` of the first matching row is
    lower-cased and each character that is neither a word character nor
    whitespace is replaced by a single space.

    Args:
        df: DataFrame with ``text_id`` and ``full_text`` columns. It is
            modified in place: ``clean_text`` is added or overwritten.
        t_ids: Iterable of text ids. The cleaned strings are assigned to
            ``df`` positionally, so it must list the ids in the same order
            as the rows of ``df`` and have the same length.

    Returns:
        None. The result is the new ``clean_text`` column on ``df``.

    Raises:
        IndexError: if an id in ``t_ids`` has no row in ``df``.
    """
    clean_text = []

    for t_id in tqdm(t_ids):
        matches = df.loc[df['text_id'] == t_id, 'full_text'].to_numpy()
        text = matches[0].lower()
        # Raw string so that \w and \s reach the regex engine verbatim instead
        # of being treated as (invalid) Python string escapes.
        clean_text.append(re.sub(r"[^\w\s]", " ", text))

    df['clean_text'] = clean_text

In [5]:
def equalize_lists(lst, maxlen, pad_value=0):
    """Return a new list holding ``lst`` padded or truncated to ``maxlen`` items.

    Args:
        lst: Sequence to resize. It is never modified; a new list is returned
            even when no resizing is needed.
        maxlen: Target length.
        pad_value: Value appended when ``lst`` is shorter than ``maxlen``.
            Defaults to 0.

    Returns:
        A list containing the first ``maxlen`` items of ``lst`` followed by as
        many ``pad_value`` entries as needed to reach ``maxlen``.
    """
    resized = list(lst[:maxlen])
    missing = maxlen - len(resized)
    if missing > 0:
        resized.extend([pad_value] * missing)
    return resized

In [6]:
def vocab_dictionary(df, t_ids):
    """Count word occurrences in each essay's ``clean_text``.

    Args:
        df: DataFrame with ``text_id`` and ``clean_text`` columns.
        t_ids: Iterable of text ids to process.

    Returns:
        dict mapping each text id to a ``{word: count}`` dict, where words are
        the whitespace-separated tokens of the first matching row's
        ``clean_text`` and keys appear in order of first occurrence.

    Raises:
        IndexError: if an id in ``t_ids`` has no row in ``df``.
    """
    counts_by_id = {}

    for t_id in tqdm(t_ids):
        rows = df.loc[df['text_id'] == t_id]
        tokens = rows['clean_text'].to_numpy()[0].split()

        counts = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1

        counts_by_id[t_id] = counts

    return counts_by_id

In [7]:
def frequency_ratio(vocab_dict, t_ids, maxlen):
    """Turn per-essay word counts into fixed-length relative-frequency vectors.

    Each count is divided by the largest count in the same essay, and the
    resulting list is padded with zeros or truncated to ``maxlen`` entries.

    Args:
        vocab_dict: Mapping ``text_id -> {word: count}``.
        t_ids: Iterable of text ids; determines the order of the output.
        maxlen: Length of every returned vector.

    Returns:
        list of 1-D float64 numpy arrays, one per id in ``t_ids``, with values
        in the iteration order of the essay's count dict.

    Raises:
        KeyError: if an id in ``t_ids`` is missing from ``vocab_dict``.
    """
    out = []

    for t_id in tqdm(t_ids):
        freq = list(vocab_dict[t_id].values())
        # An essay without any words has no maximum; the default avoids the
        # ValueError from max([]) and the essay becomes an all-zero vector.
        max_use = max(freq, default=1)

        ratios = [count / max_use for count in freq]
        ratios = equalize_lists(ratios, maxlen)
        out.append(np.asarray(ratios, dtype=np.float64))

    return out

In [8]:
def get_processing_hash(df, t_ids, feature, maxlen):
    """Encode each essay as a fixed-length list of spaCy token attribute ids.

    Every ``full_text`` in ``df`` is run through the module-level ``nlp``
    pipeline and one integer attribute is collected per token.

    Args:
        df: DataFrame with a ``full_text`` column; rows are processed in order.
        t_ids: Unused. Kept so existing call sites keep working.
        feature: ``'pos'`` collects ``token.pos``, ``'tag'`` collects
            ``token.tag``; any other value collects ``token.dep``.
        maxlen: Length of every returned list.

    Returns:
        list with one entry per row of ``df``; each entry is a list of
        ``maxlen`` integers, zero-padded or truncated by ``equalize_lists``.

    NOTE(review): these are the raw integer ids spaCy exposes, not small
    class indices; presumably some of them are very large. Confirm that
    feeding them unscaled matches what the model was trained on.
    """
    # Resolve the attribute once instead of re-testing `feature` per token.
    if feature == 'pos':
        attr_name = 'pos'
    elif feature == 'tag':
        attr_name = 'tag'
    else:
        attr_name = 'dep'

    out = []

    for text in tqdm(df['full_text'].to_list()):
        doc = nlp(text)
        # Plain attribute reads: no blanket try/except, so a failure surfaces
        # instead of silently dropping tokens and shifting the sequence.
        token_ids = [getattr(token, attr_name) for token in doc]
        out.append(equalize_lists(token_ids, maxlen))

    return out

In [9]:
def tokenize_text(df, t_ids, tokenizer, maxlen, pairwise=False):
    """Tokenize essays either as whole texts or as consecutive sentence pairs.

    Args:
        df: DataFrame with ``text_id`` and ``full_text`` columns.
        t_ids: Iterable of text ids to process.
        tokenizer: Tokenizer object providing ``batch_encode_plus`` and
            ``encode_plus``.
        maxlen: ``max_length`` passed to the tokenizer.
        pairwise: When false (default), every matching ``full_text`` is
            encoded in one ``batch_encode_plus`` call with truncation and
            padding to ``maxlen``. When true, the first matching text is split
            into sentences with NLTK and each pair of neighbouring sentences
            is encoded together with ``encode_plus``.

    Returns:
        Tuple ``(input_ids, att_mask, token_ids)`` of dicts keyed by text id.
        In batch mode each value is a one-element list wrapping the batch
        result, so ``input_ids[t_id][0][0]`` is the id sequence of the first
        matching row. In pairwise mode each value is a list with one encoded
        sequence per sentence pair (empty for texts with fewer than two
        sentences). Pairs are padded to ``maxlen`` but encoded with
        ``truncation=False``, so they are not cut when longer.

    Raises:
        IndexError: if an id in ``t_ids`` has no row in ``df``.
    """
    input_ids = {}
    att_mask = {}
    token_ids = {}

    for t_id in tqdm(t_ids):
        text = df.loc[df['text_id'] == t_id, 'full_text'].to_numpy()
        if len(text) == 0:
            raise IndexError(f"text_id {t_id!r} not found in df")

        temp_input_ids = []
        temp_att_mask = []
        temp_token_ids = []

        if not pairwise:
            encoded = tokenizer.batch_encode_plus(text, add_special_tokens=True, return_token_type_ids=True,
                                                  truncation=True, padding='max_length', max_length=maxlen)

            temp_input_ids.append(encoded['input_ids'])
            temp_att_mask.append(encoded['attention_mask'])
            temp_token_ids.append(encoded['token_type_ids'])

        else:
            # Sentence splitting is only needed here, so it is no longer done
            # for batch mode where its result was discarded.
            sentences = nltk.tokenize.sent_tokenize(text[0])

            for first, second in zip(sentences, sentences[1:]):
                encoded = tokenizer.encode_plus(first, second, add_special_tokens=True, return_token_type_ids=True,
                                                truncation=False, padding='max_length', max_length=maxlen)

                temp_input_ids.append(encoded['input_ids'])
                temp_att_mask.append(encoded['attention_mask'])
                temp_token_ids.append(encoded['token_type_ids'])

        input_ids[t_id] = temp_input_ids
        att_mask[t_id] = temp_att_mask
        token_ids[t_id] = temp_token_ids

    return input_ids, att_mask, token_ids

In [10]:
def jaccard(list1, list2):
    """Return an overlap ratio between two sequences.

    The numerator is the number of distinct values present in both inputs.
    The denominator is ``len(list1) + len(list2) - numerator``, i.e. it is
    based on the raw sequence lengths, so repeated items enlarge it.

    Args:
        list1: First sequence of hashable items.
        list2: Second sequence of hashable items.

    Returns:
        float ratio; ``0.0`` when both sequences are empty (previously this
        case raised ``ZeroDivisionError``).
    """
    intersection = len(set(list1).intersection(list2))
    union = (len(list1) + len(list2)) - intersection
    if union == 0:
        # Only reachable when both inputs are empty.
        return 0.0
    return float(intersection) / union

In [11]:
def similarity(pair, sep_token_id=102):
    """Score the overlap between the two segments of an encoded sentence pair.

    The sequence is split at the first occurrence of ``sep_token_id``. The
    first segment is everything between index 1 and that separator; the
    second segment is everything after the separator except the final
    element. Both segments are then compared with ``jaccard``.

    Because ``tokenize_text`` pads pairs to ``max_length``, the second segment
    of a padded pair still contains the closing separator and padding ids.
    NOTE(review): this is left unchanged so the feature stays as it was,
    presumably matching what the saved weights were trained on; confirm.

    Args:
        pair: List of token ids for one encoded sentence pair.
        sep_token_id: Id of the separator token. Defaults to 102, the value
            that was previously hard-coded.

    Returns:
        float overlap ratio as computed by ``jaccard``.

    Raises:
        ValueError: if ``sep_token_id`` does not occur in ``pair``.
    """
    split_id = pair.index(sep_token_id)

    first_segment = pair[1:split_id]
    second_segment = pair[split_id + 1:-1]

    return jaccard(first_segment, second_segment)

# DATA SETUP FOR INFERENCE 

In [12]:
# Ids in row order; clean() relies on this order when it writes clean_text back.
t_ids = df_trial['text_id'].to_numpy()
# Adds the clean_text column to df_trial in place.
clean(df_trial, t_ids)
df_trial.head()

100%|██████████| 3/3 [00:00<00:00, 539.65it/s]


Unnamed: 0,text_id,full_text,clean_text
0,0000C359D63E,when a person has no experience on a job their...,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...,do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde...",thomas jefferson once states that it is wonde...


In [13]:
# Word-frequency features built from clean_text.
v_dict = vocab_dictionary(df_trial, t_ids)
freq_ratio = frequency_ratio(v_dict, t_ids, maxlen)

# Token-attribute features built from full_text. Each call runs the spaCy
# pipeline over every essay again, which is why these three bars are the slow ones.
pos_inputs = get_processing_hash(df_trial, t_ids, 'pos', maxlen)
tag_inputs = get_processing_hash(df_trial, t_ids, 'tag', maxlen)
dep_inputs = get_processing_hash(df_trial, t_ids, 'dep', maxlen)

100%|██████████| 3/3 [00:00<00:00, 910.29it/s]
100%|██████████| 3/3 [00:00<00:00, 7776.83it/s]
100%|██████████| 3/3 [00:00<00:00,  9.87it/s]
100%|██████████| 3/3 [00:00<00:00, 12.62it/s]
100%|██████████| 3/3 [00:00<00:00, 12.79it/s]


In [14]:
# Last argument is `pairwise`: True encodes neighbouring sentence pairs (used for
# the similarity feature), False encodes each whole essay (used as BERT input).
unclean_pairs_ids, unclean_pairs_masks, unclean_pairs_tokens = tokenize_text(df_trial, t_ids, tokenizer, maxlen, True)
batch_ids, batch_masks, batch_tokens = tokenize_text(df_trial, t_ids, tokenizer, maxlen, False)

100%|██████████| 3/3 [00:00<00:00, 28.61it/s]
100%|██████████| 3/3 [00:00<00:00, 76.60it/s]


In [15]:
# One row per essay: similarity of the first `maxsent` sentence pairs,
# zero-padded when the essay has fewer pairs.
unclean_similarity = []

for t_id in tqdm(t_ids):
    leading_pairs = unclean_pairs_ids[t_id][:maxsent]
    pair_scores = [similarity(pair) for pair in leading_pairs]
    unclean_similarity.append(equalize_lists(pair_scores, maxsent))

100%|██████████| 3/3 [00:00<00:00, 7100.97it/s]


In [16]:
# batch_ids / batch_masks / batch_tokens were already produced by the earlier
# tokenize_text(df_trial, t_ids, tokenizer, maxlen, False) call, so the essays
# are not tokenized a second time here.
# In batch mode each dict value is [batch_result]; [0][0] selects the encoded
# sequence of the essay's first (only) row.
bert_ids = [batch_ids[t_id][0][0] for t_id in t_ids]
bert_masks = [batch_masks[t_id][0][0] for t_id in t_ids]
bert_tokens = [batch_tokens[t_id][0][0] for t_id in t_ids]
    
    #break

100%|██████████| 3/3 [00:00<00:00, 72.06it/s]


In [17]:
# Token ids, attention masks and segment ids are integer-valued and the model
# declares these three inputs as tf.int32, so build them as int32 rather than
# float64.
bert_ids_np = np.asarray(bert_ids, dtype=np.int32)
bert_masks_np = np.asarray(bert_masks, dtype=np.int32)
bert_tokens_np = np.asarray(bert_tokens, dtype=np.int32)

# Hand-crafted features: dtype is left to numpy's inference, as before.
freq_ratio_np = np.asarray(freq_ratio)
pos_inputs_np = np.asarray(pos_inputs)
tag_inputs_np = np.asarray(tag_inputs)
dep_inputs_np = np.asarray(dep_inputs)
unclean_similarity_np = np.asarray(unclean_similarity)

# MODEL SETUP FOR INFERENCE

In [18]:
def build_reg_model():
    """Build and compile the regression model used for inference.

    Reads the module-level ``BASE_MODEL``, ``maxlen`` and ``maxsent``.

    The model takes eight inputs, in this order: ``input_ids``,
    ``attention_mask``, ``token_type_ids`` (int32, length ``maxlen``),
    ``similarity`` (float32, length ``maxsent``) and ``freq_ratio``, ``pos``,
    ``tag``, ``dep`` (float32, length ``maxlen``). It produces one output of
    six values per sample.

    NOTE(review): weights are later loaded from an HDF5 file; presumably that
    loading depends on the order in which layers are created here, so avoid
    reordering statements without confirming.

    Returns:
        A ``tf.keras.Model`` compiled with Adam (learning rate 5e-5) and
        mean-squared-error loss.
    """
    backbone = TFBertModel.from_pretrained(BASE_MODEL)
    
    input_ids = layers.Input(
        shape=(maxlen,),
        dtype=tf.int32,
        name="input_ids",
    )
    
    attention_mask = layers.Input(
        shape=(maxlen,),
        dtype=tf.int32,
        name="attention_mask",
    )
    
    token_type_ids = layers.Input(
        shape=(maxlen,),
        dtype=tf.int32,
        name="token_type_ids",
    )
    
    bert_out = backbone({
            "input_ids": input_ids,
            "token_type_ids": token_type_ids,
            "attention_mask": attention_mask})
    
    # Hand-crafted feature inputs, all float32.
    similarity = tf.keras.layers.Input((maxsent,), dtype=tf.float32, name='similarity') 
    
    freq_ratio = tf.keras.layers.Input((maxlen,), dtype=tf.float32, name='freq_ratio') 
    
    pos = tf.keras.layers.Input((maxlen,), dtype=tf.float32, name='pos') 
    tag = tf.keras.layers.Input((maxlen,), dtype=tf.float32, name='tag') 
    dep = tf.keras.layers.Input((maxlen,), dtype=tf.float32, name='dep') 
    
    # Takes the vector at sequence position 0 of the backbone's first output
    # (presumably the [CLS] hidden state - confirm) and projects it to 64 units.
    bert_out = tf.keras.layers.Dense(64, activation="linear", dtype="float32")(bert_out[0][:, 0, :])
    concat = tf.keras.layers.Concatenate()([similarity, freq_ratio, pos, tag, dep, bert_out])
    
    # Head: LayerNorm -> Dense(32) -> LayerNorm -> Dense(16) -> LayerNorm.
    # No activation argument is passed to these Dense layers.
    x = tf.keras.layers.LayerNormalization()(concat)
    x = tf.keras.layers.Dense(32)(x)
    x = tf.keras.layers.LayerNormalization()(x)
    x = tf.keras.layers.Dense(16)(x)
    x = tf.keras.layers.LayerNormalization()(x)
    
    # Six regression outputs per sample.
    out = tf.keras.layers.Dense(6, dtype='float32')(x)
    
    model = tf.keras.Model(
        inputs=[input_ids, attention_mask, token_type_ids, similarity, freq_ratio, pos, tag, dep],
        outputs=[out],
    )
    
    
    model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.MeanSquaredError()
    )
    
    return model

In [19]:
# Rebuild the architecture; at this point only the BERT backbone holds
# pretrained weights, the saved model weights are loaded in the next cell.
reloaded_reg_model = build_reg_model()
reloaded_reg_model.summary()

Some layers from the model checkpoint at ../input/huggingface-bert-variants/bert-base-cased/bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ../input/huggingface-bert-variants/bert-base-cased/bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
token_type_ids (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     TFBaseModelOutputWit 108310272   attention_mask[0][0]             
                                                                 input_ids[0][0]              

In [20]:
# Load the saved weights into the freshly built architecture.
reloaded_reg_model.load_weights('../input/ell-model-weights-callback/model_callback.h5')

In [21]:
# Inputs are passed positionally in the same order as the `inputs=` list in
# build_reg_model(): ids, masks, token types, similarity, freq_ratio, pos, tag, dep.
reg_preds = reloaded_reg_model.predict([bert_ids_np, bert_masks_np, bert_tokens_np, 
                                        unclean_similarity_np, freq_ratio_np, 
                                        pos_inputs_np, tag_inputs_np, dep_inputs_np])

2022-11-14 05:10:18.287294: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-11-14 05:10:22.846265: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


In [22]:
length = len(t_ids)

# Split the prediction matrix into one float64 vector per target; column k
# of reg_preds is the k-th target in the order listed below.
target_scores = np.asarray(reg_preds, dtype=np.float64)[:length]

cohesion_preds = target_scores[:, 0].copy()
syntax_preds = target_scores[:, 1].copy()
vocabulary_preds = target_scores[:, 2].copy()
phraseology_preds = target_scores[:, 3].copy()
grammar_preds = target_scores[:, 4].copy()
conventions_preds = target_scores[:, 5].copy()

In [23]:
# Assemble the submission table: one row per essay, one column per target.
predictions = pd.DataFrame({
    'text_id': t_ids,
    'cohesion': cohesion_preds,
    'syntax': syntax_preds,
    'vocabulary': vocabulary_preds,
    'phraseology': phraseology_preds,
    'grammar': grammar_preds,
    'conventions': conventions_preds,
})

predictions.head()

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.142531,3.033324,3.215459,3.148757,3.008596,3.070784
1,000BAD50D026,3.142531,3.033324,3.215459,3.148757,3.008596,3.070784
2,00367BB2546B,3.142531,3.033324,3.215459,3.148757,3.008596,3.070784


In [24]:
# Write the submission file without the DataFrame index column.
predictions.to_csv("submission.csv", index=False)