In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import transformers
from transformers import AutoModel, AutoTokenizer
from transformers import BertTokenizer, TFBertModel
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
!pip install contractions
import contractions
import copy 
nltk.download('stopwords')
nltk.download('omw-1.4')
from textblob import TextBlob
import spacy
from spacy import displacy
from tensorflow.keras.utils import plot_model
from tensorflow.keras import layers
from tensorflow.keras import regularizers

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
tqdm.pandas()

from lightgbm import LGBMRegressor
from sklearn import model_selection
from sklearn import metrics

In [None]:
# Size settings shared by the feature builders and the model inputs.
maxlen = 100   # tokenizer max_length and width of every padded per-essay feature vector
maxsent = 10   # number of adjacent-sentence similarity scores kept per essay

# Local copy of the pretrained BERT checkpoint (tokenizer here, backbone in build_reg_model).
BASE_MODEL = '../input/huggingface-bert-variants/bert-base-cased/bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)

# spaCy pipeline used by get_processing_hash for the pos / tag / dep features.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Essay table; later cells read its text_id, full_text, clean_text and six score columns.
ELL_CLEAN_CSV = '../input/ell-clean-text/ELL-clean.csv'
df_trial = pd.read_csv(ELL_CLEAN_CSV)
df_trial.head()

# HELPER FUNCTIONS

In [None]:
def clean(df, t_ids):
    """Add a normalised ``clean_text`` column to ``df`` (modifies ``df`` in place).

    For every id in ``t_ids`` the ``full_text`` of the first matching row is
    lower-cased, every character that is neither a word character nor
    whitespace is replaced by a space, stop words are removed and the
    remaining words are lemmatised with ``WordNetLemmatizer``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_id`` and ``full_text`` columns.
    t_ids : iterable
        Text ids to process. The cleaned strings are assigned positionally to
        ``df['clean_text']``, so ``t_ids`` must list one id per row of ``df``
        in row order.

    Returns
    -------
    None

    Raises
    ------
    IndexError
        If an id in ``t_ids`` has no matching row in ``df``.
    """
    # Built once up front instead of once per word / per essay. No language is
    # passed to stopwords.words(), so whatever combined list NLTK returns is
    # used; the set makes each membership test O(1).
    stop_words = set(stopwords.words())
    lemmatizer = WordNetLemmatizer()
    # Raw string: "\w" / "\s" in a plain literal are invalid escape sequences.
    non_word = re.compile(r"[^\w\s]")

    clean_text = []
    for t_id in tqdm(t_ids):
        text = df.loc[df['text_id'] == t_id, 'full_text'].to_numpy()[0].lower()
        words = non_word.sub(" ", text).split()
        clean_text.append(
            " ".join(lemmatizer.lemmatize(word) for word in words if word not in stop_words)
        )

    df['clean_text'] = clean_text

In [None]:
def equalize_lists(lst, maxlen, pad_value=0):
    """Return a copy of ``lst`` that is exactly ``maxlen`` items long.

    Shorter inputs are right-padded with ``pad_value``; longer inputs are cut
    down to their first ``maxlen`` items.

    Parameters
    ----------
    lst : sequence
        Items to fit.
    maxlen : int
        Target length; must not be negative.
    pad_value : object, optional
        Filler appended to short inputs (default ``0``).

    Returns
    -------
    list
        A new list; ``lst`` itself is never modified.

    Raises
    ------
    ValueError
        If ``maxlen`` is negative.
    """
    if maxlen < 0:
        raise ValueError(f"maxlen must be non-negative, got {maxlen}")

    # Slicing copies, so the padding below never leaks back into the caller's list.
    fitted = list(lst[:maxlen])
    fitted.extend([pad_value] * (maxlen - len(fitted)))
    return fitted

In [None]:
def vocab_dictionary(df, t_ids):
    """Count how often each word occurs in every essay's ``clean_text``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_id`` and ``clean_text`` columns.
    t_ids : iterable
        Text ids to process.

    Returns
    -------
    dict
        ``{text_id: {word: count}}``. Words are split on whitespace and appear
        in the inner dict in order of first occurrence.
    """
    counts_by_id = {}

    for t_id in tqdm(t_ids):
        # clean_text of the first row whose text_id matches.
        essay_text = df.loc[df['text_id'] == t_id]['clean_text'].to_numpy()[0]

        word_counts = {}
        for word in essay_text.split():
            word_counts[word] = word_counts.get(word, 0) + 1

        counts_by_id[t_id] = word_counts

    return counts_by_id

In [None]:
def frequency_ratio(vocab_dict, t_ids, maxlen):
    """Turn per-essay word counts into fixed-length relative-frequency vectors.

    Every count of an essay is divided by that essay's highest count, then the
    resulting list is padded with zeros / truncated to ``maxlen`` through
    ``equalize_lists``. Positions follow the order of the distinct words in
    the essay's count dict, not the token positions in the text.

    Parameters
    ----------
    vocab_dict : dict
        ``{text_id: {word: count}}`` as produced by ``vocab_dictionary``.
    t_ids : iterable
        Text ids to process; the output follows this order.
    maxlen : int
        Length of every returned vector.

    Returns
    -------
    list of numpy.ndarray
        One float64 array of length ``maxlen`` per id. An essay without any
        words yields an all-zero vector.

    Raises
    ------
    KeyError
        If an id in ``t_ids`` is missing from ``vocab_dict``.
    """
    out = []

    for t_id in tqdm(t_ids):
        counts = list(vocab_dict[t_id].values())

        # An essay with no words has no maximum; fall back to an empty ratio
        # list (padded to zeros below) instead of letting max() raise.
        if counts:
            max_use = max(counts)
            ratios = [count / max_use for count in counts]
        else:
            ratios = []

        # Explicit dtype so the empty case (all integer padding) is float too.
        out.append(np.asarray(equalize_lists(ratios, maxlen), dtype=np.float64))

    return out

In [None]:
def get_processing_hash(df, t_ids, feature, maxlen):
    """Encode every essay as a fixed-length list of spaCy token attribute ids.

    Each ``full_text`` in ``df`` is run through the module-level ``nlp``
    pipeline and, for every token, the integer attribute selected by
    ``feature`` is collected (``token.pos``, ``token.tag`` or ``token.dep`` --
    the integer ids, not the ``*_`` string labels). The per-essay list is then
    padded with zeros / truncated to ``maxlen`` through ``equalize_lists``.

    Every call parses all texts again, so requesting several features costs
    one full pass over the corpus per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``full_text`` column.
    t_ids : object
        Unused; kept so existing calls keep working. Rows are processed in
        ``df`` order.
    feature : str
        ``'pos'`` or ``'tag'``; any other value selects ``token.dep``.
    maxlen : int
        Length of every returned list.

    Returns
    -------
    list of list
        One list of length ``maxlen`` per row of ``df``, in row order.
    """
    attr = feature if feature in ('pos', 'tag') else 'dep'
    all_text = df['full_text'].to_list()

    out = []
    # nlp.pipe processes the texts as a stream and yields the docs in input
    # order; total= lets tqdm show progress for the generator.
    for doc in tqdm(nlp.pipe(all_text), total=len(all_text)):
        token_ids = [getattr(token, attr) for token in doc]
        out.append(equalize_lists(token_ids, maxlen))

    return out

In [None]:
def tokenize_text(df, t_ids, tokenizer, maxlen, pairwise=False):
    """Tokenize each essay's ``full_text``, whole or as adjacent sentence pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_id`` and ``full_text`` columns.
    t_ids : iterable
        Text ids to process.
    tokenizer : object
        Tokenizer providing ``batch_encode_plus`` and ``encode_plus``.
    maxlen : int
        Passed to the tokenizer as ``max_length``.
    pairwise : bool, optional
        ``False`` (default): encode the essay text as a whole, truncated and
        padded to ``maxlen``.
        ``True``: split the essay into sentences and encode every pair of
        adjacent sentences. Truncation is disabled for pairs, so an encoding
        is only padded up to ``maxlen``, never cut down to it.

    Returns
    -------
    tuple of dict
        ``(input_ids, att_mask, token_ids)``, each keyed by text id.
        With ``pairwise=False`` every value is a one-item list holding the
        batch result for the id's rows, so the first row's sequence is at
        ``value[0][0]``. With ``pairwise=True`` every value is a list with one
        sequence per sentence pair (empty if the essay has fewer than two
        sentences).

    Raises
    ------
    IndexError
        If an id in ``t_ids`` has no matching row in ``df``.
    """
    input_ids = {}
    att_mask = {}
    token_ids = {}

    for t_id in tqdm(t_ids):
        # All full_text values whose text_id matches.
        text = df.loc[df['text_id'] == t_id]['full_text'].to_numpy()
        if len(text) == 0:
            raise IndexError(f"text_id {t_id!r} not found in df")

        if not pairwise:
            encodings = [
                tokenizer.batch_encode_plus(text, add_special_tokens=True, return_token_type_ids=True,
                                            truncation=True, padding='max_length', max_length=maxlen)
            ]
        else:
            # Sentence splitting is only needed (and only paid for) in pairwise mode.
            sentences = nltk.tokenize.sent_tokenize(text[0])
            encodings = [
                tokenizer.encode_plus(first, second, add_special_tokens=True, return_token_type_ids=True,
                                      truncation=False, padding='max_length', max_length=maxlen)
                for first, second in zip(sentences, sentences[1:])
            ]

        input_ids[t_id] = [encoded['input_ids'] for encoded in encodings]
        att_mask[t_id] = [encoded['attention_mask'] for encoded in encodings]
        token_ids[t_id] = [encoded['token_type_ids'] for encoded in encodings]

    return input_ids, att_mask, token_ids

In [None]:
def jaccard(list1, list2):
    """Jaccard similarity of the distinct items in two sequences.

    Computed as ``len(A & B) / len(A | B)`` where ``A`` and ``B`` are the sets
    of items in ``list1`` and ``list2``; repeated items count once.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Item collections to compare.

    Returns
    -------
    float
        Value in ``[0.0, 1.0]``; ``0.0`` when both inputs are empty.
    """
    set1, set2 = set(list1), set(list2)
    union = set1 | set2

    # Two empty inputs have an empty union; report no overlap rather than dividing by zero.
    if not union:
        return 0.0

    return len(set1 & set2) / len(union)

In [None]:
def similarity(pair, sep_id=102):
    """Jaccard similarity between the two sentences of one encoded sentence pair.

    ``pair`` is expected to be laid out as
    ``<start> sentence-1 <sep> sentence-2 <sep> <padding...>``. The first item
    is skipped, sentence 1 runs up to the first separator and sentence 2 up to
    the second one, so the closing separator and any padding ids after it are
    left out of the comparison.

    Parameters
    ----------
    pair : list of int
        Token ids of an encoded sentence pair.
    sep_id : int, optional
        Id of the separator token (default ``102`` -- presumably the ``[SEP]``
        id of the BERT tokenizer used in this notebook; confirm if the
        tokenizer changes).

    Returns
    -------
    float
        Result of ``jaccard`` on the two sentences' token ids.

    Raises
    ------
    ValueError
        If ``pair`` contains no ``sep_id`` at all.
    """
    first_sep = pair.index(sep_id)

    try:
        second_sep = pair.index(sep_id, first_sep + 1)
    except ValueError:
        # No closing separator: treat everything after the first one as sentence 2.
        second_sep = len(pair)

    sentence_1 = pair[1:first_sep]
    sentence_2 = pair[first_sep + 1:second_sep]

    return jaccard(sentence_1, sentence_2)

# PRETRAIN SETUP

In [None]:
# Essay ids as a NumPy array, in DataFrame row order; the helper functions iterate over these.
t_ids = df_trial['text_id'].to_numpy()

In [None]:
# Relative word-frequency vectors built from each essay's clean_text.
v_dict = vocab_dictionary(df_trial, t_ids)
freq_ratio = frequency_ratio(v_dict, t_ids, maxlen)

# One fixed-length spaCy attribute sequence per essay for each of pos / tag / dep.
pos_inputs, tag_inputs, dep_inputs = [
    get_processing_hash(df_trial, t_ids, feature, maxlen)
    for feature in ('pos', 'tag', 'dep')
]

In [None]:
# Encode every pair of adjacent sentences per essay; these feed the sentence-similarity feature.
# The whole-essay encodings (batch_ids / batch_masks / batch_tokens) are produced in the cell
# that builds the BERT inputs, so they are not computed a second time here.
unclean_pairs_ids, unclean_pairs_masks, unclean_pairs_tokens = tokenize_text(
    df_trial, t_ids, tokenizer, maxlen, pairwise=True
)

In [None]:
# Per essay: similarity of at most `maxsent` adjacent-sentence pairs,
# zero-padded so that every essay ends up with exactly `maxsent` scores.
unclean_similarity = [
    equalize_lists(
        [similarity(pair) for pair in unclean_pairs_ids[t_id][:maxsent]],
        maxsent,
    )
    for t_id in tqdm(t_ids)
]

In [None]:
# Whole-essay encodings, truncated / padded to maxlen.
batch_ids, batch_masks, batch_tokens = tokenize_text(df_trial, t_ids, tokenizer, maxlen, pairwise=False)

# tokenize_text wraps each essay's batch result in a list, so [0][0] is the essay's own sequence.
bert_ids = [batch_ids[t_id][0][0] for t_id in t_ids]
bert_masks = [batch_masks[t_id][0][0] for t_id in t_ids]
bert_tokens = [batch_tokens[t_id][0][0] for t_id in t_ids]

In [None]:
# One score column per rated dimension.
cohesion_labels = df_trial['cohesion'].to_numpy()
syntax_labels = df_trial['syntax'].to_numpy()
vocabulary_labels = df_trial['vocabulary'].to_numpy()
phraseology_labels = df_trial['phraseology'].to_numpy()
grammar_labels = df_trial['grammar'].to_numpy()
conventions_labels = df_trial['conventions'].to_numpy()

# Token ids, attention masks and segment ids are integers; int32 matches the dtype
# declared on the model's input_ids / attention_mask / token_type_ids Input layers.
bert_ids_np = np.asarray(bert_ids, dtype=np.int32)
bert_masks_np = np.asarray(bert_masks, dtype=np.int32)
bert_tokens_np = np.asarray(bert_tokens, dtype=np.int32)

# Hand-crafted features; dtype is left to NumPy's inference.
freq_ratio_np = np.asarray(freq_ratio)
pos_inputs_np = np.asarray(pos_inputs)
tag_inputs_np = np.asarray(tag_inputs)
dep_inputs_np = np.asarray(dep_inputs)
unclean_similarity_np = np.asarray(unclean_similarity)

In [None]:
length = len(t_ids)

# (n_essays, 6) float64 target matrix.
# Column order: cohesion, syntax, vocabulary, phraseology, grammar, conventions.
labels = np.column_stack([
    cohesion_labels,
    syntax_labels,
    vocabulary_labels,
    phraseology_labels,
    grammar_labels,
    conventions_labels,
]).astype(np.float64)

# BUILD MODEL AND TRAIN

In [None]:
def build_reg_model():
    """Build and compile the six-output regression model.

    Inputs (by name): ``input_ids``, ``attention_mask`` and ``token_type_ids``
    (int32, width ``maxlen``) for the BERT backbone loaded from ``BASE_MODEL``,
    plus the float32 features ``similarity`` (width ``maxsent``) and
    ``freq_ratio``, ``pos``, ``tag``, ``dep`` (width ``maxlen``).

    The first-position vector of the backbone's first output (presumably the
    ``[CLS]`` embedding) is projected to 64 units, concatenated with the
    features and passed through LayerNorm -> Dense(32) -> LayerNorm ->
    Dense(16) -> LayerNorm -> Dense(6). The Dense layers use no activation.

    Returns
    -------
    tf.keras.Model
        Compiled with Adam (learning rate 5e-5) and mean-squared-error loss.
    """
    def _int_input(name):
        # Token-level integer input for the backbone.
        return layers.Input(shape=(maxlen,), dtype=tf.int32, name=name)

    def _float_input(name, width):
        # Hand-crafted float feature vector of the given width.
        return layers.Input((width,), dtype=tf.float32, name=name)

    backbone = TFBertModel.from_pretrained(BASE_MODEL)

    input_ids = _int_input("input_ids")
    attention_mask = _int_input("attention_mask")
    token_type_ids = _int_input("token_type_ids")

    bert_out = backbone({
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_mask,
    })

    sim_in = _float_input('similarity', maxsent)
    freq_in = _float_input('freq_ratio', maxlen)
    pos_in = _float_input('pos', maxlen)
    tag_in = _float_input('tag', maxlen)
    dep_in = _float_input('dep', maxlen)

    bert_vec = layers.Dense(64, activation="linear", dtype="float32")(bert_out[0][:, 0, :])
    x = layers.Concatenate()([sim_in, freq_in, pos_in, tag_in, dep_in, bert_vec])

    # Regression head: normalise, then two Dense + LayerNorm stages.
    x = layers.LayerNormalization()(x)
    for units in (32, 16):
        x = layers.Dense(units)(x)
        x = layers.LayerNormalization()(x)

    out = layers.Dense(6, dtype='float32')(x)

    model = tf.keras.Model(
        inputs=[input_ids, attention_mask, token_type_ids, sim_in, freq_in, pos_in, tag_in, dep_in],
        outputs=[out],
    )
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=tf.keras.losses.MeanSquaredError(),
    )

    return model

In [None]:
# Build the compiled model and print its layer / parameter summary.
regmodel = build_reg_model()
regmodel.summary()
# Optional architecture diagram: plot_model(regmodel, show_shapes=True)

In [None]:
# Stop once val_loss has not improved for 5 epochs. restore_best_weights rolls the model
# back to the best epoch when early stopping triggers, so the weights saved afterwards
# are not the ones from 5 epochs past the best validation loss.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [None]:
# Input arrays, in the same order as the `inputs=` list used when the model was built.
train_inputs = [
    bert_ids_np, bert_masks_np, bert_tokens_np,
    unclean_similarity_np, freq_ratio_np,
    pos_inputs_np, tag_inputs_np, dep_inputs_np,
]

reghistory = regmodel.fit(
    x=train_inputs,
    y=[labels],
    validation_split=0.3,
    batch_size=32,
    epochs=100,
    verbose=1,
    callbacks=[callback],
)

In [None]:
# Write the trained weights to model_callback.h5 in the working directory.
regmodel.save_weights("model_callback.h5")