# Versions
1. Version 4- Used Glove 6B 200D
2. Version 5- Used Glove 840B 300d 

In [15]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import re
import pickle

from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import mean_squared_error

import nltk
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow.keras.layers import LSTM,RNN,Conv2D,Dense,Flatten,GlobalAveragePooling2D,Embedding,Bidirectional,Input,Dropout,Conv1D,MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint,ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.optimizers import Adam,SGD


SEED=42

In [2]:
paths=["/kaggle/input/commonlitreadabilityprize/sample_submission.csv",
       '/kaggle/input/commonlitreadabilityprize/train.csv',
       '/kaggle/input/commonlitreadabilityprize/test.csv']
df_train=pd.read_csv(paths[1])
df_test=pd.read_csv(paths[2])
df_ss=pd.read_csv(paths[0])

In [3]:
df_train

Unnamed: 0,id,url_legal,license,excerpt,target,standard_error
0,c12129c31,,,When the young people returned to the ballroom...,-0.340259,0.464009
1,85aa80a4c,,,"All through dinner time, Mrs. Fayre was somewh...",-0.315372,0.480805
2,b69ac6792,,,"As Roger had predicted, the snow departed as q...",-0.580118,0.476676
3,dd1000b26,,,And outside before the palace a great garden w...,-1.054013,0.450007
4,37c1b32fb,,,Once upon a time there were Three Bears who li...,0.247197,0.510845
...,...,...,...,...,...,...
2829,25ca8f498,https://sites.ehe.osu.edu/beyondpenguins/files...,CC BY-SA 3.0,When you think of dinosaurs and where they liv...,1.711390,0.646900
2830,2c26db523,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,So what is a solid? Solids are usually hard be...,0.189476,0.535648
2831,cd19e2350,https://en.wikibooks.org/wiki/Wikijunior:The_E...,CC BY-SA 3.0,The second state of matter we will discuss is ...,0.255209,0.483866
2832,15e2e9e7a,https://en.wikibooks.org/wiki/Geometry_for_Ele...,CC BY-SA 3.0,Solids are shapes that you can actually touch....,-0.215279,0.514128


In [4]:
df_test

Unnamed: 0,id,url_legal,license,excerpt
0,c0f722661,,,My hope lay in Jack's promise that he would ke...
1,f0953f0a5,,,Dotty continued to go to Mrs. Gray's every nig...
2,0df072751,,,It was a bright and cheerful scene that greete...
3,04caf4e0c,https://en.wikipedia.org/wiki/Cell_division,CC BY-SA 3.0,Cell division is the process by which a parent...
4,0e63f8bea,https://en.wikipedia.org/wiki/Debugging,CC BY-SA 3.0,Debugging is the process of finding and resolv...
5,12537fe78,,,"To explain transitivity, let us look first at ..."
6,965e592c0,https://www.africanstorybook.org/#,CC BY 4.0,Milka and John are playing in the garden. Her ...


In [5]:
df_train['excerpt'][0]

'When the young people returned to the ballroom, it presented a decidedly changed appearance. Instead of an interior scene, it was a winter landscape.\nThe floor was covered with snow-white canvas, not laid on smoothly, but rumpled over bumps and hillocks, like a real snow field. The numerous palms and evergreens that had decorated the room, were powdered with flour and strewn with tufts of cotton, like snow. Also diamond dust had been lightly sprinkled on them, and glittering crystal icicles hung from the branches.\nAt each end of the room, on the wall, hung a beautiful bear-skin rug.\nThese rugs were for prizes, one for the girls and one for the boys. And this was the game.\nThe girls were gathered at one end of the room and the boys at the other, and one end was called the North Pole, and the other the South Pole. Each player was given a small flag which they were to plant on reaching the Pole.\nThis would have been an easy matter, but each traveller was obliged to wear snowshoes.'

# Cleaning

In [6]:
def clean(string):
    pattern='\n'
    pattern1= '\''
    test= re.sub(pattern,'',string)
    string= re.sub(pattern1,'',string)

    return string

In [7]:
#remove stopwords 
nltk.download('stopwords')
stop=stopwords.words('english')


def remove_stopwords(df):
    new_text=[]
    for i in range(len(df)):
        test=[j for j in df['excerpt'][i].split() if j not in stop]
        new_text.append(' '.join(test))
    new_text=pd.Series(new_text,name='cleaned_text')
    df=pd.concat([df,new_text],axis='columns',copy=False)
    return df

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>


In [8]:
df_train['excerpt']=df_train['excerpt'].map(clean)
df_test['excerpt']=df_test['excerpt'].map(clean)


In [9]:
df_train= remove_stopwords(df_train)
df_test= remove_stopwords(df_test)


In [10]:
length=[]
for i in df_train['cleaned_text']:
    length.append(len(i))
    
length= np.array(length)
print(length.mean(),length.min(),length.max())

681.4446012702894 391 1101


In [11]:
X=df_train['cleaned_text']
y=df_train['target']
test=df_test['cleaned_text']



In [12]:
df_train['cleaned_text'][0]

'When young people returned ballroom, presented decidedly changed appearance. Instead interior scene, winter landscape. The floor covered snow-white canvas, laid smoothly, rumpled bumps hillocks, like real snow field. The numerous palms evergreens decorated room, powdered flour strewn tufts cotton, like snow. Also diamond dust lightly sprinkled them, glittering crystal icicles hung branches. At end room, wall, hung beautiful bear-skin rug. These rugs prizes, one girls one boys. And game. The girls gathered one end room boys other, one end called North Pole, South Pole. Each player given small flag plant reaching Pole. This would easy matter, traveller obliged wear snowshoes.'

In [13]:
VOCAB= 25000
max_len=681
oov_token='<OOV_TOKEN>'
truncate_type='post'
padding_type='post'
embedding_dim=16


# Tokenizing

In [14]:
#tokenize
tokenizer=Tokenizer(oov_token=oov_token,num_words=VOCAB)
tokenizer.fit_on_texts(X)
word_index = tokenizer.word_index
print(len(word_index))

#

train_sequences= tokenizer.texts_to_sequences(X)
test_sequences= tokenizer.texts_to_sequences(test)

train_padding = pad_sequences(train_sequences, maxlen=max_len, padding= padding_type, truncating= truncate_type)
test_padding = pad_sequences(test_sequences, maxlen=max_len, padding= padding_type , truncating= truncate_type)

28177


# Glove Embedding Prep

In [22]:
import pickle
from time import time

t = time()
with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as fp:
    embeddings_index  = pickle.load(fp)

In [24]:
# create an embedding matrix for the words we have in the dataset
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
print(f'Shape of Embedding: {embedding_matrix.shape}')

100%|██████████| 28177/28177 [00:00<00:00, 247916.22it/s]

Shape of Embedding: (28178, 300)





In [23]:
"""embeddings_index = {}
with open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

embeddings_matrix = np.zeros(((len(word_index)+1),200))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector """

"embeddings_index = {}\nwith open('../input/pickled-glove840b300d-for-10sec-loading/glove.840B.300d.pkl', 'rb') as f:\n    for line in f:\n        values = line.split()\n        word = values[0]\n        coefs = np.asarray(values[1:], dtype='float32')\n        embeddings_index[word] = coefs\n\nembeddings_matrix = np.zeros(((len(word_index)+1),200))\nfor word, i in word_index.items():\n    embedding_vector = embeddings_index.get(word)\n    if embedding_vector is not None:\n        embeddings_matrix[i] = embedding_vector "

# Model Construction

In [26]:
input_layer = Input(shape=(681,))
x= embedding_layer= Embedding(len(word_index)+1,300,trainable=False,weights=[embedding_matrix])(input_layer)
x= Conv1D(32,3,activation='relu')(x)
x=MaxPooling1D(pool_size=2)(x)
x= Bidirectional(LSTM(150))(x)
x=Dense(128,activation='relu')(x)
x=Dense(64,activation='relu')(x)
x=Dense(32,activation='relu')(x)
x=Dense(16,activation='relu')(x)
predictions = Dense(1,activation='linear')(x)

model1=Model(inputs=input_layer, outputs= predictions)

print(model1.summary())


model1.compile(
    optimizer= Adam(learning_rate=1e-5),
    loss='mse',
    metrics='mae'
)

model_checkpoint=ModelCheckpoint('golve_840b300d.h5',monitor='loss',save_best_only=True,mode='min')
early_stopping=EarlyStopping(monitor="loss",min_delta=0,patience=10,verbose=0,mode="min",restore_best_weights=True)
reduce_lr=ReduceLROnPlateau(monitor="loss",factor=0.2,patience=10,min_lr=0.00001)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 681)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 681, 300)          8453400   
_________________________________________________________________
conv1d (Conv1D)              (None, 679, 32)           28832     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 339, 32)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 300)               219600    
_________________________________________________________________
dense (Dense)                (None, 128)               38528     
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256  

# Submission code

In [27]:
model1.fit(train_padding,y,epochs=200,batch_size=256, callbacks = [model_checkpoint,reduce_lr,early_stopping])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f61d1b5b090>

In [28]:
y_pred = model1.predict(test_padding)

In [32]:
sub_scores=[]
for i in y_pred:
    sub_scores.append(i)
sub_scores

[-1.280384,
 -0.2589453,
 -0.74497914,
 -2.2587895,
 -1.6164868,
 -0.44256675,
 0.39995652]

In [33]:
sub=pd.DataFrame({'id':df_ss['id'],'target':y_pred})
sub.to_csv('submission.csv',index=False)
sub.head()

Unnamed: 0,id,target
0,c0f722661,-1.280384
1,f0953f0a5,-0.258945
2,0df072751,-0.744979
3,04caf4e0c,-2.25879
4,0e63f8bea,-1.616487
