In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("train.tsv", sep = "\t")

In [3]:
data.head()

Unnamed: 0,Id,EssaySet,Score1,Score2,EssayText
0,1,1,1,1,Some additional information that we would need...
1,2,1,1,1,"After reading the expirement, I realized that ..."
2,3,1,1,1,"What you need is more trials, a control set up..."
3,4,1,0,0,The student should list what rock is better an...
4,5,1,2,2,For the students to be able to make a replicat...


In [4]:
data["EssayText"][0]

'Some additional information that we would need to replicate the experiment is how much vinegar should be placed in each identical container, how or what tool to use to measure the mass of the four different samples and how much distilled water to use to rinse the four samples after taking them out of the vinegar.'

In [5]:
data.tail()

Unnamed: 0,Id,EssaySet,Score1,Score2,EssayText
17202,27584,10,1,1,white :: white becuase if you live in a hot pl...
17203,27585,10,1,1,light gray :: This color will affect the dogho...
17204,27586,10,1,1,light gray :: i think light gray would work th...
17205,27587,10,1,0,dark gray :: if they painted the doghouse dar...
17206,27588,10,0,1,white :: the black would affect the doghouse b...


In [6]:
data["EssayText"][17206]

'white :: the black would affect the doghouse because t he sun is attracted to black. so if the sun is beating down on the doghou se to long it is possible that the dog could over heat of get very sick. something eles could happen to the dog that is sleeping in the doghouse.'

In [7]:
# Score 2 is not considered because it is mentioned by the business analyst

data.drop(["Id", "Score2"], axis = 1, inplace = True)

In [8]:
data.head(11)

Unnamed: 0,EssaySet,Score1,EssayText
0,1,1,Some additional information that we would need...
1,1,1,"After reading the expirement, I realized that ..."
2,1,1,"What you need is more trials, a control set up..."
3,1,0,The student should list what rock is better an...
4,1,2,For the students to be able to make a replicat...
5,1,1,I would need the information of why you would ...
6,1,1,The information I would need in order to suces...
7,1,3,You would need many more pieces of information...
8,1,3,Some additional information you will need are ...
9,1,2,"Inorder to replicate the experiment, we will n..."


In [9]:
essay_text = data['EssayText']
essay_score = data['Score1']
essay_set = data['EssaySet']

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

sets = [0,1,2,3] # The 4 different classes of Scores

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

In [11]:
for label in sets:
    for t in essay_text[data['Score1']==label]:
        texts.append(t)
        labels.append(label)

In [12]:
texts

['The student should list what rock is better and what rock is the worse in the procedure.',
 'An additional information that i would need in order to replicate the experiment',
 'Well what i understand about this procedure is that you take four samples, put them in different containers that look the same, put vinegar on every sample + finally rinse them out with normal water + let them sit for 30 minutes till they dry. Then your finally going to see the difference between those samples.',
 "I don't know what is going on!",
 'The additional information I would need is to come up with an hypothesis to predict which sample will have the highest mass in acid rain. After that I will have to come up with my dependent and independent variables so I can tell and see each one is different in mass. I will organize the data to see if the information is accurate. Then I will write the conclusion to see if my hypothesis was correct and to see if my results were correct.',
 'In order to replicate t

In [13]:
len(max(texts))

96

In [14]:
# Creating one hot representation

from tensorflow.keras.preprocessing.text import one_hot

voc_size = 10000
encoded_text = [one_hot(word, voc_size) for word in texts]
encoded_text[0:2]

[[4171,
  9345,
  4286,
  1513,
  7051,
  4335,
  326,
  8913,
  390,
  7051,
  4335,
  326,
  4171,
  9828,
  1821,
  4171,
  7086],
 [5649, 3997, 5399, 7353, 7294, 6140, 3827, 1821, 5097, 862, 2062, 4171, 5886]]

In [15]:
# Pading encoded text to make each sentence of equal size

from tensorflow.keras.preprocessing.sequence import pad_sequences

max_len = 100
pad_text = pad_sequences(encoded_text, maxlen= max_len, padding='pre')
print(pad_text[0:1])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0 4171
  9345 4286 1513 7051 4335  326 8913  390 7051 4335  326 4171 9828 1821
  4171 7086]]


In [16]:
# Embedding

from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential

model = Sequential()
model.add(Embedding(voc_size, 10, input_length=max_len))
model.compile("adam", "mse")

In [17]:
# summarize the model
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 10)           100000    
Total params: 100,000
Trainable params: 100,000
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Embedded metrics

model.predict(pad_text[0:1])

array([[[ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.03585936,
          0.04629881,  0.02308437, -0.03385071, -0.04443775,
         -0.03342488,  0.0072187 ],
        [ 0.03645409,  0.03577489,  0.02143929, -0.0