This is the master notebook: the TAs can run it from start to finish to reproduce the results.

In [1]:
import numpy as np
import pandas as pd

from src.mlp import MLP
from src.utils import get_batches, shuffle, train_val_split


%load_ext autoreload
%autoreload 2

In [2]:
# Define the path to the data
# NOTE(review): pd.read_pickle unpickles the file, and unpickling can execute arbitrary
# code embedded in it, so only load essay_df.pkl from a trusted source.
data_path = './data/essay_df.pkl'
# Load the pickled DataFrame of essays (its columns are listed by df.info() in the next cell).
df = pd.read_pickle(data_path)

In [3]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8870 entries, 0 to 10683
Data columns (total 8 columns):
essay_id         8870 non-null int64
essay_set        8870 non-null int64
essay            8870 non-null object
domain1_score    8870 non-null int64
essays_embed     8870 non-null object
word_count       8870 non-null int64
max_score        8870 non-null float64
norm_score       8870 non-null float64
dtypes: float64(2), int64(4), object(2)
memory usage: 623.7+ KB


In [4]:
# Pull the model inputs and the target out of the frame as NumPy arrays:
# X comes from the object-typed 'essays_embed' column, y from the float 'norm_score' column.
X, y = (np.array(df[column]) for column in ('essays_embed', 'norm_score'))

In [5]:
# Join the per-essay arrays held in X along a new leading axis, giving one 3-D array
# indexed first by essay.
X_stacked = np.stack(X)  # axis=0 is np.stack's default
print(X_stacked.shape)

(8870, 600, 100)


In [6]:
# Superseded experiment, kept for reference: flatten each essay's (600, 100) embedding
# matrix into a single 60000-long row. It is stored under its own name so that it no
# longer collides with the X_flat built in the next cell (the averaged features that
# the shuffle/split cells below actually consume); previously both cells assigned
# X_flat, so downstream results depended on which of the two had been run last.
X_flat_full = np.reshape(X_stacked, [X_stacked.shape[0], -1])
print(X_flat_full.shape)

(8870, 60000)


In [6]:
# Instead of flattening, average over axis 1 (the 600-long axis of X_stacked), which
# leaves one 100-dimensional vector per essay.
X_flat = X_stacked.mean(axis=1)


In [7]:
# Shuffle features and targets with the project helper (src.utils.shuffle).
# NOTE(review): presumably both arrays are permuted together so rows stay aligned, and
# no random seed is set anywhere above, so the order (and the split below) may differ
# between runs; confirm against src/utils.py.
X_shuffled, y_shuffled = shuffle(X_flat, y)

In [8]:
# Sanity check: show the shape of the shuffled feature matrix.
print(X_shuffled.shape)

(8870, 100)


In [10]:
# Split the shuffled data into training and validation parts; train_prop=0.8 puts
# 80% of the rows in the training set.
X_train, y_train, X_val, y_val = train_val_split(X_shuffled, y_shuffled, train_prop=0.8)
# Print the four shapes on one line, in the order X_train, X_val, y_train, y_val.
print(*(split.shape for split in (X_train, X_val, y_train, y_val)))

(7096, 100) (1774, 100) (7096,) (1774,)


In [46]:
# Model and batching configuration.
# The network's input width is the number of feature columns in the training matrix.
input_dim = X_train.shape[1]
# Batch size handed to get_batches.
batch_size = 1

# Batch source consumed by my_net.train() in the next cell.
# NOTE(review): presumably get_batches returns a stateful generator, so re-run this
# cell before re-running training; confirm against src/utils.py.
batch_gen = get_batches(X_train, y_train, batch_size, net_type='mlp')

# MLP with two hidden layers (30 and 100 units), 12 output classes and L2
# regularisation strength 1e-4, as given by the constructor arguments.
my_net = MLP(input_dim=input_dim, hidden_dims=[30, 100], num_classes=12, l2_reg=1e-4)

In [47]:
# Train the MLP on batches drawn from batch_gen for 20 epochs at a learning rate of
# 1e-4, passing the validation split so accuracy can be reported during training.
# NOTE(review): in the log below the "accuracy" values are fractions (e.g. 0.518),
# even though the "Best validation accuracy!" message appends a '%' sign.
my_net.train(gen=batch_gen, X_val=X_val, y_val=y_val, n_epochs=20, lr=1e-4)

loss for counter 7000 is 2.0067946910858154
counter 7000: valid acc = 0.45321306586265564
[9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
[ 9  9  9  9 12  9  6  6  9  6  6  9  9  6  3  9  9  6  6  9]
Best validation accuracy! iteration:7000 accuracy: 0.45321306586265564%
loss for counter 14000 is 0.004160015378147364
counter 14000: valid acc = 0.45321306586265564
[9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
[ 9  9  9  9 12  9  6  6  9  6  6  9  9  6  3  9  9  6  6  9]
loss for counter 21000 is 1.4545613527297974
counter 21000: valid acc = 0.4565952718257904
[9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9]
[ 9  9  9  9 12  9  6  6  9  6  6  9  9  6  3  9  9  6  6  9]
Best validation accuracy! iteration:21000 accuracy: 0.4565952718257904%
loss for counter 28000 is 1.2490569353103638
counter 28000: valid acc = 0.5180383324623108
[9 9 9 9 9 9 6 6 9 9 9 9 9 9 9 6 9 9 9 9]
[ 9  9  9  9 12  9  6  6  9  6  6  9  9  6  3  9  9  6  6  9]
Best validation accuracy! iteration:28000 accuracy: 0.5180383324623108%
lo