<a href="https://colab.research.google.com/github/carolflyjs/cs230/blob/master/Base_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import os
import tensorflow as tf
import tensorflow_hub as hub
import keras
import numpy as np
from keras.layers import Input, Dense, concatenate, Dot, Embedding
from keras.engine import Layer
from keras.models import Model
from keras import backend as K
from keras.preprocessing import text
from keras.preprocessing import sequence

In [0]:
def load_file(path, delimiter=","):
    """Read a delimited file and replace its string labels with integer class ids.

    The "label" column must contain only the codes BT, NT, NPT, PT, RT or URT.
    NPT and PT share class 2, RT and URT share class 3, so the result uses the
    four class ids 0-3. Any other value raises ``KeyError``.

    Args:
        path: Location of the file, forwarded to ``pd.read_csv``.
        delimiter: Field separator, forwarded to ``pd.read_csv``.

    Returns:
        The loaded DataFrame with its "label" column converted to class ids.
    """
    label_to_class = {"BT": 0, "NT": 1, "NPT": 2, "PT": 2, "RT": 3, "URT": 3}
    frame = pd.read_csv(path, delimiter=delimiter)
    # Direct dict indexing (not Series.map) so an unknown code fails loudly
    # instead of silently becoming NaN.
    frame["label"] = frame["label"].apply(label_to_class.__getitem__)
    return frame

def data_prep(df, x_columns, y_columns, train_percent = 0.8, seed=None):
    """Randomly split a DataFrame into train/test feature and label frames.

    Each row is assigned to the training set independently with probability
    ``train_percent``, so the realised train fraction is only approximately
    ``train_percent``.

    Args:
        df: DataFrame holding both feature and label columns.
        x_columns: List of feature column names.
        y_columns: List of label column names.
        train_percent: Probability that a row goes to the training split.
        seed: Optional integer seed. When given, the split is reproducible and
            does not touch NumPy's global random state. When ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test)`` of DataFrames.
    """
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        # A private generator keeps the split repeatable across kernel restarts.
        draws = np.random.RandomState(seed).rand(len(df))
    msk = draws < train_percent
    df_train = df[msk]
    df_test = df[~msk]
    X_train = df_train[x_columns]
    Y_train = df_train[y_columns]
    X_test = df_test[x_columns]
    Y_test = df_test[y_columns]
    return X_train, Y_train, X_test, Y_test

In [0]:
# Load the labelled sentence pairs and split them into train / test portions.
df = load_file("./train.csv")
X_train_raw, Y_train, X_test_raw, Y_test = data_prep(
    df,
    x_columns=["source", "target"],
    y_columns=["label"],
)
# One-hot encode the four integer class ids for the categorical cross-entropy loss.
Y_train_labels, Y_test_labels = (
    keras.utils.to_categorical(labels, num_classes=4)
    for labels in (Y_train, Y_test)
)

In [310]:
# Row count per class id, to check how balanced the four classes are.
display(df.groupby("label").count())

Unnamed: 0_level_0,source,target
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,10010,10010
1,10008,10008
2,19982,19982
3,20207,20207


48238

In [279]:

# Report how many rows landed in each split produced by data_prep.
print("length of training =", len(X_train_raw))
print("length of testing =", len(X_test_raw))

length of training = 48061
length of testing = 12146


In [0]:
def tokenize(X_train, X_test):
    """Convert the "source"/"target" text columns into lists of integer word ids.

    One Keras tokenizer is fitted on every sentence of both frames and then
    applied to each of the four columns.

    Args:
        X_train: DataFrame with text columns "source" and "target".
        X_test: DataFrame with the same two columns.

    Returns:
        Tuple ``(X_train_transformed, X_test_transformed, max_length, vocab_size)``:
        copies of the inputs whose two columns hold word-id lists, the length
        of the longest id list in any column (0 when there are no rows), and
        the number of distinct words seen by the tokenizer plus one.
    """
    text_columns = ["source", "target"]

    # The notebook imports the `text` module, not the class itself, so a bare
    # `Tokenizer()` raises NameError on a fresh kernel.
    t = text.Tokenizer()
    # NOTE(review): the vocabulary is built from the test split as well as the
    # training split; fit on X_train only if that leakage matters.
    fit_text = pd.concat(
        [frame[column] for frame in (X_train, X_test) for column in text_columns]
    )
    t.fit_on_texts(fit_text)

    # Work on copies so the caller's raw frames keep their original text.
    X_train_transformed, X_test_transformed = X_train.copy(), X_test.copy()
    for raw, transformed in ((X_train, X_train_transformed), (X_test, X_test_transformed)):
        for column in text_columns:
            transformed[column] = t.texts_to_sequences(raw[column])

    max_length = max(
        (
            len(ids)
            for frame in (X_train_transformed, X_test_transformed)
            for column in text_columns
            for ids in frame[column]
        ),
        default=0,
    )
    # +1 because Keras word ids start at 1; id 0 is left for padding.
    vocab_size = len(t.index_word) + 1
    return X_train_transformed, X_test_transformed, max_length, vocab_size

In [0]:
# Turn the raw sentence pairs into word-id sequences. max_value (longest
# sequence) and vocab_size are reused by the padding and model cells below.
X_train_tokenized, X_test_tokenized, max_value, vocab_size = tokenize(X_train_raw, X_test_raw)

In [0]:
def pad(X, max_value):
    """Pad the tokenized columns and join them side by side into one frame.

    Args:
        X: DataFrame whose "source" and "target" columns hold integer sequences.
        max_value: Passed to ``pad_sequences`` as ``maxlen`` for both columns.

    Returns:
        A new DataFrame (fresh default index and integer column labels) in
        which each row is the padded source sequence followed by the padded
        target sequence.
    """
    padded_columns = [
        sequence.pad_sequences(X[column], maxlen=max_value)
        for column in ("source", "target")
    ]
    # Both arrays are 2-D, so hstack joins them along the feature axis.
    return pd.DataFrame(data=np.hstack(padded_columns))

In [0]:
# Pad both splits with the same max_value so train and test rows share one width.
X_train_padded = pad(X_train_tokenized, max_value=max_value)
X_test_padded = pad(X_test_tokenized, max_value=max_value)

In [0]:
def build_naive_embedding_model(input_length=20, vocab=None, embedding_dim=10, num_classes=4):
    """Build and compile the baseline embedding -> flatten -> softmax classifier.

    The defaults reproduce the original hard-coded model. Pass ``input_length``
    explicitly (e.g. the number of columns of the padded input) when the data's
    sequence length is not 20, otherwise ``fit`` fails on a shape mismatch.

    Args:
        input_length: Number of word ids per example (width of the padded input).
        vocab: Size of the embedding vocabulary. When ``None``, the
            notebook-level ``vocab_size`` variable is used.
        embedding_dim: Size of each word embedding vector.
        num_classes: Number of output classes of the softmax layer.

    Returns:
        The compiled Keras ``Model``. Its summary is printed as a side effect.
    """
    if vocab is None:
        # Fall back to the global set by the tokenize cell, as the original did.
        vocab = vocab_size

    # TODO: an earlier draft sketched a two-input variant (ELMo embeddings of
    # source and target combined with a Dot layer); not implemented here.
    X_input = Input(shape=(input_length,), dtype="int32")
    X = Embedding(vocab, embedding_dim)(X_input)
    X = keras.layers.Flatten("channels_last")(X)
    X = Dense(num_classes, activation="softmax")(X)

    model = Model(inputs=[X_input], outputs=X)

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model

In [285]:
# Build the baseline model and train it on the padded training split.
# No validation data is passed, so `history` only records training metrics.
model = build_naive_embedding_model()
history = model.fit([X_train_padded], Y_train_labels, epochs=20, batch_size=32, shuffle=True) 

Model: "model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_27 (InputLayer)        (None, 20)                0         
_________________________________________________________________
embedding_25 (Embedding)     (None, 20, 10)            112120    
_________________________________________________________________
flatten_21 (Flatten)         (None, 200)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 4)                 804       
Total params: 112,924
Trainable params: 112,924
Non-trainable params: 0
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fac54643668>

In [286]:
# Score the trained model on the held-out split; the result is [loss, accuracy],
# matching the loss and metrics given to compile().
model.evaluate(X_test_padded, Y_test_labels, verbose=0)

[0.8912734743661622, 0.7602502881508967]