<a href="https://colab.research.google.com/github/carolflyjs/cs230/blob/master/Data_Processing_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install pyspellchecker 

Collecting pyspellchecker
[?25l  Downloading https://files.pythonhosted.org/packages/04/d1/ec4e830e9f9c1fd788e1459dd09279fdf807bc7a475579fd7192450b879c/pyspellchecker-0.5.4-py2.py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 2.8MB/s 
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.5.4


In [0]:
from spellchecker import SpellChecker
import pandas as pd
import os
import tensorflow as tf
import tensorflow_hub as hub
import keras
import numpy as np
from keras.layers import Input, Dense, concatenate, Dot, Embedding, LSTM, GRU
from keras.engine import Layer
from keras.models import Model
from keras import backend as K
from keras.preprocessing import text
from keras.preprocessing import sequence
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

Using TensorFlow backend.


In [0]:
# Label string -> integer class index used throughout the notebook.
class_dict = dict(BT=0, NT=1, NPT=2, PT=3, URT=4)

# Integer class index -> label string (the inverse of class_dict).
reverse_class_dict = {index: name for name, index in class_dict.items()}

def load_file(path, delimiter=",", random_state=None):
    """Read a labelled pair file, shuffle it, and encode the labels as integers.

    Args:
        path: Location of the delimited text file. It is read with
            ``pd.read_csv`` defaults, so the first row is treated as a header;
            the columns are then renamed to ``source``, ``target``, ``label``
            (the file must therefore have exactly three columns).
        delimiter: Field separator passed to ``pd.read_csv``.
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle``.
            ``None`` (the default) keeps the previous unseeded behaviour;
            pass an int for a reproducible row order.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose ``label`` column holds the
        integer class index from the module-level ``class_dict``.

    Raises:
        KeyError: If the file contains a label that is not a key of
            ``class_dict`` (missing labels count as unknown).
    """
    df = pd.read_csv(path, delimiter=delimiter)
    df.columns = ["source", "target", "label"]
    df = shuffle(df, random_state=random_state).reset_index(drop=True)

    # Use the shared module-level mapping instead of a shadowing local copy.
    # Series.map leaves NaN where a label has no entry, which lets us report
    # every offending label at once instead of failing on the first one.
    encoded = df["label"].map(class_dict)
    unknown = df.loc[encoded.isna(), "label"]
    if not unknown.empty:
        raise KeyError(
            "unknown label(s) in {}: {}".format(path, sorted(set(map(str, unknown))))
        )
    df["label"] = encoded.astype("int64")
    return df

def data_prep(df, x_columns, y_columns, num_classes=5):
    """Split a frame into model inputs and one-hot encoded targets.

    Args:
        df: DataFrame holding both the feature and the label columns.
        x_columns: Column names to return as the inputs.
        y_columns: Column name(s) holding the integer class index.
        num_classes: Width of the one-hot encoding. Defaults to 5, the number
            of classes this notebook uses; exposed so the helper is not tied
            to that label set.

    Returns:
        ``(X_train, Y_train)`` where ``X_train`` is ``df[x_columns]`` and
        ``Y_train`` is the result of ``keras.utils.to_categorical`` applied to
        ``df[y_columns]``.
    """
    X_train = df[x_columns]
    Y_train = keras.utils.to_categorical(df[y_columns], num_classes=num_classes)
    return X_train, Y_train

print('load_file(path, delimiter=","): return df')
print('data_prep(df, x_columns, y_columns): return X_train, Y_train')

load_file(path, delimiter=","): return df
data_prep(df, x_columns, y_columns): return X_train, Y_train


In [0]:
def removeSpellingErrors(df):
    """Keep only the rows whose source and target contain no unknown words.

    Each row of ``df`` is unpacked as ``(source, target, label)``, so the
    frame must have exactly those three columns in that order. Both texts are
    split on whitespace and checked with ``SpellChecker.unknown``; a row is
    kept only when neither text yields any unknown word.

    Returns:
        A new DataFrame with columns ``source``, ``target``, ``label`` and a
        fresh default index, containing the surviving rows in their original
        order.
    """
    checker = SpellChecker()

    def is_clean(sentence):
        # True when the checker reports no unknown word in the sentence.
        return len(checker.unknown(sentence.split())) == 0

    kept_rows = [
        (source, target, label)
        for source, target, label in df.values.tolist()
        if is_clean(source) and is_clean(target)
    ]
    return pd.DataFrame(kept_rows, columns=["source", "target", "label"])

print('removeSpellingErrors(df): return new_df')

removeSpellingErrors(df): return new_df


In [0]:
def tokenize(X_list):
    """Fit one Keras tokenizer on several frames and encode their text columns.

    Args:
        X_list: List of DataFrames, each with ``source`` and ``target`` text
            columns. The tokenizer vocabulary is fit on the text of *all*
            frames in the list.

    Returns:
        A 4-tuple ``(frames, max_value, vocab_size, tokenizer)``:
        ``frames`` is a list of copies of the inputs whose ``source`` and
        ``target`` columns hold integer sequences; ``max_value`` is the
        longest sequence length seen in any of those columns; ``vocab_size``
        is ``len(tokenizer.index_word) + 1`` (presumably the extra slot is for
        the padding index 0 -- confirm against the embedding layer); and
        ``tokenizer`` is the fitted ``text.Tokenizer``.
    """
    text_columns = ("source", "target")
    tokenizer = text.Tokenizer()

    # Same fitting order as before: source then target, frame by frame.
    corpus = pd.concat(
        [frame[column] for frame in X_list for column in text_columns]
    )
    tokenizer.fit_on_texts(corpus)

    encoded_frames = []
    for frame in X_list:
        encoded = frame.copy()  # leave the caller's frame untouched
        for column in text_columns:
            encoded[column] = tokenizer.texts_to_sequences(frame[column])
        encoded_frames.append(encoded)

    longest = 0
    for encoded in encoded_frames:
        for column in text_columns:
            column_longest = encoded[column].map(len).max()
            if column_longest > longest:
                longest = column_longest

    vocabulary_size = 1 + len(tokenizer.index_word)
    return encoded_frames, longest, vocabulary_size, tokenizer


In [0]:
def pad(X, max_value):
    """Pad the encoded text columns to one length and join them side by side.

    Args:
        X: Frame whose ``source`` and ``target`` columns hold integer
            sequences.
        max_value: Length passed to ``sequence.pad_sequences`` as ``maxlen``
            for both columns.

    Returns:
        A DataFrame with default integer column labels in which each row is
        the padded source sequence followed by the padded target sequence.
    """
    padded_source, padded_target = (
        sequence.pad_sequences(X[column], maxlen=max_value)
        for column in ("source", "target")
    )
    side_by_side = np.concatenate((padded_source, padded_target), axis=1)
    return pd.DataFrame(data=side_by_side)



In [0]:
def tokenize_and_pad(X_train):
    """Tokenize a single frame and pad its sequences in one step.

    Args:
        X_train: DataFrame with ``source`` and ``target`` text columns.

    Returns:
        ``(X_train_padded, max_value, vocab_size, tokenizer)`` where
        ``X_train_padded`` is the output of ``pad`` for the tokenized frame
        and the remaining values are passed through from ``tokenize``.
    """
    # tokenize() takes a list of frames and returns a list of encoded frames,
    # so wrap the single frame on the way in and unwrap it on the way out.
    # Passing the frame directly made tokenize() iterate over column labels
    # and handed pad() a list, both of which raise.
    tokenized_frames, max_value, vocab_size, tokenizer = tokenize([X_train])
    X_train_padded = pad(tokenized_frames[0], max_value=max_value)
    return X_train_padded, max_value, vocab_size, tokenizer

print('tokenize_and_pad(X_train), return X_train_padded, max_value, vocab_size, tokenizer')

tokenize_and_pad(X_train), return X_train_padded, max_value, vocab_size, tokenizer


In [0]:
container = []

# Load each split from Google Drive and separate text inputs from one-hot labels.
# The paths assume Drive is available under /content/drive; the mount step is
# not shown in this part of the notebook.
df_train = load_file("/content/drive/My Drive/cs230/train.csv")
X_train_raw, Y_train = data_prep(
    df_train, x_columns=["source", "target"], y_columns=["label"]
)

df_dev = load_file("/content/drive/My Drive/cs230/dev.csv")
X_dev_raw, Y_dev = data_prep(
    df_dev, x_columns=["source", "target"], y_columns=["label"]
)

df_test = load_file("/content/drive/My Drive/cs230/test.csv")
X_test_raw, Y_test = data_prep(
    df_test, x_columns=["source", "target"], y_columns=["label"]
)

In [0]:
# One tokenizer is fit on the text of all three splits, so they share a single
# vocabulary and a single maximum sequence length.
(
    tokenized_list,
    max_value,
    vocab_size,
    tokenizer,
) = tokenize([df_train, df_dev, df_test])

In [0]:
# tokenized_list holds the train, dev and test frames in that order; pad each
# one to the shared max_value.
X_train_tokenized, X_dev_tokenized, X_test_tokenized = [
    pad(tokenized_list[position], max_value) for position in range(3)
]

In [0]:
# Per-class weights keyed by integer class index
# (0=BT, 1=NT, 2=NPT, 3=PT, 4=URT per class_dict).
class_weight = dict(enumerate((1, 1, 5, 5, 2)))