# HW05: Deep Learning

Remember that these homework assignments are graded on completion. **You can skip one section without losing credit.**

In [None]:
#Import the AG news dataset (same as hw01)
#Download them from here 
#!wget https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv

import pandas as pd
import nltk
# The AG News train.csv referenced above has no header row. Reading it with the
# default header=0 would silently turn the first article into column names and
# drop it from the data, so name the columns explicitly instead.
df = pd.read_csv('train.csv', header=None, names=["label", "title", "lead"])

# Integer class ids used in the file -> readable topic names.
label_map = {1:"world", 2:"sport", 3:"business", 4:"sci/tech"}
def replace_label(x):
    """Return the topic name that ``label_map`` stores for class id ``x``.

    Raises ``KeyError`` when ``x`` is not a key of ``label_map``.
    """
    topic = label_map[x]
    return topic
# Replace the numeric class ids with their topic names.
df["label"] = df["label"].apply(replace_label)
# Model input is the headline followed by the lead paragraph.
df["text"] = df["title"] + " " + df["lead"]
# Only use 10K datapoints. The fixed seed keeps the subsample (and therefore
# every result computed from it) identical across Restart & Run All.
df = df.sample(n=10000, random_state=42)
df.head()

In [None]:
##TODO create a new variable "business" that takes value 1 if the label is business and 0 otherwise
# Binary target: 1 for rows labelled 'business', 0 for every other label.
is_business = df['label'] == 'business'
df['business'] = is_business.astype('int64')

In [None]:
import spacy
# Name of the spaCy pipeline used for tokenisation and lemmatisation below.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)
from sklearn.feature_extraction.text import CountVectorizer

##TODO pre-process text as you did in HW02
def preprocess(text):
    """Run ``text`` through the spaCy pipeline and return its lowercased lemmas.

    Tokens flagged as punctuation or as stop words are skipped; the remaining
    lemmas are returned as a list in document order.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

# One list of cleaned lemmas per article.
df['tokens'] = df['text'].apply(preprocess)

In [None]:
# CountVectorizer expects strings, so glue each token list back together with spaces.
join_with_spaces = ' '.join
df['tokens_clean'] = df['tokens'].apply(join_with_spaces)

In [None]:
#TODO vectorize the pre-processed text using CountVectorizer

# Bag-of-words counts over unigrams and bigrams, keeping at most 1000 terms that
# appear in at least 1% and at most 90% of the documents.
vec = CountVectorizer(min_df=0.01,
                        max_df=.9,  
                        max_features=1000,
                        stop_words='english',
                        ngram_range=(1,2))
X = vec.fit_transform(df['tokens_clean'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old method on older installations.
if hasattr(vec, 'get_feature_names_out'):
    features = vec.get_feature_names_out().tolist()
else:
    features = vec.get_feature_names()
Y = df['business']

#Alternatively, use the output from HW02 if you saved it 

## MLP

Your goal here is to use the features from the vectorized text to predict whether a snippet comes from a business article.

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

## TODO build a MLP model with at least 2 hidden layers with ReLU activation,
# followed by dropout and an output layer with sigmoid activation
# Keras works on plain ndarrays. ``X.todense()`` returns the discouraged
# ``np.matrix`` type, so densify the sparse counts once with ``toarray()``.
X_dense = X.toarray()

# Two ReLU hidden layers, dropout for regularisation, sigmoid output for the
# binary business / not-business target.
model = Sequential()
model.add(Dense(30, input_dim = X.shape[1], activation = 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.summary()

## TODO compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
## TODO fit the model using early stopping to predict the business label
# Stop once validation accuracy has not improved by min_delta for 3 epochs and
# roll back to the best epoch, so the kept model is not the one trained
# `patience` epochs past its best validation score.
es = EarlyStopping(monitor='val_accuracy', min_delta=0.0001,
                   patience=3, mode='auto', restore_best_weights=True)

fit = model.fit(X_dense, Y.values, batch_size=64, epochs=100, callbacks=[es], validation_split=0.3)

## Autoencoders

In [None]:
from keras import backend as K

def r2(y_true, y_pred):
    """Coefficient of determination between ``y_true`` and ``y_pred``.

    Computed with backend ops as ``1 - SS_res / (SS_tot + epsilon)``, where
    both sums run over every element of the tensors passed in and the mean is
    taken over all elements of ``y_true``. The backend epsilon guards against
    division by zero.
    """
    residuals = y_true - y_pred
    deviations = y_true - K.mean(y_true)
    residual_ss = K.sum(K.square(residuals))
    total_ss = K.sum(K.square(deviations))
    denominator = total_ss + K.epsilon()
    return 1 - residual_ss / denominator

##TODO build a simple autoencoder with two compression layers and two reconstruction layers using ReLu
# Densify the sparse counts once as a plain ndarray: the autoencoder uses the
# same data as input and target, and ``todense()`` would build a separate
# ``np.matrix`` copy for each.
X_dense = X.toarray()

# Encoder: n_features -> 100 -> 50 (bottleneck); decoder: 50 -> 100 -> n_features.
model_a = Sequential()
model_a.add(Dense(100, input_dim = X.shape[1], activation = 'relu'))
model_a.add(Dense(50, activation = 'relu', name = 'compression_2'))
model_a.add(Dense(100, activation = 'relu')) # first reconstruction layer
model_a.add(Dense(X.shape[1], activation = 'relu')) # second reconstruction layer
model_a.summary()

##TODO compile and fit the model minimizing "mean_squared_error"
##report r_squared during training (the function r2 defined above)
model_a.compile(loss='mean_squared_error', optimizer = 'adam', metrics=[r2])
model_a_summary = model_a.fit(X_dense, X_dense, epochs=10, validation_split=0.3)

In [None]:
import keras

##TODO compress the vectorized text (X.todense())
# Sub-model that maps the autoencoder input to the output of its 50-unit
# bottleneck layer ("compression_2").
compression = keras.Model(inputs = model_a.input, outputs = model_a.get_layer("compression_2").output)
# predict() processes the data in batches and returns a NumPy array, instead of
# pushing the whole matrix through the model in a single call.
X_compressed = compression.predict(X.toarray())
print(f'Compressed size: {X_compressed.shape}')
# The sparse matrix already knows its shape; no need to densify it again.
print(f'Original size: {X.shape}')

## Embeddings

In [None]:
from keras.preprocessing.text import text_to_word_sequence

##TODO tokenize the text using text_to_word_sequence
# Split every raw article into its word sequence with the Keras helper.
tokenized_text = []
for article in df['text']:
    tokenized_text.append(text_to_word_sequence(article))

print(tokenized_text[10])

In [None]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

length_vocab = 1000    # size of the hashing vocabulary passed to one_hot
max_seq_length = 100   # every sequence is padded / truncated to this length

#TODO create a one_hot representation for each word and truncate/pad the sequences such that they are all of the same length
# one_hot hashes each word of an article to an integer index.
X_one_hot = [one_hot(article, n=length_vocab) for article in df['text']]

# Pad short sequences at the end and cut long ones off at the end.
X_padded = pad_sequences(
    X_one_hot,
    maxlen=max_seq_length,
    padding='post',
    truncating='post',
)
X_padded.shape

In [None]:
from keras.layers import Embedding
from keras.layers import Embedding

##TODO create a sequential model with just one embedding layer and show the model summary
# Single embedding layer: maps each of the length_vocab word ids to a
# 32-dimensional vector for sequences of max_seq_length ids.
model_em = Sequential()
model_em.add(Embedding(length_vocab,32,input_length=max_seq_length))
model_em.summary()

## LSTM

In [None]:
from keras.layers import LSTM

##TODO create a sequential model with an embedding layer,
# a LSTM layer and two hidden layers with ReLu activation function, followed by dropout

# Embedding -> LSTM -> two ReLU hidden layers -> dropout -> sigmoid output.
model_lstm = Sequential([
    Embedding(length_vocab, 32, input_length=max_seq_length, name='embedding'),
    LSTM(32),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation="sigmoid"),
])
model_lstm.summary()

In [None]:
##TODO compile the model and fit it to predict the business label
# Binary classification set-up: cross-entropy loss, Adam optimiser, accuracy metric.
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs on the padded id sequences, holding out 30% for validation.
model_lstm.fit(
    X_padded,
    Y,
    epochs=5,
    batch_size=32,
    validation_split=0.3,
)