In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical


## Define cleaning functions

In [2]:
def remove_html(text):
    soup = BeautifulSoup(text, 'lxml')
    html_free = soup.get_text()
    return html_free

def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

## Load Data

In [3]:
df = pd.read_csv('wars_trek.csv', encoding='ISO-8859-1')

## Change data type to avoid conflict

In [4]:
df['Input'] = df['Input'].astype(str)

## Remove punctuation

In [5]:
df['clean'] = df['Input'].apply(lambda x: remove_punctuation(x))
df['clean']

0        turmoil has engulfed the galactic republic the...
1                     outlaying star systems is in dispute
2        hoping to resolve the matter with a blockade o...
3        greedy trade federation has stopped all shippi...
4                                                    naboo
                               ...                        
78276     Now I don  t pretend to tell you how to find ...
78277      To survive is not enough To simply exist is ...
78278      It  s not safe out here It  s wondrous with ...
78279      Your will to survive your love of life your ...
78280    You might also like these memorable Supernatur...
Name: clean, Length: 78281, dtype: object

## Tokenize the rows

In [6]:
tokenizer1 = RegexpTokenizer(r'\w+')

In [7]:
df['clean'] = df['clean'].apply(lambda x: tokenizer1.tokenize(x.lower()))
df['clean']

0        [turmoil, has, engulfed, the, galactic, republ...
1              [outlaying, star, systems, is, in, dispute]
2        [hoping, to, resolve, the, matter, with, a, bl...
3        [greedy, trade, federation, has, stopped, all,...
4                                                  [naboo]
                               ...                        
78276    [now, i, don, t, pretend, to, tell, you, how, ...
78277    [to, survive, is, not, enough, to, simply, exi...
78278    [it, s, not, safe, out, here, it, s, wondrous,...
78279    [your, will, to, survive, your, love, of, life...
78280    [you, might, also, like, these, memorable, sup...
Name: clean, Length: 78281, dtype: object

## Remove stopwords

In [8]:
#df['clean'] = df['clean'].apply(lambda x: remove_stopwords(x))

## Tokenize the words

In [9]:
tokenizer = Tokenizer(num_words=5000,lower=True,split=' ')

In [10]:
tokenizer.fit_on_texts(df['Input'])

## Split the data

In [11]:
df = df.sample(frac = 1, random_state=13)

In [12]:
X = tokenizer.texts_to_sequences(df['Input'])
X = pad_sequences(X,maxlen=500)
Y = df['Value']
vocab_size = len(tokenizer.word_index) + 1

In [13]:
x, X_test, y, Y_test = train_test_split(X, Y, test_size=0.3, random_state = 24)

In [14]:
X_train, X_eval, Y_train, Y_eval = train_test_split(x, y, test_size=0.01, random_state = 24)

In [15]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
def prepare_targets(y_train, y_test, y_eval):
    le = LabelEncoder()
    le.fit(y_train)
    le.fit(y_test)
    le.fit(y_eval)
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)
    y_eval_enc = le.transform(y_eval)
    
    return y_train_enc, y_test_enc, y_eval_enc

In [17]:
#y_train,y_test,y_eval  = prepare_targets(Y_train,Y_test,Y_eval)

In [18]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500)) 
model.add(Bidirectional(LSTM(128))) #number of batches
model.add(Dropout(0.5)) #randomily removing some of the neurons from the architecture to decrease overfiting
model.add(Dense(1,activation='sigmoid')) #it regulates the outputs 
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy']) #regulate dropout
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           919100    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 1,102,653
Trainable params: 1,102,653
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [19]:
history=model.fit(X_train, Y_train, batch_size=128, epochs=3, validation_data=[X_test, Y_test])

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Evaluate the model

In [20]:
results = model.evaluate(X_eval, Y_eval, verbose = 1)



In [21]:
model.save('stars13.h5')