In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical


In [2]:
def remove_html(text):
    """Return the text content of ``text`` with markup stripped.

    The input is parsed by BeautifulSoup with the ``lxml`` parser and only
    the text of the resulting document is returned.
    """
    return BeautifulSoup(text, 'lxml').get_text()

def remove_punctuation(text):
    """Return ``text`` without ASCII punctuation characters or digits.

    Every character found in ``string.punctuation`` or ``string.digits`` is
    dropped; all other characters (including whitespace) are kept in order.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation or ch in string.digits:
            continue
        kept.append(ch)
    return "".join(kept)

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop-word list as a frozenset, loaded once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text, stop_words=None):
    """Return the tokens of ``text`` that are not stop words.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter (the notebook passes one tokenised line per call).
    stop_words : iterable of str, optional
        Words to drop. When omitted, the NLTK English stop-word list is used.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # Resolve the stop-word collection once per call (and the NLTK list once
    # per session) instead of re-evaluating it for every single token, and
    # use a set so each membership test is O(1).
    if stop_words is None:
        blocked = _english_stopwords()
    else:
        blocked = set(stop_words)
    return [w for w in text if w not in blocked]

In [3]:
# Load the raw dataset, decoding the CSV as ISO-8859-1.
df = pd.read_csv(
    'wars_trek.csv',
    encoding='ISO-8859-1',
)

In [4]:
# Force the 'Input' column to str so the text helpers below always receive strings.
df = df.astype({'Input': str})

In [5]:
# Strip HTML *before* punctuation: remove_punctuation deletes '<', '>' and
# '/', so once it has run the markup is no longer recognisable as tags and
# the tag names would be left behind as ordinary words.
df['clean'] = df['Input'].apply(remove_html).apply(remove_punctuation)
df['clean']

0        turmoil has engulfed the galactic republic the...
1                     outlaying star systems is in dispute
2        hoping to resolve the matter with a blockade o...
3        greedy trade federation has stopped all shippi...
4                                                    naboo
                               ...                        
70419                         where no man has gone before
70420     she is moving out now passing camera and heading
70421    toward the distant stars she is beautiful and ...
70422      are beautiful and as she slowly disappears from
70423                                                 view
Name: clean, Length: 70424, dtype: object

In [6]:
# Run every cleaned line through the HTML stripper.
# NOTE(review): punctuation has already been removed from 'clean' at this
# point, so no intact tags can remain for BeautifulSoup to find - confirm
# whether this pass is still needed.
df['clean'] = df['clean'].apply(remove_html)

In [7]:
# Word tokenizer: each token is a maximal run of \w characters.
tokenizer1 = RegexpTokenizer(pattern=r'\w+')

In [8]:
# Lower-case each line, then split it into a list of word tokens.
df['clean'] = df['clean'].str.lower().apply(tokenizer1.tokenize)
df['clean']

0        [turmoil, has, engulfed, the, galactic, republ...
1              [outlaying, star, systems, is, in, dispute]
2        [hoping, to, resolve, the, matter, with, a, bl...
3        [greedy, trade, federation, has, stopped, all,...
4                                                  [naboo]
                               ...                        
70419                  [where, no, man, has, gone, before]
70420    [she, is, moving, out, now, passing, camera, a...
70421    [toward, the, distant, stars, she, is, beautif...
70422    [are, beautiful, and, as, she, slowly, disappe...
70423                                               [view]
Name: clean, Length: 70424, dtype: object

In [9]:
# Drop English stop words from every token list.
df['clean'] = df['clean'].apply(remove_stopwords)
df['clean']

0        [turmoil, engulfed, galactic, republic, taxati...
1                      [outlaying, star, systems, dispute]
2        [hoping, resolve, matter, blockade, deadly, ba...
3        [greedy, trade, federation, stopped, shipping,...
4                                                  [naboo]
                               ...                        
70419                                          [man, gone]
70420                   [moving, passing, camera, heading]
70421                  [toward, distant, stars, beautiful]
70422                      [beautiful, slowly, disappears]
70423                                               [view]
Name: clean, Length: 70424, dtype: object

In [10]:
# Keras tokenizer: lower-cases text, splits on spaces, and is capped at
# num_words=5000 when converting texts to sequences.
tokenizer = Tokenizer(
    num_words=5000,
    lower=True,
    split=' ',
)

In [11]:
# Build the vocabulary from the raw 'Input' text.
# NOTE(review): the 'clean' column produced above is never used from here
# on - confirm whether the model was meant to be trained on it instead.
tokenizer.fit_on_texts(texts=df['Input'])

In [12]:
# Encode every raw line as a sequence of token ids, padded/truncated to 500 steps.
X = pad_sequences(tokenizer.texts_to_sequences(df['Input']), maxlen=500)
# Target labels.
Y = df['Value']
# One embedding row per entry in word_index, plus one for index 0.
# NOTE(review): word_index is not limited by num_words=5000 (the model summary
# below shows 909,200 / 50 = 18,184 rows), so the embedding is larger than the
# range of ids the tokenizer can emit - confirm this is intended.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# Three-way split of the encoded data: 20% is held out as a test set, then the
# remaining 80% is split 75/25, giving 60% train / 20% cross-validation / 20% test.
# The original cell referenced `xtrain` and `labels`, which are not defined
# anywhere in this notebook; it now uses the X / Y built above so that
# Restart & Run All does not stop here with a NameError.
x, x_test, y, y_test = train_test_split(X, Y, test_size=0.2, train_size=0.8)
x_train, x_cv, y_train, y_cv = train_test_split(x, y, test_size=0.25, train_size=0.75)

In [13]:
# 70/30 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=24,
)

In [14]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded later.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

In [15]:
def prepare_targets(y_train, y_test):
    """Integer-encode the train and test labels with one shared LabelEncoder.

    Parameters
    ----------
    y_train, y_test : array-like
        Raw class labels for the training and test splits.

    Returns
    -------
    tuple
        ``(y_train_enc, y_test_enc)`` - the encoded labels for each split.
    """
    le = LabelEncoder()
    # Fit exactly once, on the labels of both splits. Calling fit() a second
    # time replaces the classes learned by the first call, so the previous
    # fit(y_train); fit(y_test) sequence left the encoder knowing only the test
    # labels and transform(y_train) failed for any label absent from y_test.
    le.fit(list(y_train) + list(y_test))
    y_train_enc = le.transform(y_train)
    y_test_enc = le.transform(y_test)

    return y_train_enc, y_test_enc

In [16]:
# Encode the raw labels of both splits as integers for the binary classifier.
y_train, y_test = prepare_targets(y_train=Y_train, y_test=Y_test)

In [17]:
model = Sequential([
    # Learned 50-dimensional vector per token id, over the 500-step padded input.
    Embedding(vocab_size, 50, input_length=500),
    # Bidirectional LSTM with 128 units per direction.
    Bidirectional(LSTM(128)),
    # Drop half of the activations during training to reduce overfitting.
    Dropout(0.5),
    # Single sigmoid unit: one output in (0, 1) for the binary target.
    Dense(1, activation='sigmoid'),
])
# Adam optimiser with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           909200    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 1,092,753
Trainable params: 1,092,753
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Early stopping is left disabled: with patience=5 it could never trigger
# within the 3 epochs trained here.
#es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# validation_data is passed as an (x, y) tuple, the documented form; a list
# can be interpreted as a multi-input `x` by some Keras/TensorFlow versions.
# NOTE(review): the test split doubles as the validation set here and is also
# what model.evaluate() scores later - confirm that is intended.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_data=(X_test, y_test),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
# Write the trained model to disk as an HDF5 file.
model.save(filepath='stars12.h5')

In [42]:
# Score the trained model on the held-out split (loss followed by the compiled metrics).
results = model.evaluate(
    x=X_test,
    y=y_test,
    verbose=1,
)

