In [None]:
'''
Warning: 1.5 GB download.
Download the bag of words (from Google News) from this link: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing
'''

In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from gensim.models import KeyedVectors
from keras.layers import Flatten
from keras.layers import MaxPooling1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from nltk.corpus import stopwords

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [90]:
MX_NB_WORDS = 200000
MAX_SEQ_LEN = 30
EMBEDDING_DIM = 300

In [4]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

In [21]:
# Peek on the data
print('Data: \t', "\n---------------------------------------------------\n".join(twenty_train.data[:2]))

Data: 	 From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----





---------------------------------------------------
From: guykuo@carson.u.washington.edu (Guy Kuo)
Subject: SI Clock Poll - Final Call
Summary: Final call for SI clock reports
Keywords: SI,acceleration,clock,upgrade
Article-I.D.: shelley.1qvfo9INNc3s
Organization: Unive

In [36]:
[(idx, val) for idx, val in enumerate(twenty_train.target_names)]

[(0, 'alt.atheism'),
 (1, 'comp.graphics'),
 (2, 'comp.os.ms-windows.misc'),
 (3, 'comp.sys.ibm.pc.hardware'),
 (4, 'comp.sys.mac.hardware'),
 (5, 'comp.windows.x'),
 (6, 'misc.forsale'),
 (7, 'rec.autos'),
 (8, 'rec.motorcycles'),
 (9, 'rec.sport.baseball'),
 (10, 'rec.sport.hockey'),
 (11, 'sci.crypt'),
 (12, 'sci.electronics'),
 (13, 'sci.med'),
 (14, 'sci.space'),
 (15, 'soc.religion.christian'),
 (16, 'talk.politics.guns'),
 (17, 'talk.politics.mideast'),
 (18, 'talk.politics.misc'),
 (19, 'talk.religion.misc')]

In [42]:
EMBEDDING_FILE = "GoogleNews-vectors-negative300.bin"
category_index = {}
for idx, val in enumerate(twenty_train.target_names):
    category_index[idx] = val 
category_reverse_index = dict((y,x) for (x,y) in category_index.items())
STOPWORDS = set(stopwords.words("english"))

In [49]:
import pandas as pd
def preprocess(text):
    text = text.strip().lower().split()
    text = filter(lambda word: word not in STOPWORDS, text)
    return " ".join(text)

def create_dict(arr): # ramdom name - 'title'
    return {'title': arr}

train_df = pd.DataFrame(create_dict(twenty_train.data))
test_df = pd.DataFrame(create_dict(twenty_test.data))

dataset = [train_df, test_df]

for data in dataset:
    data['title'] = data['title'].apply(preprocess)

In [59]:
all_texts = train_df['title'] + ' ' + test_df['title'] # ti include the ' ' between ending of last and new statment.
all_texts = all_texts.drop_duplicates(keep=False)

tokenizer = Tokenizer(num_words=MX_NB_WORDS)
tokenizer.fit_on_texts(all_texts)

train_sequences = tokenizer.texts_to_sequences(train_df['title'])
test_sequences = tokenizer.texts_to_sequences(test_df['title'])

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQ_LEN)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQ_LEN)

In [101]:
print('Train data: \t', train_data.shape)
print('Test data: \t', test_data.shape)
print('-'*20)
category = twenty_train.target
category = to_categorical(category)
print('Labels:\t', category.shape)

Train data: 	 (11314, 30)
Test data: 	 (7532, 30)
--------------------
Labels:	 (11314, 20)


In [115]:
# train - test split

y_test_cat = to_categorical(twenty_test.target)

X_train = train_data
y_train = category
X_test = test_data
y_test = y_test_cat

In [67]:
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [92]:
from keras.layers import Embedding
word_index = tokenizer.word_index
nb_words = min(MX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
        
print('NULL word embedding: %d'% np.sum(np.sum(embedding_matrix, axis = 1) == 0))
embedding_layer = Embedding(embedding_matrix.shape[0], # or len(word_index) + 1
                            embedding_matrix.shape[1], # or EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LEN,
                            trainable=False)

NULL word embedding: 112568
----------
Embedding matrix shape: (159848, 300)


In [104]:
from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(20))
model.add(Activation('sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer = 'rmsprop', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 300)           47954400  
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 28, 250)           225250    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 250)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_4 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_8 (Activation)    (None, 250)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 20)                5020      
__________

In [110]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=128)
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss: \t', score[0])
print('Test Accuracy: \t', score[1])

model.save('my_model_20.h5')

Train on 11314 samples, validate on 7532 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 	 2.6497940551202728
Test Accuracy: 	 0.5541688794476899


In [119]:
model_1 = Sequential()
model_1.add(embedding_layer)
model_1.add(Dropout(0.2))
model_1.add(Conv1D(512, 3, padding='same',activation='relu',strides=1))
model_1.add(Conv1D(256, 3, padding='same',activation='relu',strides=1))
model_1.add(Conv1D(128, 3, padding='same',activation='relu',strides=1))
model_1.add(Flatten())
model_1.add(Dropout(0.2))
model_1.add(Dense(150,activation='sigmoid'))
model_1.add(Dropout(0.2))
model_1.add(Dense(20,activation='sigmoid'))

model_1.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

model_1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 300)           47954400  
_________________________________________________________________
dropout_11 (Dropout)         (None, 30, 300)           0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 30, 512)           461312    
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 30, 256)           393472    
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 30, 128)           98432     
_________________________________________________________________
flatten_3 (Flatten)          (None, 3840)              0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 3840)              0         
__________

In [120]:
model_1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=128)
score = model.evaluate(X_test, y_test, verbose=0)
print('Test loss: \t', score[0])
print('Test Accuracy: \t', score[1])

model.save('my_model_1_20.h5')

Train on 11314 samples, validate on 7532 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 	 2.6497940551202728
Test Accuracy: 	 0.5541688794476899


In [140]:
print(category_index)

{0: 'alt.atheism', 1: 'comp.graphics', 2: 'comp.os.ms-windows.misc', 3: 'comp.sys.ibm.pc.hardware', 4: 'comp.sys.mac.hardware', 5: 'comp.windows.x', 6: 'misc.forsale', 7: 'rec.autos', 8: 'rec.motorcycles', 9: 'rec.sport.baseball', 10: 'rec.sport.hockey', 11: 'sci.crypt', 12: 'sci.electronics', 13: 'sci.med', 14: 'sci.space', 15: 'soc.religion.christian', 16: 'talk.politics.guns', 17: 'talk.politics.mideast', 18: 'talk.politics.misc', 19: 'talk.religion.misc'}


In [146]:
example_prediction = open('real_test.txt', 'r').read()
example_prediction = preprocess(example_prediction)
example_sequence = tokenizer.texts_to_sequences([example_prediction])
example_padded_sequence = pad_sequences(example_sequence, maxlen=MAX_SEQ_LEN)
print("Predicted category: ", category_index[model_1.predict_classes(example_padded_sequence, verbose=0)[0]])

Predicted category:  sci.space
