# Character-level Convolutional Networks for Text Classification

### Import the required libraries

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model

### Define model configurations

In [8]:
input_size = 3000
conv_layers = [[256, 7, 3],
               [256, 7, 3],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, -1],
               [256, 3, 3]]

fully_connected_layers = [1024, 1024]
num_of_classes = 5
dropout_p = 0.5
optimizer = 'adam'
loss_type = 'categorical_crossentropy'

### Define the Model

In [9]:
def char_cnn_model(text, labels, num_epochs, batch_size=8, test_size=0.2, random_state=42):
    """Train a character-level CNN classifier and evaluate it on a held-out split.

    The network layout and training settings are read from the module-level
    configuration: ``input_size``, ``conv_layers``, ``fully_connected_layers``,
    ``num_of_classes``, ``dropout_p``, ``optimizer`` and ``loss_type``.

    Parameters
    ----------
    text : sequence of str
        Raw documents, one string per sample.
    labels : array-like of int
        Integer class ids aligned with ``text``; each id must be smaller than
        ``num_of_classes``.
    num_epochs : int
        Number of training epochs.
    batch_size : int, optional
        Mini-batch size passed to ``model.fit`` (default 8).
    test_size : float, optional
        Fraction of the samples held out for evaluation (default 0.2).
    random_state : int, optional
        Seed for the train/test shuffle (default 42).

    Returns
    -------
    (loss, accuracy)
        The two values returned by ``model.evaluate`` on the held-out split.
    """
    # Split the raw documents first so that the character vocabulary is
    # learned from the training portion only; characters that appear only in
    # the held-out documents are mapped to the 'UNK' token.
    text_train, text_test, labels_train, labels_test = train_test_split(
        text, labels, test_size=test_size, random_state=random_state)

    tk = Tokenizer(lower=True, char_level=True, oov_token='UNK')
    tk.fit_on_texts(text_train)

    # Every document becomes a fixed-length sequence of `input_size` character
    # ids (pad_sequences pads with 0 by default).
    x_train = pad_sequences(tk.texts_to_sequences(text_train), maxlen=input_size)
    x_test = pad_sequences(tk.texts_to_sequences(text_test), maxlen=input_size)

    # Fix the one-hot width to the configured class count so the targets
    # always match the softmax layer, even if a class is absent from `labels`.
    y_train = to_categorical(labels_train, num_classes=num_of_classes)
    y_test = to_categorical(labels_test, num_classes=num_of_classes)

    vocab_size = len(tk.word_index)

    # One-hot embedding matrix: row 0 (the padding id) is all zeros and row i
    # is the one-hot vector with a 1 at position i - 1 for character id i.
    embedding_weights = np.vstack([np.zeros((1, vocab_size)), np.eye(vocab_size)])

    embedding_layer = Embedding(vocab_size + 1, vocab_size, input_length=input_size, weights=[embedding_weights])

    # Model architecture
    inputs = Input(shape=(input_size,), name='input', dtype='int64')
    x = embedding_layer(inputs)

    for filter_num, filter_size, pooling_size in conv_layers:
        x = Conv1D(filter_num, filter_size)(x)
        x = Activation('relu')(x)
        # A pooling size of -1 marks a convolution that is not followed by pooling.
        if pooling_size != -1:
            x = MaxPooling1D(pool_size=pooling_size)(x)
    x = Flatten()(x)

    for dense_size in fully_connected_layers:
        x = Dense(dense_size, activation='relu')(x)
        x = Dropout(dropout_p)(x)

    predictions = Dense(num_of_classes, activation='softmax')(x)

    # Build, train and evaluate the model
    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=optimizer, loss=loss_type, metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=num_epochs, batch_size=batch_size)

    loss, accuracy = model.evaluate(x_test, y_test, verbose=0)

    return loss, accuracy

### Load data

In [10]:
# Read the BBC news dataset (columns: category, text) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='bbc-text.csv')
print(df.head(n=5))

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


In [11]:
# Map the string categories to integer class ids.
le = LabelEncoder()
le.fit(df['category'])
labels = le.transform(df['category'])

### Training

In [12]:
# Train the character-level CNN for five epochs and keep the held-out metrics.
loss, accuracy = char_cnn_model(
    text=df['text'],
    labels=labels,
    num_epochs=5,
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [13]:
# `accuracy` is a fraction; scale it to a percentage for display.
accuracy_percent = 100 * accuracy
print('Accuracy: %f' % accuracy_percent)

Accuracy: 65.393259
