### Convolutional Neural Networks for Text Classification
[CNN for Sentence Classification in Keras](https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras)

In [102]:
import pandas as pd

In [103]:
# Read training and testing data
train = pd.read_csv('data/train.csv') # category, text
test = pd.read_csv('data/test.csv') # category, text

# Replace NaN with ''
train = train.fillna('')
test = test.fillna('')

# Shapes
train_n = train.shape[0]
test_n = test.shape[0]
print train_n + test_n

17647


In [104]:
# Concatenate training and testind data
df = pd.concat([train, test])
print df.shape

(17647, 2)


In [105]:
# Data: X and y
# Label encoding: y
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['category'])
print 'Number of classes: ', len(list(le.classes_))
y = le.transform(df['category'])
X = list(df['text'].values)

Number of classes:  17


In [106]:
import numpy as np
import re
import itertools
from collections import Counter

# Function to clean text - Source: https://github.com/dennybritz/cnn-text-classification-tf
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Steps, in order:
      1. every character other than letters, digits and ( ) , ! ? ' ` becomes a space;
      2. the clitics 's, 've, n't, 're, 'd, 'll are split from the preceding word;
      3. the marks , ! ( ) ? are surrounded by spaces so they become separate tokens;
      4. runs of whitespace are collapsed, the result is stripped and lower-cased.

    :param string: text to clean
    :return: cleaned, lower-cased text whose tokens are separated by single spaces
    """
    # Keep only the characters that the following steps know how to handle.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Detach clitics ("it's" -> "it 's"); the order matches the original code.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)

    # Isolate punctuation. The replacement text is the bare mark: the earlier
    # " \( ", " \) " and " \? " replacements leaked a literal backslash into
    # the output, producing tokens such as "\(" instead of "(".
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    # Collapse the extra whitespace introduced above.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Function to pad texts
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.

    :param sentences: sequence of sentences, each one a list of tokens
    :param padding_word: token appended to sentences that are shorter than the longest one
    :return: list of new lists, all of the same length; the input lists are not modified.
             An empty input yields an empty list.
    """
    # max() raises ValueError on an empty sequence, so handle that case up front.
    if len(sentences) == 0:
        return []
    sequence_length = max(len(sentence) for sentence in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]

In [107]:
# Data preparation: raw texts -> matrix of vocabulary indices

# Trim whitespace at both ends of every text
X_strip = [text.strip() for text in X]

# Apply the cleaning function to every text
X_clean = [clean_str(text) for text in X_strip]

# One token list per text, splitting on single spaces
X_list = [text.split(" ") for text in X_clean]

# Pad every token list to the length of the longest one
X_pad = pad_sentences(X_list)

# Token frequencies over all padded texts
word_counts = Counter(itertools.chain.from_iterable(X_pad))

# Mapping from index to word, most frequent word first
vocabulary_inv = [entry[0] for entry in word_counts.most_common()]

# Mapping from word to index
vocabulary = dict((word, index) for index, word in enumerate(vocabulary_inv))

# X data: every padded text becomes a row of vocabulary indices
X_data = np.array([[vocabulary[token] for token in tokens] for tokens in X_pad])

In [108]:
# Create testing set and training set
mask = range(train_n, train_n + test_n)
X_test = X_data[mask]
y_test = y[mask]
print X_test.shape, y_test.shape

mask = range(train_n)
X_train = X_data[mask]
y_train = y[mask]
print X_train.shape, y_train.shape

(3599, 66) (3599,)
(14048, 66) (14048,)


#### Word2Vec

In [115]:
# Multiprocessing
from multiprocessing import cpu_count

# word2vec
from gensim.models import word2vec

In [117]:
# Word2Vec settings:
#       size      = 100, length of each word vector (reference: http://arxiv.org/pdf/1408.5882v2.pdf)
#       window    = 5, max distance between the current and predicted word within a sentence
#       min_count = 1, ignore all words with total frequency lower than this

num_features = 100
downsampling = 1e-3   # Downsample setting for frequent words

# Turn every training row of indices back into a list of words
X_train_sent = [[vocabulary_inv[index] for index in row] for row in X_train]

# Train the embedding model on the training sentences only
w2v_settings = dict(size=num_features, window=5, min_count=1,
                    sample=downsampling, workers=cpu_count())
embedding_model = word2vec.Word2Vec(X_train_sent, **w2v_settings)

In [118]:
# Embedding weights: one row per vocabulary word, in vocabulary_inv order.
embedding_rows = []
for w in vocabulary_inv:
    if w in embedding_model:
        # Word is known to the trained model: use its learned vector
        embedding_rows.append(embedding_model[w])
    else:
        # Word is unknown to the model: draw a random vector of the same size
        embedding_rows.append(np.random.uniform(-0.25, 0.25, embedding_model.vector_size))

# The matrix is wrapped in a one-element list
embedding_weights = [np.array(embedding_rows)]

#### CNN using keras

In [129]:
from keras.models import Sequential
from keras.models import Model
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Input, Merge, Convolution1D, MaxPooling1D

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): in file order this comes after the np.random.uniform call that
# fills embedding_weights, so those draws are presumably not covered by this
# seed on a top-to-bottom run — confirm and consider seeding earlier.
np.random.seed(15)

Using TensorFlow backend.


ImportError: No module named tensorflow

### TensorFlow installation problems :-(