In [7]:
import nltk
from nltk.corpus import stopwords
import pandas as pd 
import numpy as np
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
import unicodedata
import gensim, logging
from nltk.tokenize import TreebankWordTokenizer
from sklearn.cross_validation import train_test_split
import multiprocessing
from __future__ import division
import time

In [15]:
def clean_text(text):
    result=text.replace('<br clear="none">',' ').replace('</p>',' ').replace('<br>',' ').replace('<p>',' ').replace('\\',' ').replace('\n',' ').replace('   ',' ').replace('  ',' ')
    return re.sub(r'\W+ ', '', result).upper()

def report_to_wordlist(raw_review,remove_stopwords=True ):
    #Remove HTML
    review_text = BeautifulSoup(raw_review, "lxml").get_text() 
    # Remove non-letters        
    regex = re.compile("[',\.!?;¿:()\"=-]")
    letters_only = re.sub(regex, " ", review_text)
    #Convert to lower case, split into individual words
    words = letters_only.lower().split() 
    for i in range(len(words)):
        words[i] = str(unicodedata.normalize('NFKD', words[i]).encode('ascii', 'ignore'))
        words[i] = str(words[i])
        #print(words[i])
    if remove_stopwords:
        stops = set(stopwords.words("french"))
        words = [w for w in words if not w in stops]
    return(words)

# Preparing data

In [9]:
data_soc= open('test_set/SOC','r').read().split('<<<<<<<<<<NEW>>>>>>>>>>')
data_civ= open('test_set/CIV','r').read().split('<<<<<<<<<<NEW>>>>>>>>>>')
data_com= open('test_set/COM','r').read().split('<<<<<<<<<<NEW>>>>>>>>>>')
data_crim= open('test_set/CRIM','r').read().split('<<<<<<<<<<NEW>>>>>>>>>>')

In [16]:
pool = multiprocessing.Pool(processes=4)
X= pool.map(report_to_wordlist,data_soc)
print(0)
X += pool.map(report_to_wordlist,data_civ)
print(1)
X += pool.map(report_to_wordlist,data_com)
print(2)
X += pool.map(report_to_wordlist,data_crim)

0
1
2


In [17]:
Y = np.zeros(len(data_civ+data_com+data_crim+data_soc))
Y[len(data_soc):len(data_soc)+len(data_com)]= np.ones(len(data_com))
Y[len(data_soc)+len(data_com):len(data_soc)+len(data_com)+len(data_crim)]= 2*np.ones(len(data_crim))
Y[len(data_soc)+len(data_com)+len(data_crim):len(data_soc)+len(data_com)+len(data_crim)+len(data_civ)]= 3*np.ones(len(data_civ))

In [18]:
#Spliting the set in training and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)


# Doc2Vec + Boosting

In [20]:
Xtrain_Xtest= X_train+X_test
all_data = [gensim.models.doc2vec.LabeledSentence(Xtrain_Xtest[i], tags=['SENT_%s' %i])for i in range(len(Xtrain_Xtest))]
model_d2v = gensim.models.Doc2Vec(min_count=1, window=10, size=100, workers=7)
model_d2v.build_vocab(all_data)

In [21]:
for epoch in range(30):
    print(epoch)
    model_d2v.train(all_data)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [22]:
train = [model_d2v.docvecs['SENT_%s' % i] for i in range(len(X_train))]
test = [model_d2v.docvecs['SENT_%s' % i] for i in range(len(X_train), len(X_train)+len(X_test))]

In [23]:
np.savetxt("train.csv", train, delimiter=",")
np.savetxt("test.csv", test, delimiter=",")

In [48]:
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
import xgboost
lr_range = [0.01,0.1,0.5,1]
#estimator_range=range(80,150)

#param_grid = dict(n_estimators = estimator_range, learning_rate=lr_range)
param_grid = dict(learning_rate=lr_range)
cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42)
grid = GridSearchCV(xgboost.XGBClassifier(), param_grid=param_grid, cv=cv,verbose=1)
grid.fit(np.array(train), y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 11.7min finished


The best parameters are {'learning_rate': 0.5} with a score of 0.87


In [59]:
model_xg = xgboost.XGBClassifier(n_estimators=100, learning_rate=0.5)
model_xg.fit(np.array(train),y_train)


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.5, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [61]:
y_pred=model_xg.predict(np.array(test))

In [62]:
accuracy=len([i for i in range(len(y_test)) if y_pred[i]==y_test[i]])/len(y_test)

0.8722619047619048

# Classifier per document

In [93]:
def classify_document(doc):
    w=report_to_wordlist(doc)
    vec=model_d2v.infer_vector(w)
    vec=np.array(vec).reshape((1,-1))
    return model_xg.predict(np.array(vec))


# CNN Approach

In [94]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

Using TensorFlow backend.


In [114]:
all_words= []
for item in X:
    all_words+=item
all_words=list(set(all_words))
dic_words={}
for i in range(len(all_words)):
    dic_words[all_words[i]]=i
train_cnn=[]
test_cnn=[]
for item in X_train :
    train_cnn.append([dic_words[w] for w in item])
for item in X_test :
    test_cnn.append([dic_words[w] for w in item])

In [115]:
max_features = len(all_words)
maxlen = max([len(X[i]) for i in range(len(X))])
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

In [134]:
import tensorflow as tf
tf.python.control_flow_ops = tf
from keras.utils.np_utils import to_categorical


print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(train_cnn, maxlen=maxlen)
x_test = sequence.pad_sequences(test_cnn, maxlen=maxlen)
ytrain_cnn=to_categorical(y_train, nb_classes=4)
ytest_cnn=to_categorical(y_test, nb_classes=4)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

#  embedding layer which maps the vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 border_mode='valid',
                 activation='relu'))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a fully-cnnected hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We compute with softmax the probability of the four classes:
model.add(Dense(4))
model.add(Activation('softmax'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, ytrain_cnn,
          batch_size=batch_size,
          nb_epoch=epochs,
          validation_data=(x_test, ytest_cnn))

Pad sequences (samples x time)
('x_train shape:', (19598, 16329))
('x_test shape:', (8400, 16329))
Build model...
Train on 19598 samples, validate on 8400 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1cb917e90>

In [142]:
#model.predict_classes(x_test[0].reshape((1,-1)))