In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import KFold
from keras.layers import Dense, Dropout
from keras.models import Sequential
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import re

Using TensorFlow backend.


In [2]:
def ParsXML(path):
    data = []
    filesNames = os.listdir(path)
    for fileName in filesNames:
        tree = ET.parse(path + "/" + fileName)
        root = tree.getroot()
        headline = tree.find('headline').text
        text = list(tree.find('text'))
        content = ""
        topics = []
        for elem in text:
            content += elem.text
        bip_topics = []
        dc_date_published = ""
        itemid = root.attrib['itemid']
        XMLfilename = fileName
        for node in root.iter():
            if node.tag == 'dc' and node.attrib['element'] == "dc.date.published":
                dc_date_published = node.attrib['value']
            if node.tag == 'codes' and node.attrib['class'] == "bip:topics:1.0":
                topics = list(node)
                for topic in topics:
                    bip_topics.append(topic.attrib['code'])

        if len(bip_topics) !=0:
            data.append([headline, content, bip_topics, dc_date_published, itemid,
                 XMLfilename])
    
    return data



df = pd.DataFrame(ParsXML("Data"), columns = ['headline', 'text', 
                                   'bip:topics', 'dc.date.published',
                                   'itemid', 'XMLfilename'])

df.to_csv("DataMultilabelRaw.csv", index=False)




In [5]:
df = pd.read_csv("DataMultilabelRaw.csv")

In [6]:
dataset = df[['text', 'bip:topics']]

In [None]:
def cleanText(dataframe):
    ps = PorterStemmer()
    for i in range(0, len(dataframe)):
        content = dataframe.iat[i, 0]
        temp = re.sub('[^a-zA-Z]', ' ', content)
        temp = temp.lower()
        temp = temp.split()
        temp = [ps.stem(word) for word in temp if not word in set(stopwords.words('english'))]
        content = ' '.join(temp)
        dataframe.iloc[i, 0] = content
    return dataframe
dataset = cleanText(dataset)
dataset.to_csv("DataMultilabelCleaned.csv", index=False)

In [2]:
dataset = pd.read_csv("DataMultilabelCleaned.csv")
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
cleanedString = ""
labels = []
for i in range(0,len(y)):
    cleanedString = re.sub('\W+',' ', y[i])
    labels.append(cleanedString.split())
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(labels)

In [3]:
documents = []
for i in range(0,len(X)):
    documents.append(X[i, 0].split())
def doc2Vec(documents):    
    docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(documents)]
    model = Doc2Vec(docs, vector_size=100, window=10, min_count=1, workers=8)
    return model

model = doc2Vec(X)
dependentVariables = []
for i in range(0,len(documents)):
    dependentVariables.append(model.infer_vector(documents[i]))

In [4]:
X = np.reshape(dependentVariables, (len(dataset), 100))

In [9]:
def model_creator():
    model = Sequential()
    model.add(Dense(output_dim=500, init='uniform', activation='relu', input_dim=100))
    model.add(Dropout(0.5))
    model.add(Dense(output_dim=102, init='uniform', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
classifier = KerasClassifier(build_fn=model_creator, epochs=2, verbose=1, batch_size=10, validation_data=(X_test, y_test))
classifier.fit(X_train, y_train)

classifier = KerasClassifier(build_fn=model_creator)
classifier.fit(X_train, y_train)

batch_sizes = [10, 20 , 30]
epochs = [2, 3]
parameters = [{'batch_size': batch_sizes, 'epochs': epochs}]
grid_search = GridSearchCV(classifier, parameters, n_jobs=-1, cv=2)
grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

  This is separate from the ipykernel package so we can avoid doing imports until
  """


Train on 33779 samples, validate on 14478 samples
Epoch 1/2
Epoch 2/2
Epoch 1/1
Epoch 1/2
Epoch 2/2
0.9682222098150044
{'epochs': 2, 'batch_size': 20}


In [17]:
kfold = KFold(n_splits=10, shuffle=True)
cvscores = []
for train_index, test_index in kfold.split(X):
    model = Sequential()
    model.add(Dense(output_dim=500, init='uniform', activation='relu', input_dim=100))
    model.add(Dropout(0.5))
    model.add(Dense(output_dim=102, init='uniform', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X[train_index], y[train_index], epochs=2, batch_size=10, verbose=1)
    scores = model.evaluate(X[test_index], y[test_index], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

  """
  import sys


Epoch 1/2
Epoch 2/2
acc: 96.83%
Epoch 1/2
Epoch 2/2
acc: 96.86%
Epoch 1/2
Epoch 2/2
acc: 96.83%
Epoch 1/2
Epoch 2/2
acc: 96.82%
Epoch 1/2
Epoch 2/2
acc: 96.83%
Epoch 1/2
Epoch 2/2
acc: 96.81%
Epoch 1/2
Epoch 2/2
acc: 96.80%
Epoch 1/2
Epoch 2/2
acc: 96.80%
Epoch 1/2
Epoch 2/2
acc: 96.82%
Epoch 1/2
Epoch 2/2
acc: 96.81%
96.82% (+/- 0.02%)
