In [1]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from keras.layers import Dense, Dropout
from sklearn.metrics import f1_score
from keras.models import Sequential
import xml.etree.ElementTree as ET
from nltk.corpus import stopwords
from keras import utils
from tqdm import tqdm
import pandas as pd
import numpy as np
import nltk
import re
import os

Using TensorFlow backend.


In [4]:
def ParsXML(path):
    data = []
    filesNames = os.listdir(path)
    for fileName in filesNames:
        tree = ET.parse(path + "/" + fileName)
        root = tree.getroot()
        headline = tree.find('headline').text
        text = list(tree.find('text'))
        content = ""
        topics = []
        for elem in text:
            content += elem.text
        bip_topics = []
        dc_date_published = ""
        itemid = root.attrib['itemid']
        XMLfilename = fileName
        for node in root.iter():
            if node.tag == 'dc' and node.attrib['element'] == "dc.date.published":
                dc_date_published = node.attrib['value']
            if node.tag == 'codes' and node.attrib['class'] == "bip:topics:1.0":
                topics = list(node)
                for topic in topics:
                    bip_topics.append(topic.attrib['code'])

        if len(bip_topics) !=0:
            data.append([headline, content, bip_topics[0], dc_date_published, itemid,
                 XMLfilename])
    
    return data

df = pd.DataFrame(ParsXML("Data"), columns = ['headline', 'text', 
                                   'bip:topics', 'dc.date.published',
                                   'itemid', 'XMLfilename'])

df.to_csv("DataRaw.csv", index=False)

In [5]:
df = pd.read_csv("DataRaw.csv")

In [6]:
dataset = df[['text', 'bip:topics']]

In [8]:
def cleanText(dataframe):
    for i in tqdm(range(0, len(dataframe))):
        content = dataframe.iat[i, 0]
        temp = re.sub('[^a-zA-Z]', ' ', content)
        temp = temp.lower()
        temp = temp.split()
        temp = [word for word in temp if not word in set(stopwords.words('english'))]
        content = ' '.join(temp)
        dataframe.iloc[i, 0] = content
    return dataframe
dataset = cleanText(dataset)
dataset.to_csv("DataCleaned.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
100%|██████████| 48257/48257 [2:17:59<00:00,  5.70it/s]  


In [2]:
dataset = pd.read_csv("DataCleaned.csv")
frequency = dataset['bip:topics'].value_counts()
for k in frequency.keys():
    if frequency.get(k) < 20:
        dataset = dataset[dataset['bip:topics'] != k]

In [3]:
embeddings_dict = {}

with open("glove.6B.100d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [4]:
documents = []
for i in range(0, len(dataset)):
    documents.append(dataset.iat[i, 0].split())

vector = np.zeros(100)
final = np.zeros(100)
docs = []
for i in range(0,len(documents)):
    for j in range(0,len(documents[i])):
        if documents[i][j] in embeddings_dict:
            vector = embeddings_dict[documents[i][j]]
        else:
            vector = np.zeros(100)
        final = np.add(final, vector)
    final = np.true_divide(final, len(documents[i]))
    docs.append(final)
    final = np.zeros(100)

In [5]:
X = np.reshape(docs, (len(dataset), 100))
y = dataset.iloc[:, -1].values

In [6]:
labelEncoder = LabelEncoder()
y = labelEncoder.fit_transform(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
num_classes = len(dataset['bip:topics'].unique())
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

In [None]:
def model_creator():
    model = Sequential()
    model.add(Dense(output_dim=1000, init='uniform', activation='relu', input_dim=100))
    model.add(Dense(output_dim=1000, init='uniform', activation='relu'))
    model.add(Dense(output_dim=1000, init='uniform', activation='relu'))
    model.add(Dense(output_dim=num_classes, init='uniform', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
    return model

classifier = KerasClassifier(build_fn=model_creator, epochs=2, verbose=1, batch_size=10, validation_split=0.1)

classifier.fit(X_train, y_train)
classifier = KerasClassifier(build_fn=model_creator)
batch_sizes = [10, 20, 100]
epochs = [3, 10]
parameters = [{'batch_size': batch_sizes, 'epochs': epochs}]
grid_search = GridSearchCV(classifier, parameters, n_jobs=-1, cv=5)
grid_search.fit(X_train, y_train)
print(grid_search.best_score_)
print(grid_search.best_params_)

In [11]:
model = Sequential()
model.add(Dense(output_dim=1000, init='uniform', activation='relu', input_dim=100))
model.add(Dense(output_dim=1000, init='uniform', activation='relu'))
model.add(Dense(output_dim=1000, init='uniform', activation='relu'))
model.add(Dense(output_dim=num_classes, init='uniform', activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=100, verbose=1)

  
  import sys
  
  if __name__ == '__main__':


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8c35c89410>

In [None]:
print(model.evaluate(X_train, y_train))

In [None]:
kfold = KFold(n_splits=10, shuffle=True)
cvscores = []
for train_index, test_index in kfold.split(X):
    model = Sequential()
    model.add(Dense(output_dim=500, init='uniform', activation='relu', input_dim=100))
    model.add(Dropout(0.5))
    model.add(Dense(output_dim=102, init='uniform', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X[train_index], y[train_index], epochs=2, batch_size=10, verbose=1)
    scores = model.evaluate(X[test_index], y[test_index], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))