In [None]:
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Dropout, Bidirectional
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils.np_utils import to_categorical

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, sent_tokenize

import pandas as pd
import numpy as np
import os

# --- Configuration -----------------------------------------------------------
# Seed NumPy's global random generator so runs are repeatable.
SEED = 7
np.random.seed(SEED)

# Root folder of the BBC dataset (one sub-folder per class).
bbc_data_dir = "./data/"
# Path to the GloVe vectors text file (despite the "_dir" suffix, this is a file).
glove_embedding_dir = "./glove.6B.300d.txt"

In [None]:
def load_dataset(directory=bbc_data_dir):
    """Read the BBC articles from ``directory`` into a DataFrame.

    ``directory`` is expected to contain one sub-folder per class, each
    holding the article files of that class. Entries of ``directory`` that
    are not folders are skipped.

    Parameters
    ----------
    directory : str
        Root folder of the dataset.

    Returns
    -------
    pandas.DataFrame
        One row per article, indexed from 1, with the columns
        ``text`` (file contents), ``class`` (integer label for the five
        known classes; unknown folder names are kept as-is) and
        ``class_meaning`` (the folder name).
    """
    class_ids = {"business": 0,
                 "entertainment": 1,
                 "politics": 2,
                 "sport": 3,
                 "tech": 4}

    records = []
    print("Loading:")
    # Sorted so the row order does not depend on the file system's listing order.
    for class_ in sorted(os.listdir(directory)):
        # The trailing "" keeps the trailing separator in the printed path.
        current_class_directory = os.path.join(directory, class_, "")
        if not os.path.isdir(current_class_directory):
            continue
        print(current_class_directory)

        for name in sorted(os.listdir(current_class_directory)):
            path = os.path.join(current_class_directory, name)
            # Context manager guarantees the file handle is closed.
            with open(path, encoding="ISO-8859-1") as current_text:
                records.append({"text": current_text.read(), "class": class_})

    # Build the frame in one go instead of enlarging it row by row.
    df = pd.DataFrame(records,
                      columns=["text", "class"],
                      index=pd.RangeIndex(1, len(records) + 1))

    df["class_meaning"] = df["class"]
    df["class"] = df["class"].map(lambda label: class_ids.get(label, label))
    return df

- #### This notebook should run from top to bottom. Just run all cells.
- #### Your computer needs the NLTK data used below (`stopwords`, `wordnet` and `punkt`) downloaded for this to run properly

In this project I worked with the raw BBC dataset (http://mlg.ucd.ie/datasets/bbc.html). A neural network classifier was built to assign a given article/text to one of five categories (business, entertainment, politics, sport and tech).

#### Loading the data

In [None]:
# Load every article, then shuffle the rows with a fixed seed.
data = load_dataset().sample(frac=1, random_state=25)

# Stratified 80/20 split: training and test sets get a similar class distribution.
# Each part is copied so it is an independent frame.
train_df, test_df = (
    part.copy()
    for part in train_test_split(data,
                                 test_size=0.2,
                                 random_state=25,
                                 stratify=data["class"])
)

## Preprocessing

In [None]:
_TOKENIZE_RESOURCES = {}


def _tokenize_resources():
    """Build the stop-word set, lemmatizer and word matcher once and reuse them."""
    if not _TOKENIZE_RESOURCES:
        # Build everything first so a failure never leaves a half-filled cache.
        resources = {
            "stop_words": frozenset(stopwords.words('english')),
            "lemmatizer": WordNetLemmatizer(),
            "word_matcher": RegexpTokenizer('[a-zA-Z]+'),
        }
        _TOKENIZE_RESOURCES.update(resources)
    return _TOKENIZE_RESOURCES


def tokenize_document(document):
    """Normalise a raw document into a string of lemmatised words.

    The document is split into sentences; within each sentence only runs of
    ASCII letters are kept, English stop words are dropped (compared in lower
    case) and the remaining lower-cased words are lemmatised.

    Parameters
    ----------
    document : str
        Raw text.

    Returns
    -------
    str
        The kept words, each preceded by a single space (so a non-empty
        result starts with a space); ``""`` when no word is kept.
    """
    resources = _tokenize_resources()
    stop_words = resources["stop_words"]
    lemmatizer = resources["lemmatizer"]
    word_matcher = resources["word_matcher"]

    words = []
    for sentence in sent_tokenize(document):
        for token in word_matcher.tokenize(sentence):
            lowered = token.lower()
            if lowered not in stop_words:
                words.append(lemmatizer.lemmatize(lowered))

    # Single join instead of repeated string concatenation; same output format.
    return "".join(" " + word for word in words)

class DocTokenizer(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that applies ``tokenize_document`` to each document."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with the tokenised form of every document in ``X``.

        ``X`` is wrapped in a ``pandas.Series`` first, so a single string is
        treated as one document.
        """
        documents = pd.Series(X)
        return [tokenize_document(document) for document in documents]

    
class WordsEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that turns texts into sequences of integer word indices.

    Parameters
    ----------
    top_words : int
        Passed to the Keras ``Tokenizer`` as its ``num_words`` argument.
    """

    def __init__(self, top_words = 20000):
        self.top_words = top_words

    def fit(self, X, y=None, **fit_params):
        """Fit a Keras ``Tokenizer`` on ``X`` and keep it as ``self.encoder_``."""
        self.encoder_ = Tokenizer(num_words=self.top_words)
        self.encoder_.fit_on_texts(X)
        return self

    def transform(self, X, **transform_params):
        """Encode the texts in ``X`` with the fitted tokenizer."""
        fitted_encoder = self.encoder_
        return fitted_encoder.texts_to_sequences(X)
    
    
class Padder(BaseEstimator, TransformerMixin):
    """Pipeline step that brings every index sequence to the same length.

    Parameters
    ----------
    max_sequence_length : int
        Passed to ``pad_sequences`` as ``maxlen``.
    """

    def __init__(self, max_sequence_length = 500):
        self.max_sequence_length = max_sequence_length

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Pad/truncate the sequences in ``X`` with ``pad_sequences``' default settings.

        The sequences are handed over as a plain list: wrapping ragged lists
        in ``np.array`` is deprecated and raises on recent NumPy versions.
        """
        return sequence.pad_sequences(list(X), maxlen = self.max_sequence_length)

In [None]:
# Raw text -> cleaned words -> integer word indices -> fixed-length sequences.
preprocessing_steps = [
    ('tokenizer', DocTokenizer()),
    ('encoder', WordsEncoder()),
    ('padder', Padder()),
]
preprocessing_pipeline = Pipeline(preprocessing_steps)

# Fit on the training texts only.
preprocessing_pipeline.fit(train_df["text"])

## Model

In [None]:
#load glove embeddings and compute Embedding Matrix

# Each line of the GloVe file holds a word followed by its vector components.
# The encoding is given explicitly so reading does not depend on the platform
# default, and the context manager closes the file even if parsing fails.
glove_embeddings = {}
with open(glove_embedding_dir, encoding="utf-8") as glove_file:
    for line in glove_file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        glove_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

word_index = preprocessing_pipeline.named_steps['encoder'].encoder_.word_index
# Vector size taken from any entry rather than from one specific word.
embedding_dim = len(next(iter(glove_embeddings.values())))

# One row per index in word_index plus row 0, which no word in word_index uses.
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for word, i in word_index.items():
    embedding_vector = glove_embeddings.get(word)

    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
# create the model
model = Sequential()

# Embedding layer initialised with the GloVe matrix; trainable=True lets the
# vectors be updated during training.
model.add(Embedding(np.shape(embedding_matrix)[0],
                    np.shape(embedding_matrix)[1],
                    weights=[embedding_matrix],
                    trainable=True))
model.add(LSTM(100))

# Two dense blocks, each followed by dropout and batch normalisation.
model.add(Dense(70, activation='relu'))
model.add(Dropout(rate=0.25))
model.add(BatchNormalization())
model.add(Dense(30, activation='relu'))
model.add(Dropout(rate=0.3))
model.add(BatchNormalization())

# One softmax output per class.
model.add(Dense(5, activation='softmax'))


model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
model.summary()

In [None]:
# Inputs: padded index sequences produced by the fitted preprocessing pipeline.
X_train = preprocessing_pipeline.transform(train_df.text)
X_test = preprocessing_pipeline.transform(test_df.text)

# Targets: one-hot encoded class ids.
y_train = to_categorical(train_df["class"])
y_test = to_categorical(test_df["class"])

model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=128,
    validation_split=0.25,
    shuffle=True,
)

In [None]:
# The model was compiled with the 'accuracy' metric, so scores[1] is the accuracy.
scores = model.evaluate(X_test, y_test, verbose=0)
test_accuracy_percent = scores[1]*100
print("Accuracy on test set: %.2f%%" % test_accuracy_percent)

In [None]:
# Persist the trained network and read it back.
model.save("model")
loaded_model = load_model("model")

# Persist the fitted preprocessing pipeline and read it back.
# NOTE: joblib files are pickles - only load files you created yourself.
joblib.dump(preprocessing_pipeline, "preprocessing_pipeline.pkl")
loaded_preprocessing_pipeline = joblib.load('preprocessing_pipeline.pkl')

In [None]:
# Class id -> class name, in the order of the ids assigned when the dataset is loaded.
categories = dict(enumerate(("business",
                             "entertainment",
                             "politics",
                             "sport",
                             "tech")))

# Hand-picked sample articles, one per category, used to try out the saved model.
# Each text is kept on a single line: a plain quoted string cannot span several lines.
politics_example = "But there was disagreement over the rights of future family members - meaning children born in the future to EU citizens in the UK - and the exports of certain social benefits. The EU wants rights currently enjoyed by EU citizens in the UK - access to healthcare, welfare, education - to apply to children and family members, whether they currently live in the UK or not, and to continue in perpetuity, after the death or divorce of the rights-holder. The UK wants to give all EU nationals living in the UK the same rights as British citizens once they have been resident in the country for five years, as long as they arrived before a specified cut-off date, probably 29 March 2017, when Article 50 was triggered. After this date, they could continue to build up their five years' entitlement if necessary. The EU position also requires that citizens live in a host country for five years before acquiring permanent residency rights. In addition, EU nationals who get married after March 2019 would lose the right to bring family members to the UK, unless they pass an income test, like non-EU migrants. They could also risk losing their right to return to Britain if they leave for more than two years. David Davis said the UK had published its approach to citizens rights since the first round of negotiations, which he described as both a fair and serious offer and had now published a joint paper setting out areas of agreement, and issues for further talks. He said sticking points in the talks included the rights of employees of EU-based companies to work for extended periods in other countries, such as the UK. British officials highlighted that British expats would lose the rights to vote and stand in local elections under the EU plan - while the UK position is to protect the rights of citizens to vote and stand in elections in their host country."
tech_example = "Kodi is a free, legal media player for computers but software add-ons that in some cases make it possible to download pirated content. The Complete Guide to Kodi magazine instructs readers on how to download such add-ons. Dennis Publishing has not yet responded to a BBC request for comment. The magazine is available at a number of retailers, including WH Smith, Waterstones and Amazon and was spotted on sale by cyber-security researcher Kevin Beaumont. It repeatedly warns readers of the dangers of accessing pirated content online, but one article lists a series of software packages alongside screenshots promoting free TV, popular albums and world sport. Check before you stream and use them at your own risk, the guide says, before adding that readers to stay on the right side of the law. A spokesman for Fact said the body was working with the City of London Police's Intellectual Property Crime Unit (Pipcu) as it made enquiries. We are fully aware of this magazine and have already been in communication with Dennis Publishing regarding our concerns that it signposts consumers to copyright infringing add-ons, said Kieron Sharp, chief executive of Fact. it is concerning that the magazine's content provides information to consumers on add-ons that would potentially allow criminality to take place, he added. In April, the European Court of Justice (ECJ) ruled that selling devices pre-configured with add-ons allowing access to pirated content is illegal, and that streaming such content was also against the law. Two of the add-ons listed in the article are on a banned list maintained by the Kodi developers. We don't support piracy add-ons and so we don't like the idea of someone selling a magazine encouraging people to use them, said Nate Bentzen, Kodi's community and project manager. I am a bit surprised anyone is still selling a magazine like this physically, given all the lawsuits and the recent EU court decision, he added. WHSmith declined to comment but the BBC understands that the newsagent has no plans to stop sales of the magazine. In February, it was reported that five people had been arrested and accused of selling set-top boxes with modified versions of Kodi allowing them to stream subscription football matches, TV channels and films for free."
entertainment_example = "For a generation growing up in the early 2000s, it would have been hard not to find someone who didn't own a copy of the band's debut album Hybrid Theory. It's sold more than 30 million copies worldwide and remains one of the biggest selling albums released since the start of the millennium. Linkin Park's successful trick was to fuse elements of metal and rock with rap and hip-hop to shape the nu-metal genre on songs such as Crawling, In The End and Numb. Arguably their biggest asset was Chester's powerhouse voice. He had a huge, raspy vocal which suited their stadium-filling, singalong anthems. Whilst his vocal persona could be described as angry and harsh, in person he was warm, articulate and funny. The band's most recent album, One More Light, saw a different direction as they worked with prolific pop songwriters Julia Michaels and Justin Tranter - and collaborated with UK grime artist Stormzy."
sports_example = "Sherida Spitse scored the only goal from the penalty spot after Danielle van de Donk was tripped in the box. Arsenal's Sari van Veenendaal made a superb save to deny Denmark's Pernille Harder, while Nadia Nadim headed straight at the goalkeeper late on. Two-time champions Norway are on the brink of elimination after a surprise 2-0 defeat by Belgium. Elke van Gorp came from an offside position to poke in from Ingrid Hjelmseth's parry and give debutants Belgium the lead on 59 minutes. They doubled their lead through Janice Cayman's free header after the Norwegian defenders failed to clear. Norway's best chance fell to Caroline Graham Hansen, but the forward volleyed over the crossbar from close range. They were on top in the first half as Andrine Hegerberg had a flicked effort pushed away, but her sister Ada - the BBC Women's Footballer of the Year - struggled to make an impact. Martin Sjogren's Norway side - runners-up in 2013 - lost to hosts Netherlands in their first game and must now beat Denmark on Monday to have any chance of progressing. Meanwhile, a point for Netherlands in their final group game against Belgium will seal progression to the knockout stages."
business_example = "Chinese media have mocked US President Donald Trump over plans to impose 25% tariffs on $50bn worth of Chinese goods, saying wise men build bridges but fools build walls. Mr Trump announced the tariffs on Friday, accusing Beijing of intellectual copyright theft. China retaliated, saying it would impose an additional 25% tariff on 659 US goods worth $50bn. Stock markets fell after the announcements amid fear of a trade war. The US had earlier warned that it will impose even more tariffs should China retaliate. Mr Trump said the tariffs were essential to preventing further unfair transfers of American technology and intellectual property to China, which will protect American jobs. The Chinese product lines that have been hit range from aircraft tyres to turbines and commercial dishwashers. What is a trade war and why should I worry China vows fast response to US tariffs G7 summit ends in disarray over tariffs US tariffs a dangerous game, says EU State-controlled media made a concerted attack on the new US measures. Following the path of expanding and opening up is China's best response to the trade dispute between China and the United States, and is also the responsibility that major countries should have to the world, said an editorial in Xinhua news agency. The wise man builds bridges, the fool builds walls, it commented. Social media users were quick to make light of the comment, with many making reference to the Great Wall of China."

# The pipeline works on a collection of documents, so the single article is
# passed as a one-element list.
processed_text = loaded_preprocessing_pipeline.transform([entertainment_example])

# predict() returns the softmax output of the network, i.e. one probability per
# class; it is used instead of predict_proba(), which newer Keras releases no
# longer provide. Multiplying by 100 turns the probabilities into percentages.
probas = loaded_model.predict(processed_text, verbose=0)*100

# Label each percentage through the id -> name mapping rather than repeating
# the class names and indices by hand.
for class_id, class_name in sorted(categories.items()):
    print('{0}: {1:.2f}%'.format(class_name, probas[0][class_id]))