In [1]:
import operator
import string
import warnings

import gensim
import keras
import nltk
import numpy as np
import pandas as pd
from keras.models import load_model
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [2]:
def preprocess(text):
    """Turn a raw English text into a list of normalized tokens.

    Each token produced by NLTK's sentence and word tokenizers is processed
    in this order: lowercased, stripped of every ASCII punctuation character
    and digit, lemmatized with the WordNet lemmatizer (default part of
    speech), and finally dropped if it is empty or an NLTK English stop word.

    Parameters
    ----------
    text : str
        The raw text to process.

    Returns
    -------
    list of str
        The surviving tokens, in their original order.
    """
    # A set gives O(1) membership tests; the original list was scanned
    # linearly once per token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # One translation table that deletes all punctuation and digits in a
    # single pass, instead of one str.replace call per character.
    strip_table = str.maketrans('', '', string.punctuation + string.digits)

    output = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            # Lowercase, then remove punctuation and digits.
            token = word.lower().translate(strip_table)

            # Lemmatization.
            token = lemmatizer.lemmatize(token)

            # Stop-word filtering deliberately happens last, on the cleaned
            # and lemmatized form. NOTE(review): presumably the training data
            # was preprocessed in this same order, so do not reorder the steps.
            if token and token not in stop_words:
                output.append(token)

    return output

### Loading

In [3]:
%%time
# Read the training set and pick the text column the vectorizer is fitted on.
train_set = pd.read_csv("dataset_train_pp.csv")
train_x = train_set["Description"]

# Vocabulary is capped at 8000 terms; terms seen in fewer than 10 documents
# are ignored.
tf_idf = TfidfVectorizer(max_features=8000, min_df=10)

# fit() returns the vectorizer itself, so `tfidf` and `tf_idf` refer to the
# same fitted object; `tfidf` is the name used by inference().
tfidf = tf_idf.fit(train_x)

CPU times: user 5.78 s, sys: 172 ms, total: 5.95 s
Wall time: 5.97 s


In [6]:
%%time
# Load the saved Keras model from disk; inference() calls nn_model.predict()
# on the tf-idf vectors and maps the arg-max output index to a topic label.
nn_model = load_model("nn_tfidf_8000.h5")

CPU times: user 2.33 s, sys: 20.3 ms, total: 2.35 s
Wall time: 2.35 s


In [7]:
def inference(input_text):
    """Classify news snippets and print predicted labels and confidences.

    Parameters
    ----------
    input_text : iterable of sequences
        Each item is a sequence whose first element is the raw text to
        classify, e.g. ``[['first text'], ['second text']]``. Any iterable
        is accepted (it is consumed exactly once).

    Returns
    -------
    None
        Two lines are printed: the list of predicted labels, then the list of
        the highest class score for each item, in input order.

    Warns
    -----
    UserWarning
        If an item has no term in the tf-idf vocabulary (for example an empty
        string). Its feature vector is all zeros, so the printed prediction
        does not depend on the text at all.
    """
    # Model output column k corresponds to label k + 1.
    result = {1:'World', 2:'Sports', 3:'Business', 4:'Science', 5:'Corona'}

    # Preprocess each text and re-join its tokens into one space-separated
    # string, the form the tf-idf vectorizer consumes.
    documents = [' '.join(preprocess(item[0])) for item in input_text]

    # Nothing to classify: keep the two-line output shape without calling
    # the vectorizer or the model on an empty batch.
    if not documents:
        print([])
        print([])
        return

    text_vec = tfidf.transform(documents)

    # tf-idf weights are non-negative, so a zero row sum means the row is
    # entirely zero (no known vocabulary term in that document).
    row_weight = np.asarray(text_vec.sum(axis=1)).ravel()
    empty_rows = [idx for idx, weight in enumerate(row_weight) if weight == 0]
    if empty_rows:
        warnings.warn(
            "No known vocabulary terms in input item(s) %s; their predictions "
            "come from an all-zero feature vector and are not meaningful."
            % empty_rows,
            stacklevel=2,
        )

    predictions = nn_model.predict(text_vec)
    print([result[r] for r in predictions.argmax(axis=1)+1])
    # Row-wise maximum taken from the prediction array itself, so the input
    # does not need to support len().
    print(list(predictions.max(axis=1)))

### Inference

#### DW News

In [16]:
# Corona: five DW News snippets about COVID-19.
# Each snippet is wrapped in its own one-element list because inference()
# reads the text from index 0 of every item.
input_text = [['As France enters the "Green Zone" lower risk state against COVID-19 on Monday, Emmanuel Macron vows to prop up the economy along with the rest of Europe. France has been hit hard by the novel coronavirus.'],
              ['COVID-19 has taken a metaphorical sledgehammer to global tourism, although European nations are trying to revive the industry. Not so in Ireland or the UK, where stringent quarantine rules further threaten the sector.'],
              ['A man has died after becoming infected with COVID-19 at a Pentecost service in the northern city of Bremerhaven. This isnt the first time a church in Germany has been at the center of an outbreak.'],
              ['COVID-19 studies are being uploaded in great numbers to preprint servers without lengthy peer review processes. Is that good or bad? The fact is that there is no such thing as an absolute guarantee for good research.   '],
              ['In South Asian nations like Pakistan, where child labor is rampant, COVID-19 has brought more hardship to underage workers. Meanwhile, the resulting economic crisis is pushing even more children into child labor.   ']
             ]

In [17]:
%%time
# Classify the five COVID-19 snippets held in `input_text`; prints the
# predicted labels followed by the top class score for each snippet.
inference(input_text)

['Corona', 'Corona', 'Corona', 'Corona', 'Corona']
[1.0, 1.0, 1.0, 0.9999888, 0.9998292]
CPU times: user 18.1 ms, sys: 3.91 ms, total: 22 ms
Wall time: 20.1 ms


In [10]:
# Sports: five DW News snippets about football (expected label 'Sports').
# Each snippet is wrapped in its own one-element list because inference()
# reads the text from index 0 of every item.
input_text = [['Bayern Munich will win the title for an eighth straight time if they can win for the 11th straight time when they visit Bremen on Tuesday. Elsewhere, the fight for survival has heated up — but Paderborn are all but down.   '],
              ['Bayern Munich are poised to secure an eighth consecutive league title this week, but there is still much to be decided in the Bundesliga with three games to go. DW analyzes the race for Europe and the relegation battle.   '],
              ['A week after Weston McKennie, Jadon Sancho and others delivered individual statements of support to the Black Lives Matter movement, Bundesliga clubs showed their collective solidarity.   '],
              ['There was early drama as RB Leipzig won on Julian Nagelsmanns first return to Hoffenheim. While the defeat dents Hoffenheim’s hopes of European football, Leipzig are on track for a Champions League place.   '],
              ['Bad news for the chauvinists at the football table. Scientists from Germanys Sport University Cologne have proved that women who play football can implement tactical approaches just as well as men.   ']
             ]

In [11]:
%%time
# Classify the five football snippets held in `input_text`.
inference(input_text)

['Sports', 'Sports', 'Sports', 'Sports', 'Sports']
[0.9999999, 0.99182665, 0.954304, 0.9998791, 1.0]
CPU times: user 19.9 ms, sys: 225 µs, total: 20.2 ms
Wall time: 18.3 ms


In [60]:
# Business: five snippets on insolvency law, Wirecard, Nord Stream 2
# sanctions, game consoles and Nord Stream 2 construction.
# Each snippet is wrapped in its own one-element list because inference()
# reads the text from index 0 of every item.
input_text = [['The principal bench of the National Company Law Tribunal (NCLT) in New Delhi ruled that the liquidator has overriding powers under the Insolvency and Bankruptcy Code to take over both movable and immovable assets of a corporate debtor.'],
              ['Shares in German payment service provider Wirecard lost more than half their value within minutes on Thursday after the DAX-listed company said it was not possible for it to publish a delayed annual report due to worrisome audit data.'],
              ['Berlin says it regrets a US plan to expand sanctions on the Nord Stream 2 gas pipeline. US senators announced new sanctions on the project last week, saying the pipeline would boost Moscow’s influence in Europe.'],
              ['From bulky spaceship-like devices to sleek black boxes, consoles have come a long way in recent decades. That has gone hand in hand with the targeting of new products not just to kids, but to adults too.'],
              ['Nord Stream 2, which was originally scheduled to start delivering gas from Russia to Western Europe toward the end of 2019, is almost completed. Of a total of 2,360 kilometers (1,466 miles), 2,200 kilometers of the pipeline have been laid.']
             ]

In [61]:
%%time
# Classify the five business snippets held in `input_text`.
# NOTE: in the recorded output the fifth snippet (Nord Stream 2 construction
# progress) is labelled 'Science' with a score of ~0.9999, i.e. a confident
# misclassification.
inference(input_text)

['Business', 'Business', 'Business', 'Business', 'Science']
[0.9999924, 0.99999964, 0.9999918, 0.91551155, 0.9999361]
CPU times: user 24.1 ms, sys: 3.7 ms, total: 27.8 ms
Wall time: 23.2 ms


In [62]:
# Science: five snippets on Mars travel, Mayan archaeology, the SpaceX
# Demo-2 mission, the oceans and brain tumors.
# Each snippet is wrapped in its own one-element list because inference()
# reads the text from index 0 of every item.
input_text = [['Every 18 to 24 months, Earth and Mars align in such a way as to make deep-space travel that little bit easier, or at least a bit faster. That reduces a trip or "trajectory" to the Red Planet from about nine months down to seven.'],
              ['The impressive pyramid-style cities of the ancient Mayan culture, such as at Tikal in Guatemala, can be found described in any travel book.But the many of the other monumental buildings, houses, roads and paths, water works and drainage systems, and terraces still lay hidden in dense rain forest.'],
              ['Everything about this NASA SpaceX Demo-2 mission is symbolic. It seems that every effort has been made to draw a direct parallel between the last human spaceflight from America, and the Apollo moon missions before that.'],
              ['Heres a simple fact to start: The oceans are huge. Oceans make up about 96.5% of all Earths water. Theres fresh water in the planet, in the ground or elsewhere on land in rivers and lakes — more than 70% of the planet is covered in water — and theres more all around us in the atmosphere. But the oceans are simply huge.'],
              ['Second only to leukemia, brain tumors are top of the list of common forms of cancer in children and the young. The German Brain Tumor Association says 25% of all cancer diagnoses in the young involve tumors in the brain and central nervous system. Its often kids at the age of six-and-a-half, and boys more often than girls.']
             ]

In [63]:
%%time
# Classify the five science snippets held in `input_text`.
# NOTE: in the recorded output the second snippet (Mayan cities) is labelled
# 'Business' with a score of ~0.99998, i.e. a confident misclassification.
inference(input_text)

['Science', 'Business', 'Science', 'Science', 'Science']
[0.99999917, 0.99997807, 1.0, 1.0, 0.9998184]
CPU times: user 22.2 ms, sys: 140 µs, total: 22.3 ms
Wall time: 26.3 ms


In [88]:
# World: five snippets on international politics (Zimbabwe, India, the UN
# report on children in armed conflict, an ECJ ruling, a UN Security Council
# vote).
# Each snippet is wrapped in its own one-element list because inference()
# reads the text from index 0 of every item.
input_text = [['Three opposition activists from the Movement for Democratic Change-Alliance (MDC-Alliance) disappeared in May after being detained by police while on their way to an anti-government protest The women were found badly injured outside the capital Harare nearly 48 hours later and immediately hospitalized. They say they were abducted, sexually abused and forced to drink their urine.'],
              ['Javed Akhtar, 75, has been vocal about his views on politics, religion and public life and has often spoken out against religious fundamentalism and restrictions on freedom of speech. He has also heavily criticized communalism within Islam while denouncing the anti-Muslim sentiment advocated by the Hindu right.'],
              ['UN Secretary-General Antonio Guterres annual report on children and armed conflict, issued at the start of the week, featured a slight tweak for the year: the Saudi-led coalition waging war in Yemen was omitted from its list of offenders.Dubbed the "list of shame," this annex to the report names groups that fail to comply with measures aimed at ensuring the safety of children in armed conflict.'],
              ['The European Court of Justice (ECJ) ruled Thursday that a Hungarian law concerning the foreign funding of non-governmental organizations (NGOs) was illegal. Hungarys restrictions on the funding of civil organisations by persons established outside that member state do not comply with the Union law, the Luxembourg-based court said in a statement.'],
              ['United Nations members voted in four new members of the powerful Security Council in New York on Wednesday, but failed to decide on which African nation should fill the African regional seat up for grabs. In Wednesdays vote, Kenya received 113 votes while Djibouti got 78. With both failing to gain the two-thirds majority needed to win the Africa seat on the council, the two countries will face off on in a second round of voting on Thursday morning.']
             ]

In [87]:
%%time
# Classify the five world-news snippets held in `input_text`.
# NOTE(review): this cell last ran as In [87], before the input cell above
# was last executed (In [88]), so the recorded output may belong to different
# texts than the ones shown. Its fourth score (0.99450904) is identical to
# the score recorded for an empty document further down. Re-run both cells
# in order to confirm.
inference(input_text)

['World', 'World', 'World', 'World', 'World']
[1.0, 0.99952006, 0.9990018, 0.99450904, 1.0]
CPU times: user 28.6 ms, sys: 0 ns, total: 28.6 ms
Wall time: 26.9 ms


In [84]:
# Edge case: a batch containing a single empty document.
input_text = [['']]

In [85]:
%%time
# Classify the empty document.
# NOTE(review): the recorded output is 'World' with a score of ~0.9945 even
# though there is no text — presumably the model's response to an all-zero
# feature vector. Treat predictions for empty input as meaningless.
inference(input_text)

['World']
[0.99450904]
CPU times: user 11.1 ms, sys: 209 µs, total: 11.3 ms
Wall time: 9.62 ms


In [None]:
# Placeholder batch of five empty documents (this cell was not executed in
# the saved notebook) — presumably a template for pasting in new snippets,
# one per inner list.
input_text = [[''],
              [''],
              [''],
              [''],
              ['']
             ]