In [1]:
import pandas as pd

from scripts.downloader import DownloaderAktualne, DownloaderIdnes

In [2]:
## Downloading the articles
# Config
url_aktualne = "https://zpravy.aktualne.cz/domaci/"
url_idnes = "https://www.idnes.cz/zpravy/domaci/"

path_aktualne = "data/articles_Aktualne.cz.csv"
path_idnes = "data/articles_iDnes.cz.csv"

In [3]:
# Init verbose DownloaderAktualne and DownloaderIdnes
d_aktualne = DownloaderAktualne(url_aktualne, save_to=path_aktualne, verbose=True)
d_idnes = DownloaderIdnes(url_idnes, save_to=path_idnes, verbose=True)

Succesfully initialized DownloaderAktualne.
Succesfully initialized DownloaderIdnes.


In [4]:
# Download demo for iDnes.cz
d_idnes.downloadHeadlines(from_page = 2, to_page = 5, bulk_size = 30)

# Download data approx. to January 2016
#d_aktualne.downloadHeadlines(from_page = 1, to_page = 1315, bulk_size = 30)
#d_idnes.downloadHeadlines(from_page = 1, to_page = 672, bulk_size = 30)

# Save downloads
d_aktualne.saveToCsv()
d_idnes.saveToCsv()

#pd.concat([d_aktualne.articles_df[100:102], d_idnes.articles_df[40:42]])

100%|██████████| 4/4 [00:05<00:00,  1.42s/it]


Articles were successfully downloaded. Access DataFrame '.articles_df' or json list '.articles'.


In [5]:
# Check that recent articles are downloaded: 
articles = pd.read_csv(path_idnes)
articles.tail()

Unnamed: 0,article_id,slug,date,time,is_updated,headline,excerpt,article_url,scraped_at
24357,A200509_080447_domaci_kuce,prehledne-pondeli-11-kvetna-akce-do-sto-osob-s...,2020-05-11,00:00:00,False,PŘEHLEDNĚ: Návrat k normálu. Otevírají se kade...,V pondělí 11. května končí některá opatření na...,https://www.idnes.cz/zpravy/domaci/prehledne-p...,18-05-2020 15:59:01
24358,A200510_201118_domaci_aug,duchod-penze-zvyseni-cesko-schillerova-malacova,2020-05-10,20:28:00,False,Důchody by se od ledna měly zvýšit o 800 korun...,Důchody v Česku by se od ledna měly zvýšit o v...,https://www.idnes.cz/zpravy/domaci/duchod-penz...,18-05-2020 15:59:01
24359,A200506_164944_domaci_kuce,jindrich-voboril-drogy-socialni-sluzby-koronav...,2020-05-10,18:55:00,False,"Začali blbnout, rozpili se ze ztráty práce. Sp...",Počet drogově závislých během současné koronav...,https://www.idnes.cz/zpravy/domaci/jindrich-vo...,18-05-2020 15:59:01
24360,A200507_172334_domaci_chtl,koronavirus-v-cesku-deti-v-karantene-doma-se-u...,2020-05-10,18:03:00,False,"Děti se doma učí víc, přesto si toho zapamatuj...","Téměř polovina dětí tvrdí, že se doma během no...",https://www.idnes.cz/zpravy/domaci/koronavirus...,18-05-2020 15:59:01
24361,A200507_142858_domaci_kane,pendleri-zahranici-cesky-obcan-koronavirova-kr...,2020-05-10,15:57:00,False,Trnitá cesta pro 60 tisíc lidí. Od chaotického...,Za prací dojíždí do zahraničí téměř 60 tisíc č...,https://www.idnes.cz/zpravy/domaci/pendleri-za...,18-05-2020 15:59:01


In [6]:
### Headlines Analysis

In [7]:
# H2O Word2vec model
import h2o
h2o.init()
from h2o.estimators.word2vec import H2OWord2vecEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "13.0.2" 2020-01-14; Java(TM) SE Runtime Environment (build 13.0.2+8); Java HotSpot(TM) 64-Bit Server VM (build 13.0.2+8, mixed mode, sharing)
  Starting server from /anaconda3/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/c4/jxchjt990lg9002_rg92j8tr0000gn/T/tmpysk76be2
  JVM stdout: /var/folders/c4/jxchjt990lg9002_rg92j8tr0000gn/T/tmpysk76be2/h2o_simon_started_from_python.out
  JVM stderr: /var/folders/c4/jxchjt990lg9002_rg92j8tr0000gn/T/tmpysk76be2/h2o_simon_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Europe/Prague
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.1
H2O_cluster_version_age:,1 month and 14 days
H2O_cluster_name:,H2O_from_python_simon_iice7r
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [8]:
# Importing Aktualne.cz headlines
headlines_aktualne = pd.read_csv(path_aktualne, usecols=["headline"], encoding="utf-8")
headlines_aktualne.insert(0, "source", "Aktualne.cz")

# Importing iDnes.cz headlines
headlines_idnes = pd.read_csv(path_idnes, usecols=["headline"], encoding="utf-8")
headlines_idnes.insert(0, "source", "iDnes.cz")

# Concat headlines
df_headlines = pd.concat([headlines_aktualne, headlines_idnes], ignore_index=True)

# Convert pd.DataFrame to H2O DataFrame
headlines = h2o.H2OFrame(df_headlines)

headlines_aktualne = h2o.H2OFrame(headlines_aktualne)
headlines_idnes = h2o.H2OFrame(headlines_idnes)

# Check data
pd.concat([df_headlines[:5], df_headlines[-5:]])

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


Unnamed: 0,source,headline
0,Aktualne.cz,Chceme vyšetřovací komisi k nákupu ochranných ...
1,Aktualne.cz,ANO si drží podporu 34 procent. Druzí jsou Pir...
2,Aktualne.cz,"Většinu Česka zasáhne v pondělí silný vítr, na..."
3,Aktualne.cz,Hasiči v Praze hasili požár odpadu v kovošrotu...
4,Aktualne.cz,Sokolov odstranil při pietním aktu vlajku USA....
48879,iDnes.cz,PŘEHLEDNĚ: Návrat k normálu. Otevírají se kade...
48880,iDnes.cz,Důchody by se od ledna měly zvýšit o 800 korun...
48881,iDnes.cz,"Začali blbnout, rozpili se ze ztráty práce. Sp..."
48882,iDnes.cz,"Děti se doma učí víc, přesto si toho zapamatuj..."
48883,iDnes.cz,Trnitá cesta pro 60 tisíc lidí. Od chaotického...


In [9]:
# Defines the stopwords
STOP_WORDS = ["bych", "", "ačkoli","ahoj","ale","anebo","ač","asi","aspoň","během","bez","beze","blízko","bohužel","brzo","bude","budeme","budeš","budete","budou","budu","byl","byla","byli","bylo","byly","bys","čau","chce","chceme","chceš","chcete","chci","chtějí","chtít","chut\u0027","chuti","co","čtrnáct","čtyři","dál","dále","daleko","děkovat","děkujeme","děkuji","den","deset","devatenáct","devět","do","dobrý","docela","dva","dvacet","dvanáct","dvě","hodně","já","jak","jde","je","jeden","jedenáct","jedna","jedno","jednou","jedou","jeho","její","jejich","jemu","jen","jenom","ještě","jestli","jestliže","jí","jich","jím","jimi","jinak","jsem","jsi","jsme","jsou","jste","kam","kde","kdo","kdy","když","ke","kolik","kromě","která","které","kteří","který","kvůli","má","mají","málo","mám","máme","máš","máte","mé","mě","mezi","mí","mít","mně","mnou","moc","mohl","mohou","moje","moji","možná","můj","musí","může","my","na","nad","nade","nám","námi","naproti","nás","náš","naše","naši","ne","ně","nebo","nebyl","nebyla","nebyli","nebyly","něco","nedělá","nedělají","nedělám","neděláme","neděláš","neděláte","nějak","nejsi","někde","někdo","nemají","nemáme","nemáte","neměl","němu","není","nestačí","nevadí","než","nic","nich","ním","nimi","nula","od","ode","on","ona","oni","ono","ony","osm","osmnáct","pak","patnáct","pět","po","pořád","potom","pozdě","před","přes","přese","pro","proč","prosím","prostě","proti","protože","rovně","se","sedm","sedmnáct","šest","šestnáct","skoro","smějí","smí","snad","spolu","sta","sté","sto","ta","tady","tak","takhle","taky","tam","tamhle","tamhleto","tamto","tě","tebe","tebou","ted\u0027","tedy","ten","ti","tisíc","tisíce","to","tobě","tohle","toto","třeba","tři","třináct","trošku","tvá","tvé","tvoje","tvůj","ty","určitě","už","vám","vámi","vás","váš","vaše","vaši","ve","večer","vedle","vlastně","všechno","všichni","vůbec","vy","vždy","za","zač","zatímco","ze","že","aby","aj","ani","az","budem","budes","by","byt","ci","clanek","clanku","clanky","coz","cz","dalsi","design","dnes","email","ho","jako","jej","jeji","jeste","ji","jine","jiz","jses","kdyz","ktera","ktere","kteri","kterou","ktery","ma","mate","mi","mit","muj","muze","nam","napiste","nas","nasi","nejsou","neni","nez","nove","novy","pod","podle","pokud","pouze","prave","pred","pres","pri","proc","proto","protoze","prvni","pta","re","si","strana","sve","svych","svym","svymi","take","takze","tato","tema","tento","teto","tim","timto","tipy","toho","tohoto","tom","tomto","tomuto","tu","tuto","tyto","uz","vam","vas","vase","vice","vsak","zda","zde","zpet","zpravy","a","aniž","až","být","což","či","článek","článku","články","další","i","jenž","jiné","již","jseš","jšte","k","každý","kteři","ku","me","ná","napište","nechť","ní","nové","nový","o","práve","první","přede","při","s","sice","své","svůj","svých","svým","svými","také","takže","te","těma","této","tím","tímto","u","v","více","však","všechen","z","zpět","zprávy"] # "ano" was left out

# Handles the tokenization
def tokenize(sentences, stop_word = STOP_WORDS):
    # Handlig Czech chracters, seperate by space instead of regex "\\W+"
    tokenized = sentences.tokenize(" ")
    tokenized = tokenized.rstrip()
    tokenized = tokenized.rstrip(".")
    tokenized = tokenized.rstrip(",")
    tokenized = tokenized.rstrip("!")
    tokenized = tokenized.rstrip("?")
    tokenized = tokenized.rstrip("-")
    tokenized = tokenized.rstrip(":")

    tokenized_cleaned = tokenized
    tokenized_lower = tokenized.tolower()
    tokenized_filtered = tokenized_lower[(tokenized_lower.nchar() >= 2) | (tokenized_lower.isna()),:]
    tokenized_words = tokenized_filtered[tokenized_filtered.grep("[0-9]",invert=True,output_logical=True),:]
    tokenized_words = tokenized_words[(tokenized_words.isna()) | (~ tokenized_words.isin(STOP_WORDS)),:]
    
    return tokenized_words

In [10]:
# Breaks down the string to sequence of words
words = tokenize(headlines["headline"])

words_aktualne = tokenize(headlines_aktualne["headline"])
words_idnes = tokenize(headlines_idnes["headline"])

words.describe

C1
vyšetřovací
komisi
nákupu
ochranných
pomůcek
shodla
opozice
ano
drží


<bound method H2OFrame.describe of >

In [11]:
# Builds word2vec model
w2v_model = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model.train(training_frame=words)

w2v_model_aktualne = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_aktualne.train(training_frame=words_aktualne)

w2v_model_idnes = H2OWord2vecEstimator(sent_sample_rate = 0.0, epochs = 10)
w2v_model_idnes.train(training_frame=words_idnes)

word2vec Model Build progress: |██████████████████████████████████████████| 100%
word2vec Model Build progress: |██████████████████████████████████████████| 100%
word2vec Model Build progress: |██████████████████████████████████████████| 100%


In [12]:
# Gives us synonym to a given word
word_to_find = "strany"
w2v_model_aktualne.find_synonyms(word_to_find, count = 10)

OrderedDict([('čssd', 0.719825029373169),
             ('birke', 0.6884865760803223),
             ('starostů', 0.6816575527191162),
             ('kandidátky', 0.6808851361274719),
             ('běhounek', 0.6792238354682922),
             ('černochová', 0.6779224276542664),
             ('kdu-čsl', 0.6772832870483398),
             ('místopředsedu', 0.6715683341026306),
             ('bartošek', 0.6656481027603149),
             ('občanští', 0.6643967032432556)])

In [13]:
w2v_model_idnes.find_synonyms(word_to_find, count = 10)

OrderedDict([('lidovců', 0.7193344831466675),
             ('kdu-čsl', 0.7079429626464844),
             ('obhájil', 0.7035784125328064),
             ('předsedu', 0.7008177042007446),
             ('pravice', 0.7003647089004517),
             ('kandidovat', 0.7002758383750916),
             ('koalici', 0.6919031739234924),
             ('ods', 0.6882467269897461),
             ('stan', 0.6852326393127441),
             ('gazdík', 0.6843449473381042)])

In [14]:
# Function for prediction
def predict(headlines, w2v, gbm):
    words = tokenize(h2o.H2OFrame(headlines).ascharacter())
    headlines_vec = w2v.transform(words, aggregate_method="AVERAGE")
    print(gbm.predict(test_data=headlines_vec))

In [15]:
# Calculates a vector for each headline
headlines_vecs = w2v_model.transform(words, aggregate_method = "AVERAGE")

In [16]:
# Preparing train and validation data
valid_headlines = ~ headlines_vecs["C1"].isna()
data = headlines[valid_headlines,:].cbind(headlines_vecs[valid_headlines,:])
data_split = data.split_frame(ratios=[0.8])

# Builds GBM model
gbm_model = H2OGradientBoostingEstimator()
gbm_model.train(x = headlines_vecs.names,
                y="source", 
                training_frame = data_split[0], 
                validation_frame = data_split[1])

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
test_article_source = "iDnes.cz"
test_article = "Když agendu určuje Novotný a výzvy ODS působí směšně"

print("Predict (article from %s)" % test_article_source)
print(predict([test_article], w2v_model, gbm_model))

Predict (article from iDnes.cz)
Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,Aktualne.cz,iDnes.cz
iDnes.cz,0.457224,0.542776



None
