In [1]:
# !/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
import re
import os
import pandas as pd
from time import gmtime, strftime
from models.RSS_classifier import classify, classify2

In [2]:
def get_articles_and_xml(url, timeout=30):
    """Download an RSS feed and split out its ``<item rdf:about=...>`` elements.

    Parameters
    ----------
    url : str
        Address of the RSS document to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get`` so a stalled server cannot hang
        the notebook indefinitely.

    Returns
    -------
    (list of str, str)
        The raw text of every ``<item rdf:about=...>...</item>`` element, in
        document order, and the complete response text.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of regex-scanning an error page
    # and silently returning an empty article list.
    page.raise_for_status()

    # DOTALL lets a single item span several lines.
    pattern = r'<item rdf:about=.*?</item>'
    article_list = re.findall(pattern, page.text, flags=re.DOTALL)
    return article_list, page.text

In [3]:
def _first_match(pattern, text, field):
    """Return the first ``re.findall`` hit of ``pattern`` in ``text``.

    Raises ``IndexError`` (the exception a bare ``[0]`` lookup on an empty
    result would raise) with a message naming the missing ``field``.
    """
    found = re.findall(pattern, text, flags=re.DOTALL)
    if not found:
        raise IndexError('no %s found in RSS item' % field)
    return found[0]


def extract_item_info(xml_string):
    """Pull the fields of interest out of one RSS ``<item>`` element.

    Parameters
    ----------
    xml_string : str
        Raw text of a single ``<item>...</item>`` element.

    Returns
    -------
    dict
        ``url``       - first ``http(s)://arxiv.org/abs/<digits>.<digits>`` link,
        ``title``     - text of ``<title>`` up to ``(arXiv:``, stripped,
        ``abstract``  - body of the ``<description rdf:...>`` element, unmodified,
        ``authors``   - names found inside the escaped ``<a>`` tags of
                        ``<dc:creator>``, joined with ``', '``,
        ``full_text`` - ``xml_string`` itself.

    Raises
    ------
    IndexError
        If the URL, title, description or creator element is missing.
    """
    # Raw strings throughout: sequences such as "\d" or "\." in a normal
    # string literal are invalid escapes and trigger warnings on modern Python.
    item_dict = {}
    item_dict['url'] = _first_match(
        r'https?://arxiv\.org/abs/\d{1,8}\.\d{1,8}', xml_string, 'arXiv abstract URL')
    item_dict['title'] = _first_match(
        r'<title>(.*?)[(]arXiv:', xml_string, 'title').strip()
    item_dict['abstract'] = _first_match(
        r'<description rdf:.*?>(.*?)</description>', xml_string, 'description')

    creators = _first_match(r'<dc:creator>(.*?)</dc:creator>', xml_string, 'dc:creator')
    # Each author is an HTML-escaped link; the name sits between the escaped
    # closing quote + '>' of the opening tag and the escaped '</a>'.
    author_names = re.findall(r'&quot;&gt;(.*?)&lt;/a&gt;', creators, flags=re.DOTALL)
    item_dict['authors'] = ', '.join(author_names)

    item_dict['full_text'] = xml_string
    return item_dict

In [4]:
def get_feed_parts(xml_string):
    """Cut a feed into reusable header/footer pieces for the two output feeds.

    Parameters
    ----------
    xml_string : str
        Complete text of the source RSS document.

    Returns
    -------
    (str, str, str, str)
        ``beginning_good`` - everything up to and including ``<rdf:Seq>\\n``
                             with title/description rewritten for the good feed,
        ``beginning_bad``  - the same header rewritten for the bad feed,
        ``middle``         - the span from ``</rdf:Seq>`` through ``</image>``,
        ``end``            - the closing ``</rdf:RDF>`` tag.

    Raises
    ------
    IndexError
        If ``<rdf:Seq>\\n`` or the ``</rdf:Seq>...</image>`` span is absent.
    """
    beginning = re.findall(r'.*?<rdf:Seq>\n', xml_string, flags=re.DOTALL)[0]
    middle = re.findall(r'</rdf:Seq>.*?</image>', xml_string, flags=re.DOTALL)[0]
    end = '</rdf:RDF>'

    def _rebrand(title, description):
        # Replace every <title> in the header, then swap the description
        # body while keeping whatever attributes the opening tag carried.
        # A callable replacement avoids backslash templates such as "\g<1>",
        # which are invalid escapes in a non-raw string literal.
        retitled = re.sub(r'<title>(.*?)</title>',
                          '<title>' + title + '</title>',
                          beginning, flags=re.DOTALL)
        return re.sub(
            r'<description(.*?)>(.*?)</description>',
            lambda m: '<description' + m.group(1) + '>' + description + '</description>',
            retitled, flags=re.DOTALL)

    beginning_good = _rebrand('Good papers from arXiv',
                              'The Good Papers on ML, AI and Statistics')
    beginning_bad = _rebrand('Boring papers from arXiv',
                             'The Bad Papers on ML, AI and Statistics')

    return beginning_good, beginning_bad, middle, end

In [5]:
def build_feeds(xml_string, articles_info_classified_pd):
    """Assemble the "good" and "bad" feed documents from classified articles.

    The skeleton (two headers, middle section, closing tag) comes from
    ``get_feed_parts(xml_string)``. Every row of
    ``articles_info_classified_pd`` contributes an ``<rdf:li>`` entry built
    from its ``url`` and an item body taken from its ``full_text``. Rows whose
    ``class`` compares equal to ``True`` go to the good feed, all others to
    the bad feed.

    Returns
    -------
    (str, str)
        ``(good_text, bad_text)``.
    """
    head_good, head_bad, middle, tail = get_feed_parts(xml_string)

    # Slot 1 collects rows classified True, slot 0 everything else.
    seq_entries = ([], [])
    item_bodies = ([], [])
    for record in articles_info_classified_pd.to_dict(orient='records'):
        entry = '<rdf:li rdf:resource="' + record['url'] + '"/>'
        slot = 1 if record['class'] == True else 0
        seq_entries[slot].append(entry)
        item_bodies[slot].append(record['full_text'])

    def _assemble(head, slot):
        # header + <rdf:li> list + middle + item bodies + closing tag
        return (head
                + '\n'.join(seq_entries[slot])
                + middle
                + '\n'.join(item_bodies[slot])
                + tail)

    return _assemble(head_good, 1), _assemble(head_bad, 0)

In [6]:
# Feeds to poll; add or remove entries freely - nothing below depends on
# how many there are.
urls = [
    'http://arxiv.org/rss/cs.NE',
    'http://arxiv.org/rss/cs.AI',
    'http://arxiv.org/rss/stat.ML'
]
# Output file names for the two generated feeds.
good_feed_name = 'papers_good.xml'
bad_feed_name = 'papers_bad.xml'

articles_info_classified_list = []
for url in urls:
    # NOTE: xml_string is overwritten on every pass, so after the loop it
    # holds the XML of the last feed in `urls`; the cell that calls
    # build_feeds uses that text as the header template.
    article_list, xml_string = get_articles_and_xml(url)

    articles_info_list = [extract_item_info(article) for article in article_list]

    articles_info_pd = pd.DataFrame(articles_info_list)
    articles_info_classified_list.append(classify2(articles_info_pd))

# Concatenate every per-feed frame (previously indices 0..2 were hard-coded,
# which broke or silently dropped feeds whenever `urls` changed length).
articles_info_classified_pd = pd.concat(articles_info_classified_list)

# The same paper can be cross-listed in several feeds; keep its first occurrence.
articles_info_classified_pd = articles_info_classified_pd.drop_duplicates(subset=['url'])

{'svc': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])}
{'svc': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])}
{'svc': Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-

In [7]:
# Show the classified, de-duplicated articles as this cell's output.
articles_info_classified_pd

Unnamed: 0,abstract,authors,full_text,title,url,class
0,&lt;p&gt;Convolutional auto-encoders have show...,"Yanan Sun, Bing Xue, Mengjie Zhang","<item rdf:about=""http://arxiv.org/abs/1712.050...",A Particle Swarm Optimization-based Flexible C...,http://arxiv.org/abs/1712.05042,True
1,&lt;p&gt;Deep Learning (DL) aims at learning t...,"Yanan Sun, Gary G. Yen, Zhang Yi","<item rdf:about=""http://arxiv.org/abs/1712.050...",Evolving Unsupervised Deep Neural Networks for...,http://arxiv.org/abs/1712.05043,True
2,&lt;p&gt;Fully connected multilayer perceptron...,V.I. Avrutskiy,"<item rdf:about=""http://arxiv.org/abs/1712.050...",Neural networks catching up with finite differ...,http://arxiv.org/abs/1712.05067,True
3,&lt;p&gt;To harness the complexity of their hi...,"Freek Stulp, Pierre-Yves Oudeyer","<item rdf:about=""http://arxiv.org/abs/1712.052...",Proximodistal Exploration in Motor Learning as...,http://arxiv.org/abs/1712.05249,True
4,&lt;p&gt;Many biological and cognitive systems...,"Miguel Aguilera, Manuel G. Bedia","<item rdf:about=""http://arxiv.org/abs/1712.052...",Adaptation to criticality through organization...,http://arxiv.org/abs/1712.05284,True
5,&lt;p&gt;Satellite imagery and remote sensing ...,"Sam Kriegman, Marcin Szubert, Josh C. Bongard,...","<item rdf:about=""http://arxiv.org/abs/1706.078...",Evolving Spatially Aggregated Features from Sa...,http://arxiv.org/abs/1706.07888,True
6,&lt;p&gt;Sensor-based activity recognition see...,"Jindong Wang, Yiqiang Chen, Shuji Hao, Xiaohui...","<item rdf:about=""http://arxiv.org/abs/1707.035...",Deep Learning for Sensor-based Activity Recogn...,http://arxiv.org/abs/1707.03502,True
7,"&lt;p&gt;In this paper, we present a black-box...","Ishai Rosenberg, Asaf Shabtai, Lior Rokach, Yu...","<item rdf:about=""http://arxiv.org/abs/1707.059...",Generic Black-Box End-to-End Attack Against St...,http://arxiv.org/abs/1707.05970,True
8,&lt;p&gt;Keyword spotting (KWS) is a critical ...,"Yundong Zhang, Naveen Suda, Liangzhen Lai, Vik...","<item rdf:about=""http://arxiv.org/abs/1711.071...",Hello Edge: Keyword Spotting on Microcontrollers.,http://arxiv.org/abs/1711.07128,True
9,&lt;p&gt;Stochastic Gradient Descent (SGD) is ...,"Chen Huang, Chen Kong, Simon Lucey","<item rdf:about=""http://arxiv.org/abs/1712.025...",CNNs are Globally Optimal Given Multi-Layer Su...,http://arxiv.org/abs/1712.02501,True


In [None]:
# Build the two feed documents from the last downloaded feed's XML skeleton
# and the classified articles.
good_text, bad_text = build_feeds(xml_string, articles_info_classified_pd)

# Write explicitly as UTF-8 so non-ASCII characters (e.g. in author names)
# do not depend on, or fail under, the platform's default encoding.
# NOTE(review): assumes the source feed declares UTF-8 or no encoding at all
# (UTF-8 is XML's default in that case) - confirm against the feed header.
for feed_path, feed_text in ((good_feed_name, good_text), (bad_feed_name, bad_text)):
    with open(feed_path, 'w', encoding='utf-8') as feed_file:
        feed_file.write(feed_text)

In [None]:
import git

# Stage everything in the repository that holds the generated feeds.
repo = git.Repo('../rss-classifier')
print(repo.git.add('.'))

# Commit message carries a UTC timestamp, e.g. "Update at 20171215_063000".
timestring = strftime("%Y%m%d_%H%M%S", gmtime())
message = 'Update at ' + timestring

# `git commit` exits non-zero (GitPython raises GitCommandError) when nothing
# is staged, which happens whenever the regenerated feeds are unchanged.
# Only commit if the index actually differs from HEAD.
if repo.is_dirty(index=True, working_tree=False, untracked_files=False):
    print(repo.git.commit(m=message))
else:
    print('Nothing to commit')

# Push regardless, so commits left unpushed by an earlier run still go out.
print(repo.git.push())
print(repo.git.status())