In [10]:
#classes for cleaning feed text and for getting & parsing RSS feeds
import feedparser as fp
import pandas as pd
import html
from nltk.tokenize import TreebankWordTokenizer,WhitespaceTokenizer
import string
from nltk.corpus import stopwords
import re
import time
from guess_language import guess_language
import time
import numpy as np

# Pre-compiled pattern matching one or more consecutive word characters (\w).
# NOTE(review): no live code in the visible cells calls this - confirm before removing.
WORD = re.compile(r'\w+')


#These functions clean the data in various ways
class Cleaner():
    """Text-cleaning helpers for feed summaries.

    The pipeline used by preprocess_documents is: clean (lower-case, unescape
    HTML entities, drop dashes, strip tags, keep only a-z and spaces),
    tokenize with NLTK's WhitespaceTokenizer, then drop English stop words.
    """

    # Compiled once; matches the shortest '<...>' run on a single line.
    _TAG_PATTERN = re.compile('<.*?>')
    # Characters that survive clean(): lowercase ASCII letters and the space.
    _ALLOWED_CHARS = frozenset(string.ascii_lowercase + ' ')
    # Filled on first use by _english_stop_words(); None means "not loaded yet".
    _stop_word_cache = None

    def __init__(self):
        self.wp = WhitespaceTokenizer()

    @classmethod
    def remove_html_tags(cls,text):
        """Return text with every '<...>' span removed."""
        return cls._TAG_PATTERN.sub('', text)

    @classmethod
    def replace_dash(cls,text):
        """Return text with every '-' deleted (so 'co-host' becomes 'cohost')."""
        return text.replace('-','')

    #This removes non-alphabetical characters and makes everything lower case
    @classmethod
    def clean(cls,text):
        """Normalise text to lowercase ASCII letters and spaces only.

        Entities are unescaped before tags are stripped, so escaped markup
        such as '&lt;b&gt;' is removed as a tag as well.
        """
        lowered = html.unescape(text.lower())
        stripped = cls.remove_html_tags(cls.replace_dash(lowered))
        allowed = cls._ALLOWED_CHARS
        return ''.join(c for c in stripped if c in allowed)

    def tokenize(self,text):
        """Split text into tokens using the instance's WhitespaceTokenizer."""
        return self.wp.tokenize(text)

    @classmethod
    def _english_stop_words(cls):
        """Return the English stop words as a frozenset, loading them only once."""
        if cls._stop_word_cache is None:
            cls._stop_word_cache = frozenset(stopwords.words('english'))
        return cls._stop_word_cache

    #this removes stopword tokens from a list of tokens
    @classmethod
    def remove_stop_words(cls,tokens):
        """Return the tokens that are not English stop words, order preserved.

        The stop-word list is fetched once and held in a set; looking it up
        inside the comprehension re-read it for every single token.
        """
        stop_words = cls._english_stop_words()
        return [word for word in tokens if word not in stop_words]

    #this will clean & tokenize a list of documents.
    def preprocess_documents(self,summaries):
        """Clean, tokenize and stop-word-filter each document.

        Returns one token list per entry of summaries, in the same order.
        """
        return [self.remove_stop_words(self.tokenize(self.clean(s))) for s in summaries]
            
class NoMoreFeedError(Exception):
    """Raised by FeedCrawler.getFeed once its counter has moved past the podcast rows."""

class FeedCrawler():
    """Downloads podcast RSS feeds in batches, extracts entry text and pickles it.

    Parameters
    ----------
    podcast_df : DataFrame with a 'feedUrl' column; rows are visited in
        positional order.
    foutloc : str prefix prepended to every output file name (the notebook
        passes a directory path ending in '/').
    identifier : str, optional. Tag embedded in the batch file names. When
        omitted, the current Unix time (whole seconds) at construction is used.

    Attributes
    ----------
    counter : position of the next row of podcast_df to fetch.
    feeds : the current batch; (url, parsed_feed) tuples after getFeed,
        (url, [entry_text, ...]) tuples after parseFeeds.
    filecounter : number of batch files written so far.
    state : 'raw' or 'parsed', describing what feeds currently holds.
    """

    def __init__(self,podcast_df,foutloc,identifier=None):
        self.foutloc = foutloc
        self.podcast_df = podcast_df
        self.counter = 0
        self.feeds = []
        self.feedctr = []
        # Nothing has been parsed yet; same value _resetFeeds uses.
        self.state = 'raw'
        self.cleaner = Cleaner()
        self.filecounter = 0
        # Computed per instance: a default argument would be evaluated only
        # once, when the class is defined, and shared by every crawler.
        if identifier is None:
            identifier = str(int(time.time()))
        self.identifier = identifier

    def getNcasts(self):
        """Return the number of rows in podcast_df."""
        return self.podcast_df.shape[0]

    #gets the next n feeds in podcast_df
    def getFeed(self,n):
        """Fetch up to n more feeds and append them to self.feeds.

        If fewer than n rows remain, the remaining ones are fetched and the
        call returns normally, so the caller can still parse and save that
        final partial batch.

        Raises
        ------
        NoMoreFeedError
            If n > 0 and every row has already been fetched.
        """
        if n <= 0:
            return
        total = self.podcast_df.shape[0]
        if self.counter >= total:
            raise NoMoreFeedError
        urls = self.podcast_df['feedUrl']
        stop = min(self.counter + n, total)
        while self.counter < stop:
            self.feeds.append(self._feed_request(urls.iloc[self.counter]))
            self.counter += 1

    def parseFeeds(self):
        """Replace each raw feed in self.feeds by (url, [entry content text, ...]).

        A feed is dropped entirely when its download failed (parsed value is
        None), when it or any of its entries lacks the expected keys, or when
        an entry has an empty 'content' list.
        """
        parsed_feeds = []
        for url, feed in self.feeds:
            if feed is None:
                # _feed_request already reported the failure.
                continue
            try:
                texts = [entry['content'][0]['value'] for entry in feed['entries']]
            except (KeyError, IndexError):
                continue
            parsed_feeds.append((url, texts))
        self.feeds = parsed_feeds
        self.state = 'parsed'
        print('Parsed!')

    def cleanFeeds(self):
        """Keep only the feeds whose first entry is guessed to be English.

        Feeds without entries are dropped. The entry texts themselves are
        left as they are; no tokenizing or stop-word removal happens here.
        """
        english_feeds = []
        for f in self.feeds:
            if len(f[1]) == 0:
                continue
            # Language is judged from the first entry only, with tags removed.
            lang = guess_language(self.cleaner.remove_html_tags(f[1][0]))
            if lang == 'en':
                english_feeds.append(f)
        self.feeds = english_feeds
        print('Clean!')

    def _saveFeeds(self):
        """Pickle the current batch to its own numbered file under foutloc."""
        print('Saving %d feeds...' % len(self.feeds))
        fname = self.foutloc+'feeds_'+str(self.filecounter)+'_'+str(self.identifier)+'.pkl'
        with open(fname,'wb') as fid:
            pickle.dump(self.feeds,fid)
        self.filecounter += 1

    def _saveProgress(self):
        """Pickle [identifier, filecounter, counter] so a later run can resume."""
        with open(self.foutloc+'progress.pkl','wb') as fid:
            pickle.dump([self.identifier,self.filecounter,self.counter],fid)

    def save(self):
        """Write the current batch and the progress file, then empty the batch."""
        self._saveFeeds()
        self._saveProgress()
        self._resetFeeds()

    def loadcounters(self):
        """Restore identifier, filecounter and counter from the progress file.

        Raises whatever open() or pickle.load() raise when the file is
        missing or unreadable.
        """
        # NOTE: pickle.load can execute arbitrary code; only point foutloc at
        # a location whose progress.pkl was written by this class.
        with open(self.foutloc+'progress.pkl','rb') as fid:
            progress = pickle.load(fid)
        self.identifier = progress[0]
        self.filecounter = progress[1]
        self.counter = progress[2]

    def _resetFeeds(self):
        """Empty the batch and mark the crawler as holding raw data again."""
        self.feeds = []
        self.state = 'raw'

    def resetCounter(self):
        """Restart crawling from the first row."""
        self.counter = 0

    @classmethod
    def _feed_request(cls,url):
        """Download and parse one feed; return (url, parsed_feed).

        Best effort: on any error the failure is printed and (url, None) is
        returned instead of raising.
        """
        try:
            return (url,fp.parse(url))
        except Exception:
            # Narrower than a bare except, so Ctrl-C still stops a long crawl.
            print('Error on ' + str(url))
            return (url,None)
        
    


In [11]:
#associate summaries w/ row in podcast_df
#load in podcast_df
import pickle
import pandas as pd


#load in itunes_request_db
floc = '/home/bmassi/Dropbox/professional/Insight/data/'
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
with open(floc+'raw_itunes_requests1528609065.pkl','rb') as fid:
    raw_itunes_requests = pickle.load(fid)

#turn everything into a pandas dataframe, keeping only the results whose 'kind' is 'podcast'
#(.get is used so a result without a 'kind' key is skipped instead of raising KeyError)
formatted_results = [
    p
    for rir in raw_itunes_requests
    for p in rir.json()['results']
    if p.get('kind') == 'podcast'
]

podcast_df = pd.DataFrame(formatted_results)
#drop rows without a feed URL; .copy() makes the result an independent frame so that
#later column assignments do not trigger SettingWithCopyWarning
podcast_df = podcast_df.loc[podcast_df['feedUrl'].notnull()].copy()

In [12]:
#filter out podcasts that weren't updated recently (look at fraction of remaining dataset as a function of cutoff)
from datetime import datetime,timedelta

#parameters for analysis
target_cutoff = 45#filter out podcasts that weren't updated for this many days
comparator_day = datetime(2018,6,11,14,5,23,424906)#This was "today" on Monday 6-11-18

#oldest release date that still counts as "recently updated"
oldest_allowed = comparator_day - timedelta(days=target_cutoff)

#assign() builds a new frame instead of writing a column into a filtered slice,
#which avoids pandas' SettingWithCopyWarning
# NOTE(review): comparator_day is timezone-naive; if the releaseDate strings carry a
# UTC offset, newer pandas parses them as tz-aware and this comparison needs adjusting - confirm.
podcast_df = podcast_df.assign(releaseDate=pd.to_datetime(podcast_df['releaseDate']))
podcast_df = podcast_df.loc[podcast_df['releaseDate'] > oldest_allowed]

print(len(podcast_df))

137785


In [None]:
#Crawl & scrape feeds
import numpy as np
import time
import socket
import feedparser as fp
import requests

MAX_REQUEST_DURATION = 10 #seconds
#process-wide default timeout for newly created sockets
socket.setdefaulttimeout(MAX_REQUEST_DURATION)

step_size = 500#number of feeds to get/save at once
foutloc = '/home/bmassi/Dropbox/professional/Insight/data/preprocessed_summaries2/'
crawler = FeedCrawler(podcast_df,foutloc)

#resume from a previous run if a progress file exists, otherwise start from the first feed
try:
    crawler.loadcounters()
    print('initialized! counter = %d,filecounter = %d,id = %s' %
          (crawler.counter,crawler.filecounter,crawler.identifier))
except Exception as err:
    #still best effort, but the reason is shown and Ctrl-C is no longer swallowed
    print("could not initialize (%r)" % (err,))

crawling = True
while crawling:
    start_time = time.time()
    try:
        crawler.getFeed(step_size)
        crawler.parseFeeds()
        crawler.cleanFeeds()
        crawler.save()
    except NoMoreFeedError:
        crawling = False
        #feeds fetched before the crawler ran out would otherwise be thrown away unsaved
        if crawler.feeds:
            crawler.parseFeeds()
            crawler.cleanFeeds()
            crawler.save()
        print("Job's done!")
    stop_time = time.time()
    duration = stop_time - start_time
    print('Counter=%d/%d (duration=%.2f, step=%d)' % (crawler.counter,crawler.getNcasts(),duration,step_size))

initialized! counter = 500,filecounter = 1,id = 1529184593.957413
Parsed!
Clean!
Saving 340 feeds...
Counter=1000/137785 (duration=345.92, step=500)
Parsed!
Clean!
Saving 365 feeds...
Counter=1500/137785 (duration=353.64, step=500)
Parsed!
Clean!
Saving 322 feeds...
Counter=2000/137785 (duration=392.31, step=500)
Parsed!
Clean!
Saving 348 feeds...
Counter=2500/137785 (duration=369.92, step=500)
Parsed!
Clean!
Saving 306 feeds...
Counter=3000/137785 (duration=399.55, step=500)
Parsed!
Clean!
Saving 291 feeds...
Counter=3500/137785 (duration=348.34, step=500)
Parsed!
Clean!
Saving 303 feeds...
Counter=4000/137785 (duration=383.37, step=500)
Parsed!
Clean!
Saving 328 feeds...
Counter=4500/137785 (duration=403.08, step=500)
Parsed!
Clean!
Saving 328 feeds...
Counter=5000/137785 (duration=387.65, step=500)
Parsed!
Clean!
Saving 349 feeds...
Counter=5500/137785 (duration=326.87, step=500)
Parsed!
Clean!
Saving 259 feeds...
Counter=6000/137785 (duration=416.46, step=500)
Parsed!
Clean!
Saving

In [63]:
#Clean feeds


120

In [3]:
import pickle

#reload the first saved batch of feeds for inspection
floc = '/home/bmassi/Dropbox/professional/Insight/data/preprocessed_summaries2/'
first_batch_path = floc+'feeds_0_1529184593.957413.pkl'
with open(first_batch_path,'rb') as fid:
    prog = pickle.load(fid)


2

306