In [7]:
#These functions clean the data in various ways
import re
import html
from nltk.corpus import stopwords

class Cleaner():
    
    def __init__(self):
        self.wp = WhitespaceTokenizer()
        self.reclean = re.compile('<.*?>')
        
    def remove_html_tags(self,text):
        return re.sub(self.reclean, ' ', text) 
    
    @classmethod
    def replace_newline(cls,text):
        return text.replace('\n',' ')
    
    @classmethod
    def replace_dash(cls,text,on=True):
        if(on):
            return text.replace('-',' ')
        else:
            return text

    #This removes non-alphabetical characters and makes everything lower case
    def clean(self,text,rep_dash=True):
        return ''.join(c for c in self.remove_html_tags(self.replace_dash(self.replace_newline(html.unescape(text.lower())),rep_dash)) 
                       if c in string.ascii_lowercase+' ')
    #this tokenizes intelligently

    def tokenize(self,text):
        #return TreebankWordTokenizer().tokenize(text)
        return self.wp.tokenize(text)
    #     words = WORD.findall(text)
    #     return words
    #this removes stopword tokens from a list of tokens
    def remove_stop_words(self,tokens):
        return [word for word in tokens if word not in stopwords.words('english')]
    
    #this will clean & tokenize a list of documents.
    
    def preprocess_input(self,words,rep_dash=True):
        return self.remove_stop_words(self.tokenize(self.clean(words,rep_dash)))

    def preprocess_documents(self,summaries,rep_dash=True):
        return [self.preprocess_input(s,rep_dash) for s in summaries]

In [33]:
#class for managing interactions between model and podcast database.
import pickle
import time
import pandas as pd
import gensim
import numpy as np
import scipy
from sklearn import decomposition,mixture
import sklearn
from matplotlib import pyplot as plt, rcParams
import matplotlib
rcParams.update({'font.size': 15})
from nltk.tokenize import TreebankWordTokenizer,WhitespaceTokenizer
from nltk.corpus import stopwords
import string
import feedparser as fp
import sqlite3

class PodcastDB:
    #static class variables
    wp = WhitespaceTokenizer()

    #initialize object
    def __init__(self,dbfloc=None,model=None):
        if(dbfloc is not None):
            conn = sqlite3.connect(database=dbfloc)
            self.podcastdbcur = conn.cursor()
            sqlout = self.__querydb('SELECT collectionId,w2v FROM podcasts')
            self.w2vs = [np.array([float(e) for e in x[1].split(',')]) for x in sqlout]
            self.ids = [x[0] for x in sqlout]
            self.npodcast = len(self.ids)
        else:
            raise ValueError('Object constructor must be called with a valid file ID')
            self.podcastdbcur = None
            self.w2vs = None
            self.ids = None
            self.npodcast = 0
            
        if(isinstance(model,gensim.models.keyedvectors.Word2VecKeyedVectors)):
            self.model = model
        else:
            raise ValueError('Object constructor must be called with a valid model')
            self.model = None
            
        self.comparator = scipy.spatial.distance.cosine
        #self.reclean = re.compile('<.*?>')
        self.cleaner = Cleaner()

            
    #primary method. finds podcasts most similar to some word.
    def search(self,word,n_outputs=5):
        
        word = self.cleaner.preprocess_input(word)
        
        #ensures that object is properly initialized
        if((self.podcastdbcur is None) or (self.model is None)):
            raise ClassError('Object not properly initialized.')
            
        if(not word):
            raise ValueError('Input contains no valid words.')
        
        #ADD SQL QUERY HERE
        bestID = [self.ids[id] for id in self.__compare(self._evaluate(word)).argsort()[:n_outputs]]
        return pd.DataFrame(self.__querydb('SELECT collectionName,feedUrl,collectionId FROM podcasts WHERE collectionId in ('
                         +','.join(str(id) for id in bestID) + ');'),columns=['collectionName','feedUrl','id'])
    

    #primary method. finds podcasts most similar to some word.
    def search_episodes(self,word,n_outputs=3,n_episodes=5,n_most_recent=10):
                
        #find the best podcasts, evaluate input
        pc_match = self.search(word,n_outputs)
        u = self._evaluate(self.cleaner.preprocess_input(word,rep_dash=True))
        
        #get the episodes associated with the best podcasts
        #get eps of each matching podcast
        ep_data = [self._get_eps(pc_match.iloc[i]['feedUrl']) for i in range(0,len(pc_match))] 
        #vectorize each episode
        ep_vec = [[self._evaluate(self.cleaner.preprocess_input(eps['entries'][i]['content'][0]['value'])) 
                   for i in range(0,min([n_most_recent,len(eps['entries'])]))] for eps in ep_data]

        #get relevant ep data
        sorted_eps = [np.array([self.comparator(u,v) for v in ev]).argsort()[:n_outputs] for ev in ep_vec]
        
        #return the data for the best eps
        return pc_match, [[e for e in eps['entries'][0:min([len(eps['entries']),n_outputs])]]
                          for eps in ep_data]

    #get the most recent n episodes associated with the best matching podcasts
    def _get_eps(self,url):
        try:
            return fp.parse(url)
        except:
            print('Error on ' + url)
            return (url,None)
    
    #apply internal model to a single word. 
    def _evaluate(self,word):
        if(isinstance(word,list)):
            return self.__evaluate_set(word)
        elif(isinstance(word,str)):
            #attempt to get vectorial representation of word.
            try:
                return self.model[word]
            except KeyError as e:
                return np.full([300,],np.nan)
        else:
            raise TypeError()
            
    #apply the model to a set of words and average them. 
    #this is simply ep2vec from other scripts.
    def __evaluate_set(self,words):
        #evaluate each word in 
        n = 0
        a = []
        for w in words:
            #attempt to evaluate vectorial representation of word.
            try:
                v = self.model[w]
                if((np.isnan(v).any() + np.isinf(v).any()) == 0):
                    a.append(v)
                    n += 1
            except KeyError as e:
                pass
        #if nothing was valid, return nan
        if(n==0):
            return np.full([300,], np.nan)
        #return average
        return np.mean(np.array(a),axis=0)
    
        #compares vector 
    def __compare(self,u):
        
        #return distances between vector and all our podcasts.
        return np.array([self.comparator(u,v) for v in self.w2vs])
    
    def __querydb(self,query):
        self.podcastdbcur.execute(query)
        return self.podcastdbcur.fetchall()
    

In [2]:
#load in podcast df, gensim model, and put it in the database object.
floc = '/home/bmassi/Dropbox/professional/Insight/data/'
modelfname = 'GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.KeyedVectors.load_word2vec_format(floc+modelfname, binary=True)


In [34]:
#Instantiate podcast search object
dbname = 'podcast_database.db'
podcastdb = PodcastDB(dbfloc=floc+dbname,model=word2vec)

In [28]:
#Function for displying results
from newspaper import Article


cleaner = Cleaner()

#Returns text or title of an article from article URL. 
def generateArticleInput(url,title_only=True):
    article = Article(url)
    article.download()
    article.parse()
    if(title_only):
        return article.title
    else:
        return article.text

def formatOutput(output):
    outstr = ''
    for i in range(0,len(output[1])):
        outstr += '\n================================'+output[0].iloc[i]['collectionName']+'================================\n'
        for j in range(0,len(output[1][i])):
            outstr += '================' + output[1][i][j].title + '\n' + cleaner.remove_html_tags(output[1][i][j].summary_detail.value) + '\n'
            
    return outstr

In [35]:
#recommendation pipeline using only title
url = 'http://www.breitbart.com/big-government/2018/06/19/donald-trump-democrats-want-more-illegal-immigrants-as-potential-voters/'

start_time = time.time()
atext = generateArticleInput(url,title_only=True)
output = podcastdb.search_episodes(atext)
#output = podcastdb.search(atext)
stop_time = time.time()
duration = stop_time - start_time
print(duration)

4.633759021759033


In [36]:
print(formatOutput(output))


 &nbsp;The open-borders misinformation machine is firing on all cylinders over the Trump administration&rsquo;s immigration enforcement policies concerning families who illegally cross our borders. Nate&rsquo;s here to bring you the facts. 
 &nbsp;The open-borders misinformation machine is firing on all cylinders over the Trump administration&rsquo;s immigration enforcement policies concerning families who illegally cross our borders. Nate&rsquo;s here to bring you the facts. 
 The pro-deep-state Left and the legacy media want you to believe that the explosive new report detailing rampant anti-Trump bias at the FBI is a big nothingburger. Rep. Jim Jordan is having absolutely NONE of that nonsense. 
 Copyright CRTV. All rights reserved. 
 Kim Jong Un's atrocious human rights abuses often get less attention than American national security concerns about the North Korean regime. Nate Madden speaks with Sen. James Lankford about the importance of fighting for religious freedom in North Ko

In [37]:
#recommendation pipeline using entire body
url = 'http://www.breitbart.com/big-government/2018/06/19/donald-trump-democrats-want-more-illegal-immigrants-as-potential-voters/'

start_time = time.time()
atext = generateArticleInput(url,False)
output = podcastdb.search_episodes(atext)
#output = podcastdb.search(atext)
stop_time = time.time()
duration = stop_time - start_time
print(duration)

6.500269412994385


In [38]:
print(formatOutput(output))


 Washington Post is in big trouble as more than four hundred employees demand boss Jeff Bezos for higher pay and better benefits.  
 Right-Wing blogger Jennifer Ruben claimed that the Inspector General report proved that Hillary Clinton has a good reason to complain. 
 A group of potential Democratic presidential candidates spoke at the We The People summit. Bernie Sanders spoke about universal healthcare and Luis Gutierrez spoke about illegal immigration. 

"Something can start from there, where there was nothing before." 

Donald Trump has brought his special brand of negotiation to the summit with Kim Jong-un and come away with a win despite a concession on US / South Korea war games. What worked, and what would Barack Obama or other predecessors have done differently? His diplomacy was less stellar at the G7 summit in Canada and immediately after, mindlessly attacking allies and pulling for Russia. Also talking Ohio's voter purge, the GOP's split on immigration, and a case study f

In [22]:
podcastdb.podcastdb.columns

Index(['artistId', 'artistName', 'artistViewUrl', 'artworkUrl100',
       'artworkUrl30', 'artworkUrl60', 'artworkUrl600',
       'collectionCensoredName', 'collectionExplicitness', 'collectionHdPrice',
       'collectionId', 'collectionName', 'collectionPrice',
       'collectionViewUrl', 'contentAdvisoryRating', 'country', 'currency',
       'feedUrl', 'genreIds', 'genres', 'kind', 'primaryGenreName',
       'releaseDate', 'trackCensoredName', 'trackCount', 'trackExplicitness',
       'trackHdPrice', 'trackHdRentalPrice', 'trackId', 'trackName',
       'trackPrice', 'trackRentalPrice', 'trackViewUrl', 'wrapperType', 'w2v'],
      dtype='object')

In [1]:
import pickle

floc = '/home/bmassi/Dropbox/professional/Insight/data/'

podcastfname = 'podcast_df_subset_BIGDATA_1529347011.pkl'
with open(floc+podcastfname,'rb') as fid:
    podcastdb = pickle.load(fid)
podcastdb = podcastdb[['artistName','collectionExplicitness','collectionId','collectionName',
                     'collectionViewUrl','feedUrl','genres','primaryGenreName','releaseDate','trackCount','w2v']]

podcastdb = podcastdb.loc[podcastdb['trackCount']>10]

In [2]:
podcastfname = 'podcast_df_subset_BIGDATA_REDUCED.pkl'
with open(floc+podcastfname,'wb') as fid:
    pickle.dump(podcastdb,fid)

In [32]:
#Make a SQL database
import pickle
import pandas as pd
import sqlite3

#Connect to database
floc = '/home/bmassi/Dropbox/professional/Insight/data/'
conn = sqlite3.connect(database=floc+'podcast_database.db')
cur = conn.cursor()

print('loaded')
podcastfname = 'podcast_df_subset_BIGDATA_REDUCED.pkl'


with open(floc+podcastfname,'rb') as fid:
    podcastdf = pickle.load(fid)
    podcastdf = podcastdf[['artistName', 'collectionExplicitness', 'collectionId',
       'collectionName', 'collectionViewUrl', 'feedUrl',
       'primaryGenreName', 'releaseDate', 'trackCount', 'w2v']]
    podcastdf['w2v'] = [','.join(str(e) for e in x) for x in podcastdf['w2v'].get_values()]
    podcastdf.to_sql('podcasts', conn, if_exists='replace', index=False)

loaded


In [47]:
query = '''
SELECT collectionId,w2v FROM podcasts
'''

cur.execute(query)
sqlout = cur.fetchall()
w2v = [np.array([float(e) for e in x[1].split(',')]) for x in sqlout]
ids = [x[0] for x in sqlout]

TypeError: list indices must be integers or slices, not list

(54026, 10)