# Scripted Primo Search Assessment Tool

Description: < Describe the purpose of the SPSAT>

## Load required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import glob
import os
import codecs
import json
import time
from urllib.request import Request, urlopen
from urllib.parse import urlencode, quote_plus
import pymarc
import marcx
import io

## Define Some Variables

In [2]:
## URLs ##

# prod_url = <base url>
prod_url_base = 'http://bu-primo.hosted.exlibrisgroup.com:1701/PrimoWebServices/xservice/search/brief'
# stage_url = <base url>
stage_url_base = 'http://bu-primostage.hosted.exlibrisgroup.com:1701/PrimoWebServices/xservice/search/brief'

## query params ##
query_Params1 = '?institution=BOSU&query=any,contains,'
query_Params2 = '&indx=1&bulkSize=25'
query_Params3 = '&loc=local,scope,BOSU&loc=adaptor,primo_central_multiple_fe&onCampus=true&json=true'
stage = True
#production = True
#num_results = 10
#scope = 

## file locations ## 

# move to working directory, this is the path where we want files to reside
os.chdir('/Volumes/jwa_drive1/git/spst')



## Get Search Strings

In [3]:
### if loading search strings from a csv file
#### open input file, read each line as a separate string, and remove the new line character
# Input file (contains search strings)
f = '/Volumes/jwa_drive1/git/spst/search_strings.csv'
search_strings = open(f).read().splitlines()

### if building search strings from a marcxml file
# Input file (contains search strings)
f = '/Volumes/jwa_drive1/git/spst/BIBLIOGRAPHIC_18318707070001161_1.xml'
records = pymarc.parse_xml_to_array(io.open(f,mode='r',encoding='utf-8'))
df_search = pd.DataFrame()

for rec in records:
    d = {}
    rec = marcx.FatRecord.from_record(rec)
    try:
        d['author'] = rec['100']['a']
    except Exception as e:
        d['author'] = ''
    d['title'] = rec.title().replace('/','')
    d['mmsid'] = rec['001'].data
    d['title_author'] = rec.title().replace('/','') + d['author'] #rec.author()
    df_search = df_search.append(d,ignore_index=True)
title_search_strings = df_search['title']
title_author_search_strings = df_search['title_author']

In [4]:
#select the list of search strings
#search_strings = title_search_strings
search_strings = search_strings


## Define functions

In [5]:
# build_url 
    # takes base_url and search_string, returns url properly formated for searching
def build_url(base_url,search_string):
    url = base_url+query_Params1 + search_string.replace(' ','+') + query_Params2 + query_Params3
    return(url)
    
# parse_response 
    # takes response (as json) from search, returns response_dict (dict of our responses)
def parse_response(search,response,platform,search_no):
    response_dict = {}
    # define the elements to grab from the json response. These become the columns in the dataframe
    elements = []
    #elements.append('Platform')
    #elements.append('Search')
    #elements.append('ResultNumber')
    #elements.append('TotalHits')
    #elements.append('Rank')
    #elements.append('SearchEngine')
    elements.append('control.recordid')
    elements.append('display.type')
    elements.append('display.creator')
    elements.append('display.title')
    elements.append('display.ispartof')
    elements.append('delivery.delcategory')
    elements.append('delivery.fulltext')
    elements.append('search.searchscope')
    elements.append('search.general')
    elements.append('facets.collection')
    elements.append('facets.frbrtype')
    elements.append('facets.toplevel')
    elements.append('facets.prefilter')
    elements.append('sort')
    # parse the response to return the data desired elements
    response = json.loads(response.decode('utf8'))
    # need to extract the desired elements and add them to a dict
    response = response['SEGMENTS']
    response = response['JAGROOT']
    response = response['RESULT']
    response_dict['facet'] = response['FACETLIST']
    response_dict['docset'] = response['DOCSET']
    docs = response_dict['docset']['DOC']
    facets = response_dict['facet']
        
    ## define a dataframe to return the results
    doc_columns = ['Platform','Search_No','Search','ResultNumber','TotalHits','FirstHit','LastHit','Rank','SearchEngine','recordid', \
           'type','creator','title','ispartof','delcategory','fulltext','searchscope','general','collection', \
          'frbrtype','toplevel','prefilter','sort']
    df_doc = pd.DataFrame(columns = doc_columns)
    fac_columns = ['Platform','Search_No','Search','TotalHits','SearchEngine','facet','values']
    df_facets = pd.DataFrame(columns = fac_columns)
    #bib = doc['PrimoNMBib']['record']
    #print(type(bib)
    ## iterate through the results (normally 10) to populate a dictionary that will be added to the dataframe
    for doc in docs:
        doc_dict = {}
        doc_dict['Platform'] = platform
        doc_dict['Search_No'] = search_no
        doc_dict['Search'] = search
        doc_dict['ResultNumber'] = doc['@NO']
        doc_dict['TotalHits'] = response['DOCSET']['@TOTALHITS']
        doc_dict['FirstHit'] = response['DOCSET']['@FIRSTHIT']
        doc_dict['LastHit'] = response['DOCSET']['@LASTHIT']
        doc_dict['Rank'] = doc['@RANK']
        doc_dict['SearchEngine'] = doc['@SEARCH_ENGINE']
        #doc_dict['Bib'] = doc['PrimoNMBib']['record']
        Bib = doc['PrimoNMBib']['record']
        
        for element in elements:
            x = element.split('.')
            if len(x) == 1:
                try:
                    doc_dict[x[-1]] = Bib[x[0]]
                except:
                    pass
            if len(x) == 2:
                try:
                    doc_dict[x[-1]] = Bib[x[0]][x[1]]
                except:
                    pass
            if len(x) == 3:
                try:
                    doc_dict[x[-1]] = Bib[x[0]][x[1]][x[2]]
                except:
                    pass
            if len(x) == 4:
                try:
                    doc_dict[x[-1]] = Bib[x[0]][x[1]][x[2]][x[3]]
                except:
                    pass
            if len(x) == 5:
                try:
                    doc_dict[x[-1]] = Bib[x[0]][x[1]][x[2]][x[3]][x[4]]
                except:
                    pass   
        df_doc = df_doc.append(doc_dict, ignore_index=True)
        f_dict = parse_facets(facets['FACET'])
        for k,v in f_dict.items() :
            if doc['@NO'] == '1':
                facet_dict = {}
                facet_dict['Platform'] = platform
                facet_dict['Search_No'] = search_no
                facet_dict['Search'] = search
                #facet_dict['ResultNumber'] = doc['@NO']
                facet_dict['TotalHits'] = response['DOCSET']['@TOTALHITS']
                facet_dict['SearchEngine'] = doc['@SEARCH_ENGINE']
                facet_dict['facet'] = k
                facet_dict['values'] = v
                df_facets = df_facets.append(facet_dict,ignore_index=True)

    return(df_doc,Bib,df_facets)


def parse_facets(facets):
    return_dict = {}
    for facet in facets:
        f = facet['@NAME']
        fv = facet['FACET_VALUES']
        #print(f)
        l = []
        for x in fv :
            try:
                l.append((x['@KEY'],int(x['@VALUE'])))
            except Exception as e:
                pass
        sorted_by_second = sorted(l, key=lambda tup: tup[1],reverse=True)
        return_dict[f] = sorted_by_second
    return return_dict

In [6]:

# create a DataFrame. Expand the columns to match the desired elements
# to be returned by the function 'parse_response'

# Create a dataframe with
# - column for search string 
# - column for response (returned from parse_response fuction)
columns = ['Platform','Search','ResultNumber','TotalHits','FirstHit','LastHit','Rank','SearchEngine','recordid', \
           'type','creator','title','ispartof','delcategory','fulltext','searchscope','general','collection', \
          'frbrtype','toplevel','prefilter','sort']
df = pd.DataFrame(columns = columns)
fac_columns = ['Platform','Search_No','Search','TotalHits','SearchEngine','facet','values']
df_f = pd.DataFrame(columns = fac_columns)

# Loop through search_strings, execute search, ... [to be done?]
counter = 0
print(len(search_strings))
for search_string in search_strings:
    time.sleep(1)
    print(counter)
    prod_search_url = build_url(prod_url_base,search_string)
    prod_request = Request(prod_search_url)
    stage_search_url = build_url(stage_url_base,search_string)
    stage_request = Request(stage_search_url)    
    try:
        response_body = urlopen(prod_request).read() # reads request into response_body var
        prod = response_body
        d = parse_response(search_string,response_body,'production',counter)[0]
        b = parse_response(search_string,response_body,'production',counter)[1]
        f = parse_response(search_string,response_body,'production',counter)[2]
        df = df.append(d, ignore_index=True)
        df_f = df_f.append(f,ignore_index=True)
        response_body = urlopen(stage_request).read() # reads request into response_body var
        stage = response_body
        d = parse_response(search_string,response_body,'stage',counter)[0]
        b = parse_response(search_string,response_body,'stage',counter)[1]
        f = parse_response(search_string,response_body,'stage',counter)[2]
        df = df.append(d, ignore_index=True)
        df_f = df_f.append(f,ignore_index=True)

    #prints errors 
    except Exception as e:
        pass
        print('Exception: ',e)
    counter += 1

10
0
1
2
3
4
5
6
7
8
9


In [7]:
search1 = df
search1 = search1.ix[:,'Platform':]
search1 = search1.sort(columns=['Search_No','ResultNumber'], ascending=[1,1])

In [8]:
## here we are going to combine values from the search on production and stage to compare the results for each search
columns = ['Search','match','pRecordid','sRecordid','pRank','sRank','pCollection','sCollection', 
                        'pCreator','sCreator', 'pDelcategory','sDelcategory','pFrbrtype','sFrbrtype', 
                        'pFulltext','sFulltext', 'pTitle','sTitle', 'pToplevel','sToplevel', 'pType','sType']
comp_results = pd.DataFrame(columns = columns)
mask = search1['Platform'] == 'production'
production = search1[mask] 
mask = search1['Platform'] == 'stage'
stage = search1[mask]
stage= stage[['Rank', 'ResultNumber', 'Search', 'SearchEngine',
       'Search_No', 'TotalHits', 'collection', 'creator', 'delcategory',
       'frbrtype', 'fulltext', 'general', 'ispartof', 'prefilter', 'recordid',
       'searchscope', 'sort', 'title', 'toplevel', 'type']]
production = production[['Rank', 'ResultNumber', 'Search', 'SearchEngine',
       'Search_No', 'TotalHits', 'collection', 'creator', 'delcategory',
       'frbrtype', 'fulltext', 'general', 'ispartof', 'prefilter', 'recordid',
       'searchscope', 'sort', 'title', 'toplevel', 'type']]
stage= stage.set_index(['Search_No', 'ResultNumber'])
production = production.set_index(['Search_No', 'ResultNumber'])
production = production[['recordid','Rank', 'Search','collection', 'creator','delcategory', 'frbrtype', 'fulltext','title', 'toplevel','type']]
stage = stage[['recordid','Rank', 'Search','collection', 'creator','delcategory', 'frbrtype', 'fulltext','title', 'toplevel','type']]
production.columns = ['pRecordid','pRank', 'Search', 'pCollection', 'pCreator', 'pDelcategory', 'pFrbrtype',
       'pFulltext', 'pTitle', 'pToplevel', 'pType']
stage.columns = ['sRecordid','sRank', 'Search', 'sCollection', 'sCreator', 'sDelcategory', 'sFrbrtype',
       'sFulltext', 'sTitle', 'sToplevel', 'sType']

In [9]:
if (stage.shape == production.shape):
    rows = set(range(0,stage.shape[0]))
    cols = set(range(0,stage.shape[1]))
    scolumns = stage.columns
    pcolumns = production.columns
    for row in rows:
        d = {}
        for col in cols:
            if scolumns[col] == 'sRecordid':
                if stage.iloc[row][scolumns[col]] == production.iloc[row][pcolumns[col]]:
                    d['match'] = True
                else:
                    d['match'] = False
            d[scolumns[col]] = stage.iloc[row][scolumns[col]]
            d[pcolumns[col]] = production.iloc[row][pcolumns[col]]
        comp_results = comp_results.append(d,ignore_index=True)


In [10]:
comp_results.head(50)

Unnamed: 0,Search,match,pRecordid,sRecordid,pRank,sRank,pCollection,sCollection,pCreator,sCreator,...,pFrbrtype,sFrbrtype,pFulltext,sFulltext,pTitle,sTitle,pToplevel,sToplevel,pType,sType
0,murder,True,TN_proquest1645937272,TN_proquest1645937272,0.07,0.07,"[ProQuest Education Journals, ProQuest Central...","[ProQuest Education Journals, ProQuest Central...","Erath, Lex","Erath, Lex",...,6,6,fulltext,fulltext,Murder,Murder,,,newspaper_article,newspaper_article
1,murder,False,TN_proquest390127397,TN_proquest284135094,0.020000534,0.020000001,"[Ethnic NewsWatch [Current], Ethnic NewsWatch,...","[ProQuest Research Library, ProQuest Central]",Anonymous,,...,6,6,fulltext,fulltext,MURDER,Murder,,,newspaper_article,newspaper_article
2,murder,False,TN_proquest1301291758,TN_proquest127889412,0.020000001,0.012500141,[Periodicals Archive Online Collection 1 (purc...,"[American Periodicals Series, American Periodi...","Gordon, Mary",,...,5,6,fulltext,fulltext,Murder,MURDER,,,article,article
3,murder,False,TN_proquest127889412,TN_proquest137163675,0.012500141,0.012500141,"[American Periodicals Series, American Periodi...","[American Periodicals Series, American Periodi...",,,...,6,6,fulltext,fulltext,MURDER,Murder,,,article,article
4,murder,False,TN_proquest137163675,TN_proquest91197889,0.012500141,0.01250008,"[American Periodicals Series, American Periodi...","[American Periodicals Series, American Periodi...",,,...,6,6,fulltext,fulltext,Murder,Murder.,,,article,article
5,murder,False,TN_proquest91197889,TN_proquest9630930,0.01250008,0.01250008,"[American Periodicals Series, American Periodi...",[British Periodicals - British Periodicals Col...,,,...,6,6,fulltext,fulltext,Murder.,MURDER.,,,article,article
6,murder,False,TN_proquest9630930,TN_proquest284530185,0.01250008,0.0125,[British Periodicals - British Periodicals Col...,"[ProQuest Research Library, ProQuest Central]",,Anonymous,...,6,6,fulltext,fulltext,MURDER.,Murder,,,article,newspaper_article
7,murder,True,TN_proquest91208961,TN_proquest91208961,0.0125,0.0125,"[American Periodicals Series, American Periodi...","[American Periodicals Series, American Periodi...",,,...,6,6,fulltext,fulltext,MURDER.,MURDER.,,,article,article
8,murder,False,TN_proquest8959620,TN_proquest284513361,0.0125,0.0125,[British Periodicals - British Periodicals Col...,"[ProQuest Research Library, ProQuest Central]",,Anonymous,...,6,6,fulltext,fulltext,MURDER.,Murder,,,article,newspaper_article
9,murder,False,TN_medline14821541,TN_proquest8959620,0.0125,0.0125,MEDLINE/PubMed (NLM),[British Periodicals - British Periodicals Col...,"Simpson, K",,...,5,6,fulltext,fulltext,Murder,MURDER.,peer_reviewed,,article,article


In [None]:
# Define the aggregation calculations
aggregations = {
    'match' : {
        'match' : 'count'
    },
    'sType' : {
        'stype' : 'count'
    },
    'pType' : {
        'pType' : 'count'
        
    },

    'sToplevel' : {
        'pToplevel' : 'count'
        
    },
    'pToplevel' : {
        'pToplevel': 'count'
        
    }
}

In [None]:
by_search = comp_results.groupby(['Search']).agg(aggregations)

In [61]:
rt_ = comp_results[['Search','pType','sType']]
rt_.head(25)

Unnamed: 0,Search,pType,sType
0,murder,newspaper_article,newspaper_article
1,murder,newspaper_article,newspaper_article
2,murder,article,article
3,murder,article,article
4,murder,article,article
5,murder,article,article
6,murder,article,newspaper_article
7,murder,article,article
8,murder,article,newspaper_article
9,murder,article,article


In [59]:
rt_.groupby(pType)

AttributeError: Cannot access callable attribute 'groupby' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [None]:
pd.Grouper()