# Data Collection of Website Metrics with AWIS

Amazon Web Information Service (AWIS) Data collection script. Calls AWIS Api to retrieve website data for 205 domains, preprocess data and format for Database insertion.

## Dependencies

In [1]:
from myawis import *
from xmljson import BadgerFish
from xml.etree.ElementTree import fromstring
import pandas as pd
from decimal import Decimal
import json, re

## File Paths

In [2]:
bloggerDomains = '.\Bloggers domain WIP.csv'

## Read in AWIS credentials

In [3]:
awsAccessKeyId = ''
awsSecretAccessKey = '' 

f=open(".awis.py.credentials", "r")
if f.mode == 'r':
    creds = f.read()
    creds = creds.split('\n')
    awsAccessKeyId = creds[0].split(' = ')[1]
    awsSecretAccessKey = creds[1].split(' = ')[1]
    
obj = CallAwis(awsAccessKeyId, awsSecretAccessKey)

## Read in list of blogs urls for scrapping

In [4]:
# Read in domain names
allDomainNames = list(pd.read_csv(bloggerDomains).domain_name)
allDomainNames

['2bearbear.com',
 '2cameras1bucketlist.com',
 'adventureswithnienie.com',
 'adventurouskate.com',
 'aggylow.com',
 'agirlabroad.com',
 'agirlandabaldtraveller.com',
 'alexischeong.com',
 'aliadventures.com',
 'alittleadrift.com',
 'alvinology.com',
 'amandacastleman.com',
 'amandalingg.com',
 'amatteroftaste.me',
 'aroundouretable.com',
 'aroundtheworldinkatydays.com',
 'aroundtheworldl.com',
 'aspirantsg.com',
 'backpackingdetourstravel-blog.com',
 'backpackwonders.com',
 'belaroundtheworld.com',
 'bemytravelmuse.com',
 'bernardthetraveller.com',
 'betsiworld.com',
 'beverlyville.blogspot.com',
 'beyondelittlereddot.com',
 'bigworldsmallpockets.com',
 'blog.dianajlee.com',
 'bpdgtravels.blogspot.com',
 'breathedreamgo.com',
 'btravelsbetter.com',
 'bykido.com',
 'camemberu.com',
 'cassidysadventures.com',
 'cavinteo.blogspot.com',
 'cheekiemonkie.net',
 'createwithmom.com',
 'culturestamps.com',
 'danflyingsolo.com',
 'danielleinwanderland.com',
 'dereklow.co',
 'detourista.com',
 'd

In [5]:
# # Create AWIS request object
# url = 'www.loveandroad.com'
# obj = CallAwis(awsAccessKeyId, awsSecretAccessKey)
# urlinfo = obj.urlinfo(url)

# # Convert response of BeautifulSoup object from AWIS to dict
# bf = BadgerFish(dict_type=dict) 
# awisResultsDict = bf.data(fromstring(str(urlinfo)))

# # Clean response dict
# awisResultsCleaned = eval(re.sub(r'{http://awis.amazonaws.com(.+?)}', '', str(awisResultsDict)))
# awisResultsCleaned = awisResultsCleaned['UrlInfoResponse']['Response']['UrlInfoResult']['Alexa']

# awisResultsCleaned

In [6]:
from collections import MutableMapping

# Function to delete specific key from dictionary
def deleteKeysFromDict(dictionary, removeKey):

    try:
        modified_dict = {}
        for key, value in dictionary.items():
            key = key[0].lower() + key[1:]
            if type(value) == dict and removeKey in value.keys():
                value = value.get(removeKey)
            if isinstance(value, MutableMapping):
                modified_dict[key] = deleteKeysFromDict(value, removeKey)
            else:
                modified_dict[key] = value
    except Exception as e:
        print(e)
        modified_dict = {}
    finally:
        return modified_dict

  """Entry point for launching an IPython kernel.


In [7]:
def getSpeedAndUsageData(url):
    # Create AWIS request object    
    urlinfo = obj.urlinfo(url)

    # Convert response of BeautifulSoup object from AWIS to dict
    bf = BadgerFish(dict_type=dict) 
    awisResultsDict = bf.data(fromstring(str(urlinfo)))
    
    # Clean response dict
    awisResultsCleaned = eval(re.sub(r'{http://awis.amazonaws.com(.+?)}', '', str(awisResultsDict)))
    awisResultsCleaned = awisResultsCleaned['UrlInfoResponse']['Response']['UrlInfoResult']['Alexa']
    return awisResultsCleaned

In [8]:
def getTrafficHistory(url, dateRange = 31, startDate = '20190801'):
    trafficHistory = obj.traffichistory(url, myrange=dateRange, start=startDate)
    # print(trafficHistory)

    # Convert response of BeautifulSoup object from AWIS to dict
    bf = BadgerFish(dict_type=dict) 
    awisTrafficHistoryDict = bf.data(fromstring(str(trafficHistory)))

    # Clean response dict
    trafficHistoryCleanedDict = eval(re.sub(r'{http://awis.amazonaws.com(.+?)}', '', str(awisTrafficHistoryDict)))
    trafficHistoryCleanedDict = trafficHistoryCleanedDict['TrafficHistoryResponse']['Response']['TrafficHistoryResult']['Alexa']    
    return trafficHistoryCleanedDict


In [9]:
def formatAwisResult(awisSpeedMetricsDict, awisUsageDataDict, awisTrafficHistoryDict):
    newContentDataDict, newUsageDataDict, newTrafficHistoryDataDict = {}, {}, {}
    
    # Formatting of Speed Metrics
#     print("AWIS Speed Data")
    if awisSpeedMetricsDict:
        contentDataDict = deleteKeysFromDict(awisSpeedMetricsDict, '$')
        newContentDataDict = {'contentData': {'speed': contentDataDict}}
#         print(newContentDataDict)
#         print('\n')

    # Formatting of Usage Data    
#     print('AWIS Usage Data')
    newAwisUsageDataList = []
    if awisUsageDataDict:
        for awisUsageData in awisUsageDataDict['UsageStatistic']:
            tempDict = deleteKeysFromDict(awisUsageData, '$')
            if tempDict:
                timeRange = tempDict.get('timeRange')
                tempDict['timeRange'] = {'Period': list(timeRange.keys())[0], 'Count': list(timeRange.values())[0]}
                newAwisUsageDataList.append(tempDict)
        newUsageDataDict = {"usageData": newAwisUsageDataList}
#         print(newUsageDataDict)
#         print('\n')
    
    # Retrieving site
    site = awisTrafficHistoryDict['TrafficHistory']['Site'].get('$')
    
    # Formatting of Traffic Data
#     print('AWIS Traffic Data')
#     print(awisTrafficHistoryDict)
    newTrafficHistoryDataList = []
    awisTrafficHistoryDict = awisTrafficHistoryDict['TrafficHistory']['HistoricalData']
    if awisTrafficHistoryDict and type(awisTrafficHistoryDict['Data']) == list:
        for awisTrafficHistoryData in awisTrafficHistoryDict['Data']:
            newTrafficHistoryDataList.append(deleteKeysFromDict(awisTrafficHistoryData, '$'))
    elif awisTrafficHistoryDict:
        newTrafficHistoryDataList.append(deleteKeysFromDict(awisTrafficHistoryDict.get('Data'), '$'))
    newTrafficHistoryDataDict = {'historicalData': newTrafficHistoryDataList}
#         print(newTrafficHistoryDataDict)
#         print('\n')

    finalDict = {'blogUrl': site, 'blogTraffic': {}}
    finalDict.get('blogTraffic').update(newTrafficHistoryDataDict)
    finalDict.get('blogTraffic').update(newUsageDataDict)
    finalDict.get('blogTraffic').update(newContentDataDict)
    
    return finalDict
# formatAwisResult(awisSpeedMetricsDict, awisUsageDataDict, trafficHistoryCleanedDict)

## Retrieve data from AWIS

In [10]:
allDomainNames.index('travelbooksfood.com')

184

In [43]:
getTrafficHistory('2bearbear.com', 31, '20191101')

{'Request': {'Arguments': {'Argument': [{'Name': {'$': 'start'},
     'Value': {'$': 20191101}},
    {'Name': {'$': 'range'}, 'Value': {'$': 31}},
    {'Name': {'$': 'url'}, 'Value': {'$': '2bearbear.com'}},
    {'Name': {'$': 'responsegroup'}, 'Value': {'$': 'History'}}]}},
 'TrafficHistory': {'Range': {'$': 3},
  'Site': {'$': '2bearbear.com'},
  'Start': {'$': '2019-11-01'},
  'HistoricalData': {}}}

In [26]:
blogs = []
# allDomainNames

In [47]:
import time

startIdx = allDomainNames.index(blogs[-1].get('blogUrl')) if blogs else 0

for url in allDomainNames[startIdx+1:]:
    print(url)
    awisSpeedAndUsageCleaned = getSpeedAndUsageData(url)
    awisTrafficHistoryCleaned = getTrafficHistory(url, 31, '20191101')
    
    awisSpeedMetricsDict = awisSpeedAndUsageCleaned['ContentData']['Speed']
    awisUsageDataDict = awisSpeedAndUsageCleaned['TrafficData']['UsageStatistics']
    blogs.append(formatAwisResult(awisSpeedMetricsDict, awisUsageDataDict, awisTrafficHistoryCleaned))
    time.sleep(2)

littlegreybox.net
livelaughtravel.net
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
livingthedreamrtw.com
localadventurer.com
loveandroad.com
loveyloi.com
lydiascapes.com
lyfandspice.com
marocmama.com
mccooltravel.com
melissackoh.com
milelion.com
missingavenue.com
misstamchiak.com
misstourist.com
missuschewy.com
mitsueki.sg
mongabong.com
ms-skinnyfat.com
myturntotravel.com
nerdnomads.com
nomadicmatt.com
nomadventura.com
ohtheplacesiwillgoto.com
oo-foodielicious.com
ourlittlesmarties.com
ourtravelitinerary.com
outofyourcomfortzone.net
paigemindsthegap.com
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
pamgoestravelling.com
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
'str' object has no attribute 'items'
paperin

In [48]:
len(blogs)

219

In [49]:
# for url in allDomainNames[100:]:
#     print(url)
#     awisSpeedAndUsageCleaned = getSpeedAndUsageData(url)
#     awisTrafficHistoryCleaned = getTrafficHistory(url, 31, '20190801')
    
#     awisSpeedMetricsDict = awisSpeedAndUsageCleaned['ContentData']['Speed']
#     awisUsageDataDict = awisSpeedAndUsageCleaned['TrafficData']['UsageStatistics']
#     blogs.append(formatAwisResult(awisSpeedMetricsDict, awisUsageDataDict, awisTrafficHistoryCleaned))
#     time.sleep(2)
    
blogsAwisMetricsDict = {'blogs': blogs}

In [50]:
with open('finalBlogsAWISMetricsDbInsertion.json', 'w') as outfile:
    json.dump(blogsAwisMetricsDict, outfile)

#### Resources:
1. Converting BeautifulSoup Object to dict
    - https://pypi.org/project/xmljson/