# Data Collection of Website Metrics with AWIS

## Dependencies

In [2]:
from myawis import *
from xmljson import BadgerFish
from xml.etree.ElementTree import fromstring
import pandas as pd
from decimal import Decimal
import json, re

## File Paths

In [101]:
# Relative path to the CSV of blogger domains. A raw string is used so the
# backslash is kept literally; '\B' is not a valid escape sequence and newer
# Python versions warn about it in a normal string literal.
bloggerDomains = r'.\Bloggers domain WIP.csv'

## Read in AWIS credentials

In [102]:
# Load the AWIS key pair from the local credentials file.
# The parsing below expects the first line to carry the access key id and the
# second line the secret access key, each written as `<name> = <value>`.
awsAccessKeyId = ''
awsSecretAccessKey = ''

# The context manager closes the file even if reading fails.
with open(".awis.py.credentials", "r") as f:
    creds = f.read().split('\n')

awsAccessKeyId = creds[0].split(' = ')[1]
awsSecretAccessKey = creds[1].split(' = ')[1]

# Deliberately do not print the key values: notebook output is saved with
# the file and would leak the credentials to anyone who can read it.
print("AWIS credentials loaded")

[output redacted: AWS access key id]
[output redacted: AWS secret access key]


## Read in list of blog URLs for scraping

In [103]:
# Load the blogger CSV and keep its domain_name column as a plain Python list
allDomainNames = pd.read_csv(bloggerDomains).domain_name.tolist()
allDomainNames

['loveandroad.com',
 'laughtraveleat.com',
 'littlegreybox.net',
 'beyondelittlereddot.com',
 'mitsueki.sg',
 'alexischeong.com',
 'thetravelintern.com',
 'belaroundtheworld.com',
 '2bearbear.com',
 'flyhoneystars.com',
 'pekyj.blogspot.com',
 'wanderingearl.com',
 'hkfoodcrawlers.com',
 'travelinspiration360.com',
 'iwandered.net',
 'wanderlex.com',
 'cheekiemonkie.net',
 'irene-travelogue.com',
 'floraisabelle.com',
 'travelerfolio.com',
 'wildjunket.com',
 'sunriseodyssey.com',
 'havehalalwilltravel.com',
 'thewinterrhapsody.wordpress.com',
 'expertworldtravel.com',
 'scarletscribs.wordpress.com',
 'createwithmom.com',
 'culturestamps.com',
 'shinyvisa.com',
 'myturntotravel.com',
 'justonewayticket.com',
 'thepetitewanderess.com',
 'ramblingfeet.net',
 'terminaltrend.com',
 'amatteroftaste.me',
 'passportchop.com',
 'livelaughtravel.net',
 'detourista.com',
 'faithjoyhope.blogspot.com',
 'pamgoestravelling.com',
 'singaporego.com',
 'thetlist.net',
 'ourtravelitinerary.com',
 'alvi

In [105]:
# Function to flatten dict
def flatten(items):
    """Flatten a nested dict/list structure into parallel name and value lists.

    Dict keys along a path are joined with ' - ' and list positions are
    appended as '[i]', so ``{'a': {'b': [1, 2]}}`` yields the names
    ``['a - b[0]', 'a - b[1]']`` and the values ``[1, 2]``.

    Args:
        items: A dict, list, or scalar. Subclasses of dict and list (for
            example ``collections.OrderedDict``) are traversed as well.

    Returns:
        A ``(namelist, values)`` tuple of two equally long lists: the
        generated name of every leaf and the leaf itself. ``Decimal`` leaves
        are converted with ``int()``, which drops any fractional part.

    Note:
        List elements reached with an empty name (i.e. a list passed as the
        top-level argument) get no '[i]' suffix, so their names stay ''.
    """
    namelist = []
    values = []

    def helper(node, colName=''):
        if isinstance(node, dict):
            # Only add the ' - ' separator once there is a parent name.
            prefix = colName + ' - ' if colName != '' else ''
            for key, child in node.items():
                helper(child, prefix + key)
        elif isinstance(node, list):
            for index, child in enumerate(node):
                childName = colName + '[' + str(index) + ']' if colName != '' else colName
                helper(child, childName)
        else:
            # Leaf value: record it under the name accumulated so far.
            if isinstance(node, Decimal):
                node = int(node)
            namelist.append(colName)
            values.append(node)

    helper(items)
    return namelist, values

In [106]:
# Flatten the cleaned AWIS response and display the generated column names.
# awisResultsCleaned is assigned inside the retrieval loop cell, so that cell
# must have been executed before this one.
namelist, values = flatten(awisResultsCleaned)
namelist

['Request - Arguments - Argument[0] - Name - $',
 'Request - Arguments - Argument[0] - Value - $',
 'Request - Arguments - Argument[1] - Name - $',
 'Request - Arguments - Argument[1] - Value - $',
 'ContactInfo - DataUrl - @type',
 'ContactInfo - DataUrl - $',
 'ContentData - DataUrl - @type',
 'ContentData - DataUrl - $',
 'ContentData - SiteData - Title - $',
 'ContentData - Speed - MedianLoadTime - $',
 'ContentData - Speed - Percentile - $',
 'ContentData - LinksInCount - $',
 'Related - DataUrl - @type',
 'Related - DataUrl - $',
 'TrafficData - DataUrl - @type',
 'TrafficData - DataUrl - $',
 'TrafficData - Rank - $',
 'TrafficData - RankByCountry - Country[0] - @Code',
 'TrafficData - RankByCountry - Country[0] - Rank - $',
 'TrafficData - RankByCountry - Country[0] - Contribution - PageViews - $',
 'TrafficData - RankByCountry - Country[0] - Contribution - Users - $',
 'TrafficData - RankByCountry - Country[1] - @Code',
 'TrafficData - RankByCountry - Country[1] - Rank - $',
 

## Retrieve data from AWIS

In [109]:
# Query AWIS for each selected domain and build one flattened row per domain.
# Rows are collected in a list and concatenated once at the end, which avoids
# re-copying the growing frame (and concatenating onto an empty frame) on
# every iteration.
domainFrames = []

# Neither the XML-to-dict converter nor the namespace pattern depends on the
# domain, so they are created once outside the loop. The dots are escaped so
# they match only literal dots.
bf = BadgerFish(dict_type=dict)
namespacePattern = re.compile(r'\{http://awis\.amazonaws\.com(.+?)\}')

for url in allDomainNames[0:1]:  # the slice limits the run to the first domain
    print(url)

    # Create AWIS request object
    obj = CallAwis(awsAccessKeyId, awsSecretAccessKey)
    urlinfo = obj.urlinfo(url)

    # Convert response of BeautifulSoup object from AWIS to dict
    awisResultsDict = bf.data(fromstring(str(urlinfo)))

    # Clean response dict: strip the '{http://awis.amazonaws.com...}' namespace
    # prefix from the dict's string form, then rebuild the dict from that text.
    # NOTE(review): eval() executes text derived from a remote response. The
    # text is expected to be a plain dict literal, so ast.literal_eval would
    # be a safer way to parse it - confirm and switch.
    awisResultsCleaned = eval(namespacePattern.sub('', str(awisResultsDict)))
    awisResultsCleaned = awisResultsCleaned['UrlInfoResponse']['Response']['UrlInfoResult']['Alexa']

    # Flatten dictionary
    namelist, values = flatten(awisResultsCleaned)

    # Queue this domain's single-row frame for the final concat
    domainFrames.append(pd.DataFrame([values], columns=namelist))

# pd.concat raises on an empty list, so fall back to an empty frame.
domainMetricsDf = pd.concat(domainFrames) if domainFrames else pd.DataFrame()
print("Retrieval of metrics completed")

loveandroad.com
Retrieval of metrics completed


In [110]:
# Report the number of rows, then the number of columns, one per line
print(*domainMetricsDf.shape, sep='\n')

1
85


In [None]:
# Write the cleaned metrics to CSV without the index column.
# cleanedDomainMetricsDf is assigned in the cleaning cell further down, so that
# cell must have been executed before this one.
cleanedDomainMetricsDf.to_csv(path_or_buf='cleaned_domain_metrics_df' + '.csv', index=False)

In [111]:
# Keep only the columns whose share of null values is below 95%
cleanedDomainMetricsDf = domainMetricsDf[
    domainMetricsDf.columns[domainMetricsDf.isnull().mean() < 0.95]
]

# Drop every column whose name matches '@type'
cleanedDomainMetricsDf = cleanedDomainMetricsDf[
    cleanedDomainMetricsDf.columns.drop(
        list(cleanedDomainMetricsDf.filter(regex='@type'))
    )
]

# Shorten each column name to the part before its ' - $' marker
colNameList = [
    colName.split(' - $')[0] for colName in list(cleanedDomainMetricsDf.columns)
]

# print(colNameList)
cleanedDomainMetricsDf.columns = colNameList
cleanedDomainMetricsDf

Unnamed: 0,Request - Arguments - Argument[0] - Name,Request - Arguments - Argument[0] - Value,Request - Arguments - Argument[1] - Name,Request - Arguments - Argument[1] - Value,ContactInfo - DataUrl,ContentData - DataUrl,ContentData - SiteData - Title,ContentData - Speed - MedianLoadTime,ContentData - Speed - Percentile,ContentData - LinksInCount,...,TrafficData - UsageStatistics - UsageStatistic[3] - PageViews - PerMillion - Delta,TrafficData - UsageStatistics - UsageStatistic[3] - PageViews - Rank - Value,TrafficData - UsageStatistics - UsageStatistic[3] - PageViews - Rank - Delta,TrafficData - UsageStatistics - UsageStatistic[3] - PageViews - PerUser - Value,TrafficData - UsageStatistics - UsageStatistic[3] - PageViews - PerUser - Delta,TrafficData - ContributingSubdomains - ContributingSubdomain - DataUrl,TrafficData - ContributingSubdomains - ContributingSubdomain - TimeRange - Months,TrafficData - ContributingSubdomains - ContributingSubdomain - Reach - Percentage,TrafficData - ContributingSubdomains - ContributingSubdomain - PageViews - Percentage,TrafficData - ContributingSubdomains - ContributingSubdomain - PageViews - PerUser
0,url,loveandroad.com,responsegroup,"RelatedLinks,Categories,Rank,ContactInfo,RankB...",loveandroad.com,loveandroad.com,loveandroad.com/,2266,37,106,...,-57.829%,501604,174155,1,+20%,loveandroad.com,1,100.00%,100.00%,1.5


In [34]:
# Writes to output csv 
# domainMetricsDf.to_csv('domain_metrics_df' + '.csv', index=False)
# cleanedDomainMetricsDf.to_csv('cleaned_domain_metrics_df' + '.csv', index=False)

#### Resources:
1. Converting BeautifulSoup Object to dict
    - https://pypi.org/project/xmljson/