# Web Scraping

## 1) Scraping a list of URLs

In [1]:
import pandas as pd
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import os
import time 
import requests
import re
# NOTE(review): this line only reads the option and discards the value; it does not
# change how wide columns are displayed. Presumably an assignment was intended —
# confirm the desired value.
pd.options.display.max_colwidth
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
# Store the same path in the process environment under "webdriver.chrome.driver".
os.environ["webdriver.chrome.driver"] = chromedriver

Sensortower.com has the mobile games data.

- Google Play

https://sensortower.com/android/rankings/top/mobile/us/game?date=2019-04-30
https://sensortower.com/android/rankings/top/mobile/us/game?date=2019-05-31
https://sensortower.com/android/rankings/top/mobile/us/game?date=2019-06-30

- App Store

https://sensortower.com/ios/rankings/top/iphone/us/games?date=2019-04-30
https://sensortower.com/ios/rankings/top/iphone/us/games?date=2019-05-31
https://sensortower.com/ios/rankings/top/iphone/us/games?date=2019-06-30

- Scrape 6 web pages to get 7200 total URLs.
- Each web page has 3 rankings: Free, Paid, Top Grossing.
- Pull the top 400 games from each ranking:
  6 * 3 * 400 = 7200

In [27]:
# Selenium scrolls the ranking page to the bottom until its height stops growing,
# then BeautifulSoup collects the href of every <a class="name"> into url_list.
driver = webdriver.Chrome(chromedriver)
url= "https://sensortower.com/ios/rankings/top/iphone/us/games?date=2019-04-30"
try:
    driver.maximize_window()
    driver.get(url)
    pause_time = 0.5  # seconds to wait after each scroll so new content can load
    # Height of the page before the first scroll
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to the bottom of whatever is currently loaded
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to load more rows
        time.sleep(pause_time)

        # An unchanged height after scrolling means the end of the page was reached
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    # Parse the fully scrolled page while the browser is still open
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even if scrolling or parsing failed; otherwise
    # every run of this cell leaves a Chrome/chromedriver process behind.
    driver.quit()

# Anchors without an href yield None here; they are dropped in a later cell.
url_list = [link.get('href') for link in soup.find_all('a', class_="name")]


In [28]:
# Peek at the first 20 collected hrefs
url_list[:20]

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '/ios/us/geisha-tokyo-inc/app/traffic-run/1434400630/',
 '/ios/us/mojang/app/minecraft/479516143/',
 '/ios/us/supercell/app/clash-royale/1053012308/',
 '/ios/us/crazy-labs/app/amaze/1452526406/',
 '/ios/us/ndemic-creations/app/plague-inc/525818839/',
 '/ios/us/supercell/app/clash-of-clans/529479190/',
 '/ios/us/voodoo/app/purple-diver/1453402726/',
 '/ios/us/warner-bros/app/heads-up/623592465/',
 '/ios/us/epic-games/app/fortnite/1261357853/',
 '/ios/us/saygames-llc/app/clean-road/1455654495/',
 '/ios/us/ninja-kiwi/app/bloons-td-6/1118115766/']

In [29]:
# Check len: total number of hrefs collected, including the None entries
len(url_list)

1209

In [31]:
# Count the entries for which no href was found (None).
# Identity comparison (`is None`) is the idiomatic None test.
countnone = sum(href is None for href in url_list)
countnone

9

In [32]:
# Keep only the truthy hrefs (this discards the None entries)
url_list_clean = list(filter(None, url_list))

In [33]:
# Check len: number of hrefs left after dropping the empty entries
len(url_list_clean)

1200

In [34]:
# Peek at the first 10 cleaned hrefs
url_list_clean[:10]

['/ios/us/geisha-tokyo-inc/app/traffic-run/1434400630/',
 '/ios/us/mojang/app/minecraft/479516143/',
 '/ios/us/supercell/app/clash-royale/1053012308/',
 '/ios/us/crazy-labs/app/amaze/1452526406/',
 '/ios/us/ndemic-creations/app/plague-inc/525818839/',
 '/ios/us/supercell/app/clash-of-clans/529479190/',
 '/ios/us/voodoo/app/purple-diver/1453402726/',
 '/ios/us/warner-bros/app/heads-up/623592465/',
 '/ios/us/epic-games/app/fortnite/1261357853/',
 '/ios/us/saygames-llc/app/clean-road/1455654495/']

In [36]:
# Put the relative hrefs in a one-column frame, then prefix each with the site
# root so that every entry becomes an absolute URL.
url_pd = pd.DataFrame({"URL": url_list_clean})
url_pd = url_pd.assign(URL='https://sensortower.com' + url_pd["URL"].astype(str))

In [43]:
# Combine all urls to one dataframe
# NOTE(review): `url_pd2....` is a placeholder, not valid Python — this cell cannot
# run as written. url_pd1, url_pd2, ... are not defined anywhere in the visible
# notebook; presumably they come from re-running the scraping cells once per
# ranking page — TODO confirm and list every frame explicitly.
frames = [url_pd, url_pd1, url_pd2....]

url_df = pd.concat(frames)

In [46]:
# Remove repeated URLs (the first occurrence is kept); around 3200 unique URLs remain.
url_df = url_df.drop_duplicates()

In [48]:
# Write the URL frame to disk without the index column
url_df.to_csv(path_or_buf='url_list.csv', index=False)

## 2) Scraping the game overview page with a list of 3200 URLs

In [9]:
# Compiled pattern for the "ko component" marker text searched for in each page's HTML.
kocomponent = re.compile(r"ko component")

In [10]:
# import the 3200 url dataframe; the file name must match the one written
# earlier by url_df.to_csv('url_list.csv', ...)
url_list = pd.read_csv('url_list.csv')

In [11]:
# Pull the URL column out as a plain Python list
urls = url_list.loc[:, 'URL'].tolist()

['https://sensortower.com/ios/us/voodoo/app/aquapark-io/1453989822/',
 'https://sensortower.com/ios/us/mojang/app/minecraft/479516143/',
 'https://sensortower.com/ios/us/supercell/app/clash-of-clans/529479190/',
 'https://sensortower.com/ios/us/voodoo/app/touch-the-wall/1464879969/',
 'https://sensortower.com/ios/us/warner-bros/app/heads-up/623592465/',
 'https://sensortower.com/ios/us/king/app/candy-crush-saga/553834731/',
 'https://sensortower.com/ios/us/voodoo/app/pottery/1451304804/',
 'https://sensortower.com/ios/us/ndemic-creations/app/plague-inc/525818839/',
 'https://sensortower.com/ios/us/niantic-inc/app/pokemon-go/1094591345/',
 'https://sensortower.com/ios/us/good-job-games/app/fun-race-3d/1462556579/']

## 2) Scraping data from each url

In [19]:
# Function to soup the html and convert the part I want to dict then to pd.dataframe
def scapegameinfo(urls, pause=2, batch_size=5, batch_pause=10):
    """Scrape the "ko component" params payload of each page into one DataFrame.

    For every URL the page is downloaded, the second text node matching
    ``kocomponent`` is located, and the text after ``"params: "`` (minus its
    last three characters) is evaluated into a dict that becomes one row.

    Parameters
    ----------
    urls : iterable of str
        Pages to download, in the order the rows should appear.
    pause : float, default 2
        Seconds to sleep after every request.
    batch_size : int, default 5
        After every ``batch_size`` requests an extra pause is taken.
        Must be a positive integer.
    batch_pause : float, default 10
        Seconds of the extra pause taken after each full batch.

    Returns
    -------
    pandas.DataFrame
        One row per URL, one column per payload key; empty when ``urls`` is empty.

    Raises
    ------
    IndexError
        If a page has fewer than two matching text nodes, or the matched text
        does not contain ``"params: "``.
    """
    # Names the payload uses for its literals, mapped to their Python values.
    literals = {"null": None, "false": False, "true": True}

    # Collect plain dicts and build the frame once at the end: growing a
    # DataFrame row by row is quadratic, and DataFrame.append no longer exists
    # in pandas 2.x.
    records = []
    for i, url in enumerate(urls):
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        payload = soup.find_all(text=kocomponent)[1].split("params: ")[1][:-3]
        # SECURITY(review): eval() executes whatever text the remote page sends.
        # Only run this against a trusted site; if the payload turns out to be
        # strict JSON, a JSON parser would be the safe replacement.
        records.append(eval(payload, globals(), literals))
        time.sleep(pause)
        # Parentheses matter: `i+1 % 5` is `i + (1 % 5)`, which is never 0 here,
        # so without them the longer pause would never happen.
        if (i + 1) % batch_size == 0:
            time.sleep(batch_pause)
    return pd.DataFrame(records)


In [14]:
# Run the scraper over every URL. This takes hours for the full list (it scales
# with the number of URLs), so consider running it on about 200 URLs at a time.
game_df_raw = scapegameinfo(urls=urls)

In [15]:
# Preview the first five scraped rows
game_df_raw.head(n=5)

Unnamed: 0,app_id,app_profile_url,app_versions,app_view_url,apple_watch_enabled,apple_watch_icon,categories,category_chart_type_pairs,contains_ads,country,...,subtitle,support_url,top_in_app_purchases,trailers,unified_app,urlified_name,urlified_publisher_name,valid_countries,version,worldwide_release_date
0,1453990000.0,https://apps.apple.com/US/app/id1453989822?l=en,"[{'date': 1562544000000, 'value': '2.5'}, {'da...",/ios/us/voodoo/app/aquapark-io/1453989822/,,,"[{'countryUrl': 'us', 'deviceIdentifier': 'iph...","[[0, topfreeapplications], [6014, topfreeappli...",,US,...,Best water slide game,https://voodoo.io,"[{'iap_id': 1463210291, 'price': '$2.99', 'nam...",{'iPhone': {'img': 'https://is5-ssl.mzstatic.c...,,aquapark-io,voodoo,"[US, AU, CA, CN, FR, DE, GB, IT, JP, KR, RU, D...",2.5,1558100000000.0
1,479516100.0,https://apps.apple.com/US/app/id479516143?l=en,"[{'date': 1562630400000, 'value': '1.12'}, {'d...",/ios/us/mojang/app/minecraft/479516143/,,,"[{'countryUrl': 'us', 'deviceIdentifier': 'iph...","[[0, toppaidapplications], [6014, toppaidappli...",,US,...,"Create, explore and survive!",http://help.mojang.com,"[{'iap_id': 1213177982, 'price': '$1.99', 'nam...","{'iPhone': None, 'iPad': None}",,minecraft,mojang,"[US, AU, CA, FR, DE, GB, IT, JP, KR, RU, DZ, A...",1.12,1321520000000.0
2,529479200.0,https://apps.apple.com/US/app/id529479190?l=en,"[{'date': 1560988800000, 'value': '11.651.12'}...",/ios/us/supercell/app/clash-of-clans/529479190/,,,"[{'countryUrl': 'us', 'deviceIdentifier': 'iph...","[[0, topfreeapplications], [6014, topfreeappli...",,US,...,Build your empire,https://supercell.helpshift.com/a/clash-of-clans/,"[{'iap_id': 529484491, 'price': '$4.99', 'name...",{'iPhone': {'img': 'https://is5-ssl.mzstatic.c...,,clash-of-clans,supercell,"[US, AU, CA, CN, FR, DE, GB, IT, JP, KR, RU, D...",11.651.12,1343900000000.0
3,1464880000.0,https://apps.apple.com/US/app/id1464879969?l=en,"[{'date': 1562371200000, 'value': '1.1'}, {'da...",/ios/us/voodoo/app/touch-the-wall/1464879969/,,,"[{'countryUrl': 'us', 'deviceIdentifier': 'iph...","[[0, topfreeapplications], [6014, topfreeappli...",,US,...,Sneak your way to victory,https://www.voodoo.io,"[{'iap_id': 1470916394, 'price': '$2.99', 'nam...","{'iPhone': None, 'iPad': None}",,touch-the-wall,voodoo,"[US, AU, CA, CN, FR, DE, GB, IT, JP, KR, RU, D...",1.1,1561680000000.0
4,623592500.0,https://apps.apple.com/US/app/id623592465?l=en,"[{'date': 1560816000000, 'value': '3.9.23'}, {...",/ios/us/warner-bros/app/heads-up/623592465/,,,"[{'countryUrl': 'us', 'deviceIdentifier': 'iph...","[[0, toppaidapplications], [6014, toppaidappli...",,US,...,Trivia on the go,http://www.ellentube.com/article/apps-and-game...,"[{'iap_id': 709779977, 'price': '$0.99', 'name...",{'iPhone': {'img': 'https://is4-ssl.mzstatic.c...,,heads-up,warner-bros,"[US, AU, CA, CN, FR, DE, GB, IT, JP, KR, RU, D...",3.9.23,1413250000000.0


In [37]:
# Persist the raw scrape (the index is written as the first column)
game_df_raw.to_csv(path_or_buf="game_df_raw.csv")