In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import string
import json
from random import sample
import re
import time
import requests
from datetime import datetime
import collections
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
from s3fs import S3FileSystem
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Prefix of a Play Store app-details URL; a bundle id is appended to build the full address.
PLAY_STORE_URL = "https://play.google.com/store/apps/details?id="

In [3]:
# Load the candidate bundle ids and discard rows that have no id at all.
bundles_android = pd.read_csv("../data/bundles_android.csv")
bundles_android = bundles_android[bundles_android.bundle_id.notnull()]

# Drop ids that match any of the excluded prefixes/suffix below.
# NOTE(review): startswith("m") removes EVERY id whose first letter is "m"
# (e.g. "me.", "mobi."), not only "m."-style hosts - confirm this is intended.
bundle_id_text = bundles_android.bundle_id.str
is_excluded = (
    bundle_id_text.startswith("www")
    | bundle_id_text.startswith("m")
    | bundle_id_text.startswith("http")
    | bundle_id_text.endswith(".com")
)
bundles_android = bundles_android[~is_excluded]
print(bundles_android.shape)

(50398, 1)


In [4]:
# Compiled once at definition time instead of on every call.
#
# The previous character class contained the single range U+24C2..U+1F251,
# which spans CJK ideographs, kana, Hangul and more, so whole Chinese /
# Japanese / Korean strings were deleted. Only the symbol/emoji sub-ranges of
# that span are listed here; ordinary letters and ideographs are left alone.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    u"\u24C2"                 # circled latin capital letter M
    u"\u25A0-\u25FF"          # geometric shapes
    u"\u2600-\u26FF"          # miscellaneous symbols
    u"\u2702-\u27B0"          # dingbats
    u"\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    u"\uFE00-\uFE0F"          # variation selectors
    "]+"
)


def strip_emoji(text):
    """Remove emoji and pictographic symbols from ``text``.

    Args:
        text: String to clean. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed. Characters outside those ranges (including CJK, kana and
        Hangul text) are preserved.
    """
    if not text:
        return text
    return _EMOJI_PATTERN.sub('', text)


def clean_string(text):
    """Return ``text`` with all double quotes and backslashes removed."""
    cleaned = text
    for unwanted in ('"', '\\'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned

In [5]:
def get_html_source(bundle_id, timeout=10):
    """Download the Play Store details page for ``bundle_id``.

    Args:
        bundle_id: Identifier appended to ``PLAY_STORE_URL`` to form the
            request URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without it the request could block indefinitely and
            ``requests.Timeout`` would practically never fire.

    Returns:
        The response body (``page.content``) when the HTTP status code is
        below 400; ``None`` when the status is 400 or above, or when the
        request itself fails (timeout, connection error, ...).
    """
    try:
        page = requests.get(f"{PLAY_STORE_URL}{bundle_id}", timeout=timeout)
    except requests.RequestException as err:
        # Python 3 exceptions have no ``.message``; str(err) is the portable
        # form. Return here so ``page`` is never read while unbound.
        print({"message": str(err)})
        return None

    if page.status_code < 400:
        return page.content
    return None

In [26]:
# Pick 20 of the unique bundle ids at random for a trial scrape.
# No seed is set in this cell, so each run draws a different sample.
bundleid_list = bundles_android.bundle_id.unique().tolist()
samples = sample(bundleid_list, k=20)

In [27]:
def _load_json_ld(soup):
    """Return the decoded ``application/ld+json`` script payload, or None if there is none."""
    payload = None
    for script in soup.find_all('script'):
        if script.attrs.get('type') != "application/ld+json" or not script.contents:
            continue
        try:
            # Keep overwriting so that the last parseable script wins, which
            # is what the original loop did.
            payload = json.loads(script.contents[0])
        except ValueError:
            continue
    return payload


def _parse_additional_details(soup):
    """Pair the text of each ``div.BgcNfc`` label with the text of the ``span.htlgb`` values."""
    keys = [div.find(text=True) for div in soup.find_all('div', class_="BgcNfc")]
    values = [span.find(text=True)
              for span in soup.find_all('span', class_="htlgb")]
    # Order-preserving de-duplication (linear-time equivalent of
    # sorted(set(values), key=values.index)).
    # NOTE(review): this also merges two different labels that show the same
    # text, shifting every later pair - confirm against the page markup.
    values = list(dict.fromkeys(values))
    return dict(zip(keys, values))


def get_app_metadata(bundle_id):
    """Scrape the Play Store page of ``bundle_id`` into a flat metadata dict.

    Args:
        bundle_id: Identifier passed to ``get_html_source``.

    Returns:
        A dict holding only ``"bundle_id"`` when the page could not be
        downloaded or carries no usable ``application/ld+json`` payload.
        Otherwise a dict with the keys title, app_type, description,
        operatingSystem, contentRating, primary_genre, ratingValue,
        ratingCount, price, priceCurrency, author_name, author_url, size,
        installs, updated (a ``datetime.date``), bundle_id and, when a
        "Cover art" image exists, icon_url. Any field the page does not
        provide is ``None`` instead of raising.
    """
    html = get_html_source(bundle_id)
    if html is None:
        return {"bundle_id": bundle_id}

    soup = BeautifulSoup(html, 'html.parser')
    contents = _load_json_ld(soup)
    if not isinstance(contents, dict):
        # Nearly every field comes from the JSON-LD payload; without it the
        # page is reported exactly like a failed download.
        return {"bundle_id": bundle_id}

    app_metadata = {}

    title_tag = soup.find('title', id="main-title")
    title_text = title_tag.find(text=True) if title_tag is not None else None
    app_metadata["title"] = (
        title_text.replace(" - Apps on Google Play", "") if title_text else None)

    description = contents.get("description")
    content_rating = contents.get("contentRating")
    app_metadata["app_type"] = contents.get('@type')
    app_metadata["description"] = (
        clean_string(strip_emoji(description)) if description else description)
    app_metadata["operatingSystem"] = contents.get("operatingSystem")
    app_metadata["contentRating"] = (
        content_rating.replace("Rated for ", "") if content_rating
        else content_rating)
    app_metadata["primary_genre"] = contents.get("applicationCategory")

    rating = contents.get("aggregateRating") or {}
    app_metadata["ratingValue"] = rating.get('ratingValue')
    app_metadata["ratingCount"] = rating.get('ratingCount')

    offers = contents.get("offers") or [{}]
    first_offer = offers[0] if isinstance(offers, list) else offers
    app_metadata["price"] = first_offer.get('price')
    app_metadata["priceCurrency"] = first_offer.get('priceCurrency')

    author = contents.get("author") or {}
    app_metadata["author_name"] = author.get("name")
    app_metadata["author_url"] = author.get("url")

    additional_details = _parse_additional_details(soup)
    app_metadata["size"] = additional_details.get("Size")
    app_metadata["installs"] = additional_details.get("Installs")
    try:
        app_metadata["updated"] = datetime.strptime(
            additional_details.get("Updated"), '%B %d, %Y').date()
    except (TypeError, ValueError):
        # Missing label (None -> TypeError) or a date not written as
        # e.g. "March 15, 2020" (ValueError).
        app_metadata["updated"] = None

    app_metadata["bundle_id"] = bundle_id
    icon = soup.find('img', alt="Cover art")
    if icon is not None:
        app_metadata["icon_url"] = icon.get('src')
    return app_metadata

In [28]:
# processed = Parallel(n_jobs=1)(delayed(get_app_metadata)(bundle_id)
#                                for bundle_id in tqdm(samples))

In [29]:
def pandas_to_s3_parquet(df, profile_name=None, partition_cols=None,
                         bucket="<bucket_name>", prefix="<prefix_name>"):
    """Write ``df`` as a Parquet dataset under ``s3://{bucket}/{prefix}``.

    Args:
        df: pandas DataFrame to convert with ``pa.Table.from_pandas``.
        profile_name: Forwarded to ``S3FileSystem(profile_name=...)`` when
            given; when ``None`` the filesystem is built with no arguments.
        partition_cols: Passed straight to ``pq.write_to_dataset``.
        bucket: Destination bucket. Defaults to the placeholder that used to
            be hard-coded in the body, so existing calls behave as before.
        prefix: Key prefix inside the bucket; same placeholder default.

    Returns:
        None. The only effect is the dataset written through
        ``pq.write_to_dataset``.
    """
    s3_url = f's3://{bucket}/{prefix}'
    if profile_name is None:
        s3 = S3FileSystem()
    else:
        s3 = S3FileSystem(profile_name=profile_name)
    table = pa.Table.from_pandas(df)
    pq.write_to_dataset(table=table, partition_cols=partition_cols,
                        root_path=s3_url,
                        filesystem=s3)

In [31]:
# Scrape every sampled bundle id and keep the apps whose page could be parsed.
play_store = []
start = time.time()
for index, bundle_id in enumerate(samples):
    processed = get_app_metadata(bundle_id)

    # A failed lookup comes back holding nothing but "bundle_id"; skip those.
    if len(processed) > 1:
        play_store.append(processed)

    # Every 100th app: report the time since the previous report, then
    # restart the clock.
    reached_checkpoint = index > 0 and index % 100 == 0
    if reached_checkpoint:
        end = time.time()
        print(f"Time elapsed for scraping {index} android apps",
              f"{(end-start)/60.0} minutes")
        start = time.time()

play_store_df = pd.DataFrame(play_store)


In [32]:
# Display the scraped metadata table as this cell's output.
play_store_df

Unnamed: 0,title,app_type,description,operatingSystem,contentRating,primary_genre,ratingValue,ratingCount,price,priceCurrency,author_name,author_url,size,installs,updated,bundle_id,icon_url
0,Mod PokeCraft + New Mod and Skins,SoftwareApplication,Three new entites known as pokemobs have been ...,ANDROID,Everyone,GAME_ADVENTURE,3.0987653732299805,1665,0,USD,M.M.Studio,,68M,"100,000+",2020-03-15,com.doublemstudio.modpokecraft,https://lh3.googleusercontent.com/f9qzSLzpn7Tc...
1,Family Zoo: The Story,SoftwareApplication,Celebrate Valentine's Day\n\nThe most awaited ...,ANDROID,Everyone,GAME_PUZZLE,4.288489818572998,183044,0,USD,Plarium LLC,https://www.familyzoothestory.com/,101M,"10,000,000+",2020-04-07,com.plarium.zoo,https://lh3.googleusercontent.com/xPjtiqnjoGoG...
2,Трактор+,SoftwareApplication,Трактор+ от Sports.ru для Android – новое прил...,ANDROID,Everyone,SPORTS,4.619999885559082,628,0,USD,Sports.ru,https://sports.ru,13M,"10,000+",2019-07-19,ru.sports.khl_traktor,https://lh3.googleusercontent.com/YeWSUlKH6rdj...
3,Word Search Quest : Word Search Stacks Puzzle ...,SoftwareApplication,The most addictive word puzzle game with a rel...,ANDROID,Everyone,GAME_WORD,4.650000095367432,201,0,USD,Beasty App Media L.L.C.,http://beastyappmedia.com,33M,"10,000+",2019-12-19,com.ghkgame.wordquest,https://lh3.googleusercontent.com/ZThNAA5we_YX...
4,Yukkuri My Friends,SoftwareApplication,Yukkuris love manjyus. By feeding them manjyus...,ANDROID,Teen,GAME_ADVENTURE,4.504524230957031,32099,0,USD,MiuLabo,http://bocchigames.main.jp/,Varies with device,"500,000+",2020-04-04,jp.miura.yukkurisodateteittene,https://lh3.googleusercontent.com/q07Xbj-irbZ-...
5,Sniper 3D Shooter- Free Gun Shooting Game,SoftwareApplication,You like killing games? If so The Sniper 3D Sh...,ANDROID,Mature 17+,GAME_ROLE_PLAYING,4.205069065093994,8925,0,USD,Fun Shooting Games For Free,https://atm451d89e7f5454a338be2.adstxtmarket.com,Varies with device,"1,000,000+",2019-10-18,com.shootinggames.sniper3d.assassin,https://lh3.googleusercontent.com/qjItSFn0nNg1...


In [None]:
# Tag the scraped rows with a fixed snapshot date and upload them to S3,
# partitioned by that date.
# NOTE(review): the guard requires MORE than one row, so a frame with exactly
# one scraped app is never uploaded - confirm whether "> 0" was intended.
if len(play_store_df.index) > 1:
    play_store_df["date"] = "2020-03-31"
    pandas_to_s3_parquet(
        play_store_df,
        profile_name="<profile_name>",
        partition_cols=["date"],
    )