In [1]:
import string
import json
import re
import pandas as pd
from datetime import datetime
import requests
from joblib import Parallel, delayed
from tqdm import tqdm
from random import sample
import boto3
from s3fs import S3FileSystem
import pyarrow as pa
import pyarrow.parquet as pq

In [2]:
# Prefix of the iTunes lookup endpoint; get_api_response appends the id to it.
APP_STORE_JSON_URL = 'https://itunes.apple.com/lookup?id='

In [3]:
# Compiled once at definition time instead of on every call.
# Every range is spelled out explicitly: a single catch-all range running from
# U+24C2 to U+1F251 would also match CJK ideographs, kana, Hangul and most
# other non-Latin text, erasing descriptions written in those scripts.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, playing cards, enclosed alphanumerics (incl. flag letters) and ideographs
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows (stars, circles)
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+",
    flags=re.UNICODE,
)


def strip_emoji(text):
    """Remove emoji and pictographic symbols from ``text``.

    Parameters
    ----------
    text : str or None
        Text to clean.

    Returns
    -------
    str or None
        ``text`` with every run of characters matched by ``_EMOJI_PATTERN``
        removed. Falsy input (``None`` or ``""``) is returned unchanged.
        Letters of non-Latin scripts (CJK, Hangul, ...) are preserved.
    """
    if not text:
        return text
    return _EMOJI_PATTERN.sub("", text)


def clean_string(text):
    """Drop double quotes and backslashes from ``text``.

    Parameters
    ----------
    text : str or None
        Text to clean.

    Returns
    -------
    str or None
        ``text`` without any ``"`` or ``\\`` characters. ``None`` is passed
        through unchanged so the function can be chained after
        ``strip_emoji``, which also returns ``None`` for ``None`` input.
    """
    if text is None:
        return None
    return text.replace('"', '').replace('\\', '')

In [4]:
# Load the bundle-id CSV (path is relative to the notebook's working directory)
# and display its (rows, columns) shape as the cell output.
bundles_ios = pd.read_csv("../data/bundles_ios.csv")
bundles_ios.shape

(11471, 1)

In [13]:
def get_app_metadata(results, bundle_id):
    """Flatten one lookup result into a 17-key metadata record.

    Parameters
    ----------
    results : mapping
        A single entry of the lookup response's ``results`` list.
    bundle_id
        Identifier the lookup was made for; stored as-is under ``bundle_id``.

    Returns
    -------
    dict
        Record whose keys are always present, in a fixed order. Fields whose
        source key is missing from ``results`` are ``None``, except
        ``icon_url`` which falls back to ``"NA"``. ``installs`` is always
        ``None``.
    """
    def field(key, fallback=None):
        # Value stored under `key`, or `fallback` when the key is absent.
        return results[key] if key in results else fallback

    # File size expressed in whole mebibytes with an "M" suffix.
    size_label = None
    if "fileSizeBytes" in results:
        whole_mib = int(results["fileSizeBytes"]) // (1024 * 1024)
        size_label = str(whole_mib) + "M"

    # Description with emoji, double quotes and backslashes removed.
    cleaned_description = None
    if "description" in results:
        cleaned_description = clean_string(strip_emoji(results["description"]))

    # Only the calendar date of the current version's release is kept.
    release_day = None
    if "currentVersionReleaseDate" in results:
        release_stamp = datetime.strptime(
            results["currentVersionReleaseDate"], '%Y-%m-%dT%H:%M:%SZ')
        release_day = release_stamp.date()

    return {
        "operatingSystem": "IOS",
        "title": field("trackName"),
        "contentRating": field("contentAdvisoryRating"),
        "app_type": field("kind"),
        "size": size_label,
        "description": cleaned_description,
        # app rating
        "ratingValue": field("averageUserRating"),
        "ratingCount": field("userRatingCount"),
        "price": field("price"),
        "priceCurrency": field("currency"),
        "primary_genre": field("primaryGenreName"),
        "author_name": field("sellerName"),
        "author_url": field("sellerUrl"),
        "updated": release_day,
        "installs": None,
        "bundle_id": bundle_id,
        "icon_url": field("artworkUrl100", "NA"),
    }

In [14]:
# De-duplicate the ids, then draw 100 of them with random.sample.
# NOTE(review): no seed is set in this cell, so the drawn ids are expected to
# differ between runs; sample() also needs at least 100 distinct ids.
bundleid_list = list(bundles_ios.bundle_id.unique())
samples = sample(bundleid_list, 100)

In [15]:
def get_api_response(bundle_id, timeout=10):
    """Look up one app and return its flattened metadata record.

    Parameters
    ----------
    bundle_id
        Identifier appended (via ``str``) to ``APP_STORE_JSON_URL``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the calling worker indefinitely.

    Returns
    -------
    dict or None
        The record built by ``get_app_metadata`` from the first result, or
        ``None`` when the request fails (connection error, timeout, HTTP
        error status), the body is empty or not valid JSON, or the lookup
        reports no results. Returning ``None`` instead of raising keeps one
        bad id from aborting a whole batch of lookups.
    """
    try:
        resp = requests.get(APP_STORE_JSON_URL + str(bundle_id),
                            timeout=timeout)
        resp.raise_for_status()
        if not resp.text:
            return None
        # json.JSONDecodeError is a ValueError subclass.
        payload = json.loads(resp.text)
    except (requests.RequestException, ValueError):
        return None

    if not isinstance(payload, dict):
        return None

    matches = payload.get("results") or []
    if payload.get("resultCount", 0) > 0 and matches:
        return get_app_metadata(matches[0], bundle_id)
    return None

In [16]:

# Run get_api_response for every sampled id through joblib with n_jobs=4;
# `processed` collects one return value (a metadata dict or None) per id.
# tqdm wraps the input iterable, so the bar advances as ids are consumed.
processed = Parallel(n_jobs=4)(delayed(get_api_response)(bundle_id)
                               for bundle_id in tqdm(samples))

100%|██████████| 100/100 [01:31<00:00,  1.09it/s]


In [18]:
# Keep only successful lookups that produced a complete 17-field record,
# then tag every row with the snapshot date used as the partition value.
app_store = [
    record
    for record in processed
    if record is not None and len(record) == 17
]
app_store_df = pd.DataFrame(app_store).assign(date="2020-04-01")
app_store_df.head()

Unnamed: 0,operatingSystem,title,contentRating,app_type,size,description,ratingValue,ratingCount,price,priceCurrency,primary_genre,author_name,author_url,updated,installs,bundle_id,icon_url,date
0,IOS,Flower Crush - Match 3 & Blast Garden to Bloom!,4+,software,217M,Fall in love with this sweet flower match 3 pu...,4.38542,96,0.0,USD,Games,Tho Huynh Ngoc,http://www.cobala.xyz,2017-04-24,,1227364416,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,2020-04-01
1,IOS,KMOT-TV First Warn Weather,4+,software,84M,KMOT is proud to announce a full featured weat...,3.66667,3,0.0,USD,Weather,"Gray Television Group, Inc.",http://www.kmot.com/about/kmotsales,2020-02-17,,451557364,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,2020-04-01
2,IOS,Stone Skimming,12+,software,262M,Throw stones and make them skip as far as you ...,4.5315,55095,0.0,USD,Games,Voodoo,http://voodoo.io,2020-04-10,,1344851805,https://is3-ssl.mzstatic.com/image/thumb/Purpl...,2020-04-01
3,IOS,AS,4+,software,91M,"The new AS app, completely revamped so you can...",2.875,8,0.0,USD,Sports,Diario AS,http://as.com,2020-04-15,,311797767,https://is1-ssl.mzstatic.com/image/thumb/Purpl...,2020-04-01
4,IOS,Mudras [YOGA],4+,software,34M,New Mudras [YOGA] is now available for free\n\...,2.875,8,0.0,USD,Health & Fitness,Swapna Puramsetty,http://perfectgym.co.in,2020-03-07,,1316985118,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,2020-04-01


In [None]:
def pandas_to_s3_parquet(df, profile_name=None, partition_cols=None,
                         bucket="<bucket_name>", prefix="<prefix>"):
    """Write a DataFrame to S3 as a Parquet dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to upload.
    profile_name : str, optional
        Profile handed to ``S3FileSystem``. When ``None`` the filesystem is
        created without an explicit profile.
    partition_cols : list of str, optional
        Columns forwarded to ``pyarrow.parquet.write_to_dataset`` for
        partitioning.
    bucket : str, optional
        Destination bucket name. The default is a placeholder that must be
        replaced (or overridden by the caller) before a real upload.
    prefix : str, optional
        Key prefix under the bucket; same placeholder caveat as ``bucket``.

    Returns
    -------
    None
        The dataset is written under ``s3://{bucket}/{prefix}`` and the
        Arrow table is printed as a side effect.
    """
    s3_url = f's3://{bucket}/{prefix}'
    if profile_name is None:
        s3 = S3FileSystem()
    else:
        # NOTE(review): the keyword accepted for the profile may depend on
        # the installed s3fs version -- confirm against the environment.
        s3 = S3FileSystem(profile_name=profile_name)
    table = pa.Table.from_pandas(df)
    print(table)
    pq.write_to_dataset(table=table, partition_cols=partition_cols,
                        root_path=s3_url,
                        filesystem=s3)

In [None]:
# Upload whenever there is at least one row; a `> 1` comparison would
# silently skip a frame that holds exactly one record.
if len(app_store_df) > 0:
    pandas_to_s3_parquet(app_store_df, profile_name="<profile_name>",
                         partition_cols=["date"])