# Google Play Store App Data Retrieval

This **Notebook-as-Tool** allows you:

1.   to query the Google Play Store for Apps and retrieve metadata about them,
2.   to retrieve a Network of similar Apps based on the search results.

For running or adapting this Colab Notebook you need to create a copy in your Google Drive: **File → Save a copy in Drive**. It will be stored in a folder ```Colab Notebooks```. Open this file with Google Colab and run the cells consecutively by pressing the **Play** button or pressing **shift+enter**.

**Important notes:**

- Code is hidden in the background of Colab forms. For viewing and editing the code, **double-click** the cell or select **View → Show/hide code**.
- Data will be stored in Google Drive in the folder ```Colab_Data```. A connection to your drive will be authenticated when running the setup code cells. This is temporary and only your current notebook will be connected to your drive. The connection will be revoked when the notebook is terminated or by selecting **Runtime → Factory reset runtime**.


**Credits:** This notebook was written by Marcus Burkhardt. For scraping data from Google's Play Store it uses the Library [google-play-scraper-dmi](https://pypi.org/project/google-play-scraper-dmi/) released by the Digital Methods Initiative. Documentation of the library can be found here: https://github.com/digitalmethodsinitiative/google-play-scraper. 

In [None]:
#@title Setup 1: Mount Google Drive for Loading and Storing Data
# Colab-specific module that provides the Google Drive integration.
from google.colab import drive
# Mount the drive at /content/gdrive; the later cells build their data path
# below gdrive/MyDrive, so this cell has to run before them.
drive.mount('/content/gdrive')

In [None]:
#@title Setup 2: Install and Load Required Libraries and Run Setup Procedures

# The currently released version of the google play scraper has a bug. 
# !pip install google-play-scraper-dmi

try: 
  from google_play_scraper.scraper import PlayStoreScraper
except: 
  print('The current version of the Google Play Scraper on PyPI has a bug. A fixed version is installed from Github instead.')
  print()
  !pip install git+https://github.com/bumatic/google-play-scraper.git
  from google_play_scraper.scraper import PlayStoreScraper

# Import Libraries
import os
import time
import pandas as pd
from datetime import datetime
from tqdm.notebook import tqdm
from google_play_scraper.scraper import PlayStoreScraper

# Defining path variable for data path
# NOTE: the path is relative to the current working directory; it presumably
# relies on the notebook running in /content, where Setup 1 mounts the drive.
data_path = os.path.join("gdrive", "MyDrive", "Colab_Data", "Data", "Google Play Store")
# exist_ok=True creates the folder only when it is missing, without the
# separate isdir() check that could race with a concurrent creation.
os.makedirs(data_path, exist_ok=True)

# Initialize Scrapers for Play Store
scraper = PlayStoreScraper()

In [None]:
#@title Setup 3: Definition of Core and Support Functions Used by the Tool(s)

def retrieve_apps(ids, country='de', lang='de'):
    """Retrieve Play Store details for one app id or a collection of ids.

    Args:
        ids: A single app id given as a string, or an iterable of app ids
            (list, tuple, set, pandas Series, ...). Any other value yields
            an empty result.
        country: Country code handed to ``scraper.get_app_details``.
        lang: Language code handed to ``scraper.get_app_details``.

    Returns:
        A pandas DataFrame created with ``pd.json_normalize`` from the
        details returned by the scraper. It is empty when no app could be
        retrieved.

    Raises:
        Any exception raised by ``scraper.get_app_details`` when ``ids`` is
        a single string. For a collection of ids, failures are printed and
        the affected id is skipped.
    """
    from collections.abc import Iterable

    # Pause before querying the store.
    time.sleep(2)
    apps = []
    if isinstance(ids, str):
        apps.append(scraper.get_app_details(ids, country=country, lang=lang))
    elif isinstance(ids, Iterable):
        # Materialize the ids so the progress bar knows the total count.
        for id_ in tqdm(list(ids)):
            try:
                apps.append(scraper.get_app_details(id_, country=country, lang=lang))
            except Exception:
                # Best effort: report the failing id and continue with the rest.
                print('Could not retrieve app {}'.format(id_))
    return pd.json_normalize(apps)

def query_store(query, num=100, page=1, country='de', lang='de'):
    """Look up app ids for a search query and fetch the details of each hit.

    Args:
        query: Search term handed to ``scraper.get_app_ids_for_query``.
        num: Forwarded as ``num`` to the id lookup.
        page: Forwarded as ``page`` to the id lookup.
        country: Country code used for the id lookup and the detail retrieval.
        lang: Language code used for the id lookup and the detail retrieval.

    Returns:
        The DataFrame produced by ``retrieve_apps`` for the ids that were found.
    """
    locale = {'country': country, 'lang': lang}
    app_ids = scraper.get_app_ids_for_query(query, num=num, page=page, **locale)
    status = '{} IDs of apps retrieved for query [{}]. Now retrieving data for individual apps.'
    print(status.format(len(app_ids), query))
    return retrieve_apps(app_ids, **locale)

def search(queries, num=100, page=1, country='de', lang='de', flatten=True):
    """Run several store queries and merge the results into one table.

    Args:
        queries: Iterable of search terms. A single string is treated as
            one query instead of being iterated character by character.
        num: Forwarded as ``num`` to ``query_store``.
        page: Forwarded as ``page`` to ``query_store``.
        country: Country code forwarded to ``query_store``.
        lang: Language code forwarded to ``query_store``.
        flatten: Currently unused; kept so existing calls keep working.

    Returns:
        A DataFrame with the apps of all queries, de-duplicated on the
        ``id`` column and with a fresh index. It is empty when no query
        returned any app.
    """
    if isinstance(queries, str):
        queries = [queries]

    frames = []
    for query in tqdm(queries, desc='Running queries'):
        frames.append(query_store(query, num=num, page=page, country=country, lang=lang))
        # Pause between queries.
        time.sleep(3)

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data = pd.concat(frames)
    if 'id' not in data.columns:
        # Nothing was retrieved, so there is no id column to de-duplicate on.
        return data.reset_index(drop=True)
    return data.drop_duplicates('id').reset_index(drop=True)

def get_similar_network(apps, country='de', lang='de', store='Play Store'):
    """Build a network of similar apps around a set of seed apps.

    For every seed app the ids of similar apps are requested from the
    scraper. Details are then retrieved for all similar apps that are not
    seeds themselves.

    Args:
        apps: DataFrame of seed apps; must contain the columns ``id``,
            ``title`` and ``category``. The frame is not modified.
        country: Country code used for all scraper requests.
        lang: Language code used for all scraper requests.
        store: Label written to the ``store`` column of the node table.

    Returns:
        A tuple ``(similarApps, similarNodes, similarEdges)``:
        ``similarApps`` holds the seed apps and the retrieved similar apps
        with a boolean ``is_seed`` column, de-duplicated on ``id``;
        ``similarNodes`` is the node table with the columns ``Id``,
        ``Label``, ``primaryGenre``, ``isSeed`` and ``store``;
        ``similarEdges`` is the edge table with the columns ``Source``
        (similar app) and ``Target`` (seed app).
    """
    edge_rows = []
    for id_ in tqdm(list(apps['id'])):
        # Pause between requests.
        time.sleep(1)
        try:
            similar_ids = scraper.get_similar_app_ids_for_app(id_, country=country, lang=lang)
        except Exception:
            # Best effort: report the failing seed and continue with the rest.
            similar_ids = []
            print('Could not retrieve similar apps for {}'.format(id_))
        edge_rows.extend([str(similar_id), str(id_)] for similar_id in similar_ids)
    time.sleep(5)
    similarEdges = pd.DataFrame(edge_rows, columns=['Source', 'Target'])

    # Seeds are already known, so only the other apps need to be fetched.
    # dict.fromkeys removes repeated ids while keeping their order, so every
    # app is requested once; the set makes the membership test O(1).
    seed_ids = {str(seed_id) for seed_id in apps['id']}
    ids2get = [item for item in dict.fromkeys(similarEdges['Source']) if item not in seed_ids]

    print('Details for {} apps will be retrieved.'.format(len(ids2get)))
    # Use the same country and language as for the similarity lookup.
    similarApps = retrieve_apps(ids2get, country=country, lang=lang)
    similarApps['is_seed'] = False
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    frames = [apps.assign(is_seed=True)]
    if not similarApps.empty:
        frames.append(similarApps)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    # Seeds come first, so they win when an id occurs in both frames.
    similarApps = pd.concat(frames).drop_duplicates('id')

    similarNodes = similarApps[['id', 'title', 'category', 'is_seed']].copy()
    similarNodes.columns = ['Id', 'Label', 'primaryGenre', 'isSeed']
    similarNodes = similarNodes.drop_duplicates('Id')
    similarNodes['store'] = store
    return similarApps, similarNodes, similarEdges

In [None]:
#@title Tool 1: Query Google Play Store (comma separate multiple queries)
queries = "" #@param {type:"string"}
# Split the form input into individual queries; empty entries (empty form,
# trailing or doubled commas) are dropped instead of being sent to the store.
queries = [query.strip() for query in queries.split(',') if query.strip()]
number = 5 #@param {type:"slider", min:5, max:1000, step:5}

if not queries:
  raise ValueError('Please enter at least one query.')

out_name = "" # could be added as a form field.

# Fall back to the joined queries when no explicit name is given.
if not out_name:
  out_name = '_'.join(queries)
# A '/' would be read as a sub folder that does not exist, and the write
# would only fail after all data has been retrieved.
out_name = out_name.replace('/', '_')
outfile_name = 'play_store_query_' + out_name + '_' + str(number) + '_items_' + str(datetime.now().date()) + '.csv'
outfile = os.path.join(data_path, outfile_name)
# Show the path without its first two components (gdrive/MyDrive).
print('Results will be stored at {}'.format('/'.join(outfile.split('/')[2:])))
print()

data = search(queries, num=number)
# Tab-separated output without the DataFrame index.
data.to_csv(outfile, sep='\t', index=False)

print('Done.')

In [None]:
#@title Tool 2: Retrieve Network of Similar Apps for Results of Tool 1

# Relies on ``data``, ``out_name`` and ``number`` set by Tool 1.
# All three files share the same suffix; building it once also guarantees
# that they carry the same date.
file_suffix = '_play_store_query_' + out_name + '_' + str(number) + '_seed_items_' + str(datetime.now().date()) + '.csv'
outfile_apps = os.path.join(data_path, 'similar_apps' + file_suffix)
outfile_nodes = os.path.join(data_path, 'similar_nodes' + file_suffix)
outfile_edges = os.path.join(data_path, 'similar_edges' + file_suffix)

# Show the paths without their first two components (gdrive/MyDrive).
print('Similar Apps will be saved to: {}'.format('/'.join(outfile_apps.split('/')[2:])))
print('Node file for similar Apps network will be saved to: {}'.format('/'.join(outfile_nodes.split('/')[2:])))
print('Edge file for similar Apps network will be saved to: {}'.format('/'.join(outfile_edges.split('/')[2:])))
print()

apps, nodes, edges = get_similar_network(data)

# Tab-separated output without the DataFrame index.
apps.to_csv(outfile_apps, sep='\t', index=False)
nodes.to_csv(outfile_nodes, sep='\t', index=False)
edges.to_csv(outfile_edges, sep='\t', index=False)

print('Done.')