# Find apps that are available on both the Apple App Store and the Google Play Store

In [1]:
import sys, os
import pandas as pd
from textdistance import levenshtein
from itertools import product

## Define some variables

In [2]:
# File names of the per-store CSVs holding the unique app entries
# (Google Play first, then Apple App Store).
google_unique_file = "merged_results_android_bystander_unique.csv"
apple_unique_file = "final_results_apple_bystander_unique.csv"

# Directory the input CSVs are read from and the combined CSV is written to.
final_dir = "final_results/"

## Find matches

In [8]:
# Combine the Apple and Google result sets, normalise the app titles on both
# sides and flag the titles that occur in both stores. The combined table is
# written to "google_apple_matches.csv" inside final_dir.
print()

# Read the Apple CSV; every column is loaded as text.
print("1) Reading Apple App Store unique entries ...", end=" ")
df2 = pd.read_csv(os.path.join(final_dir, apple_unique_file), sep=',', header=0, dtype='unicode')
num_rows = len(df2.index)
print(f"{num_rows} rows")

# Read the Google CSV; every column is loaded as text.
print("2) Reading Google Play Store unique entries ...", end=" ")
df3 = pd.read_csv(os.path.join(final_dir, google_unique_file), sep=',', header=0, dtype='unicode')
num_rows = len(df3.index)
print(f"{num_rows} rows")

# Drop columns we don't need.
df2 = df2.drop(columns=['pref', 'dupe_count', 'store_lang'])
df3 = df3.drop(columns=['pref', 'dupes_merged', 'store_lang', 'round'])

# Tag every row with the store it came from.
df2['store'] = "Apple"
df3['store'] = "Google"

spec_chars = ["!",'"',"#","%","&","'","(",")", "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_", "`","{","|","}","~","–"]

# Clean up Apple titles: the title is derived from 'nameid' by turning
# hyphens into spaces and lower-casing.
df2['Title'] = df2['nameid'].str.replace('-', ' ', regex=False).str.lower()
# NOTE(review): this removes the substring "app" anywhere, including inside
# other words, and the Google side has no equivalent step. Confirm that this
# is intended before changing it, since it affects which titles match.
df2['Title'] = df2['Title'].str.replace('app', '', regex=False)
df2['Title'] = df2['Title'].str.strip()
df2['appid'] = df2['nameid']
df2 = df2.drop(columns='nameid')

# Clean up Google titles: remove punctuation. The characters must be treated
# literally (regex=False): several of them ("(", "*", "+", "?", "[", "\\")
# are not valid regular expressions on their own and "." would match every
# character, so regex=True fails or misbehaves on pandas >= 2.0.
for char in spec_chars:
    df3['Title'] = df3['Title'].str.replace(char, '', regex=False)

# Collapse runs of whitespace, lower-case, then strip the store boilerplate.
df3['Title'] = df3['Title'].str.split().str.join(" ")
df3['Title'] = df3['Title'].str.lower().str.replace('google play', '', regex=False)
df3['Title'] = df3['Title'].str.replace('apps on', '', regex=False)
df3['Title'] = df3['Title'].str.replace('apps bei', '', regex=False)
df3['Title'] = df3['Title'].str.strip()

# Disabled fuzzy-matching experiment (Levenshtein distance over all title
# pairs), kept for reference:
#df4 = pd.DataFrame(product(df2['Title'], df3['Title']), columns=['Apple', 'Google'])
#df4['Distance'] = df4.apply(lambda x: levenshtein.distance(x['Apple'], x['Google']), axis=1)
#df4.sort_values(by='Distance', inplace=True)
#df4.drop(df4[df4.Distance > 1].index, inplace=True) #any distance above 1 is deleted

# Join the datasets and identify apps available in both stores.
# ignore_index gives the combined frame a unique index.
df2 = pd.concat([df2, df3], ignore_index=True)

# both_stores is 1 when the cleaned title occurs in both stores and 0
# otherwise. Counting distinct stores (rather than rows) per title keeps two
# entries from the same store with the same title from being reported as a
# cross-store match. Rows without a title are not grouped and get 0.
stores_per_title = df2.groupby('Title')['store'].transform('nunique')
df2['both_stores'] = (stores_per_title - 1).fillna(0).astype(int)
df2.sort_values(by=['both_stores', 'Title'], ascending=False, inplace=True)

df2.to_csv(os.path.join(final_dir, "google_apple_matches.csv"), index=False)
num_rows_uni = len(df2.index)
print("3) Created CSV with both datasets: " + str(num_rows_uni) + " rows.")

# Count each matched title once, however many rows carry it.
num_both_stores = df2.loc[df2['both_stores'] > 0, 'Title'].nunique()
print("4) Number of apps in both stores: " + str(num_both_stores))


1) Reading Apple App Store unique entries ... 58 rows
2) Reading Google Play Store unique entries ... 41 rows
3) Created CSV with both datasets: 99 rows.
4) Number of apps in both stores: 14
