# **Problem 3: Google Playstore Apps Dataset**

In [502]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import re

In [503]:
# Read the workbook into `df`, the frame every later cell works on.
file_in = 'GooglePlaystore.xlsx'
df = pd.read_excel(io=file_in)
# uncomment the line below to visualize the result
# display(df)

### **Preprocessing**

In [504]:
# 1) Drop the outlier row whose 'Reviews' entry is the literal string '3.0M';
#    every other row is kept.
df = df.loc[df['Reviews'].ne('3.0M')]
# uncomment the line below to visualize the result
# display(df)

In [505]:
# 2) Remove any row with any column that has "Varies with device"
# Compare the whole frame against the placeholder in one vectorized pass
# (rather than a Python callback per row), then keep the rows with no match.
has_placeholder = df.eq('Varies with device').any(axis=1)
df = df[~has_placeholder]
# uncomment the line below to visualize the result
# display(df)

In [506]:
# 3) Android version reformatting
# Keep the first "<digits>.<digits>" token found in each entry and store it as
# a float; entries with no such token become NaN.
regex = r'(\d+\.\d+)'
df['Android Ver'] = (
    df['Android Ver']
    .str.extract(regex, expand=False)  # expand=False -> a Series, not a 1-column frame
    .astype(float)
)
# uncomment the line below to visualize the result
# display(df)

In [507]:
# 4) Reformat installs
# Strip every ',' and '+' from the install labels, then cast to int.
# The pattern is given as an explicit regex character class: a bare '+' is a
# regex quantifier, and the default of `regex` in Series.str.replace differs
# between pandas versions, so relying on that default is fragile.
df['Installs'] = (
    df['Installs']
    .str.replace(r'[,+]', '', regex=True)
    .astype(int)
)
# uncomment the line below to visualize the result
# display(df)

In [508]:
# 5) Fill missing ratings
df['Reviews'] = df['Reviews'].astype(int)

# Drop rows that have no rating AND fewer than 100 reviews AND fewer than
# 50,000 installs; all other rows (rated or not) are kept.
df = df[
    ~(
        (df['Reviews'] < 100)
        & (df['Installs'] < 50000)
        & (df['Rating'].isna())
    )
]

# Mean rating per category, computed from the ratings that are present.
average_rating = df.groupby('Category')['Rating'].mean()

# Fill each remaining missing rating with its category's mean in one
# vectorized step (instead of rebuilding a full-frame mask per category).
# Rows whose category has no mean available stay NaN.
df['Rating'] = df['Rating'].fillna(df['Category'].map(average_rating))
# uncomment the line below to visualize the result
# display(df)

In [510]:
# 6) preprocess size column
def process(size_in):
    """Convert a raw size label into a plain number.

    Parameters
    ----------
    size_in : str or scalar
        A size label such as ``'19M'`` or ``'582k'``, a missing value, or any
        other value.

    Returns
    -------
    int, float, or the input unchanged
        * ``np.nan`` when ``size_in`` is missing (``pd.isnull`` is true);
        * an ``int`` equal to the numeric part times 1e6 for labels ending in
          ``'M'`` and times 1e3 for labels ending in ``'k'`` (presumably
          mega-/kilobytes converted to bytes -- confirm against the dataset);
        * ``size_in`` itself for anything else (non-strings, other suffixes).

    Raises
    ------
    ValueError
        If the text before an ``'M'``/``'k'`` suffix is not a valid float.
    """
    if pd.isnull(size_in):
        # np.nan is the missing-value marker; numpy has no `np.na`.
        return np.nan
    if not isinstance(size_in, str):
        # Non-text values have no suffix to parse; pass them through like
        # any other unrecognised input instead of failing on `.endswith`.
        return size_in
    if size_in.endswith('M'):
        # round() before int(): the float product can land a hair below the
        # intended integer, and bare int() would truncate it one too low.
        return int(round(float(size_in[:-1]) * 1e6))
    if size_in.endswith('k'):
        return int(round(float(size_in[:-1]) * 1e3))
    return size_in
    
# Run every entry of the 'Size' column through process() and show the
# resulting frame.
df['Size'] = df['Size'].map(process)
display(df)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19e+6,10000,Free,0,Everyone,Art & Design,2018-01-07 00:00:00,1.0.0,4.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14e+6,500000,Free,0,Everyone,Art & Design;Pretend Play,2018-01-15 00:00:00,2.0.0,4.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7e+6,5000000,Free,0,Everyone,Art & Design,2018-08-01 00:00:00,1.2.4,4.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8e+6,100000,Free,0,Everyone,Art & Design;Creativity,2018-06-20 00:00:00,1.1,4.4
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6e+6,50000,Free,0,Everyone,Art & Design,2017-03-26 00:00:00,1,2.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10832,FR Tides,WEATHER,3.8,1195,582e+3,100000,Free,0,Everyone,Weather,2014-02-16 00:00:00,6,2.1
10833,Chemin (fr),BOOKS_AND_REFERENCE,4.8,44,619e+3,1000,Free,0,Everyone,Books & Reference,2014-03-23 00:00:00,0.8,2.2
10834,FR Calculator,FAMILY,4.0,7,2.6e+6,500,Free,0,Everyone,Education,2017-06-18 00:00:00,1.0.0,4.1
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53e+6,5000,Free,0,Everyone,Education,2017-07-25 00:00:00,1.48,4.1
