In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw Google Play export; the path is relative to the notebook's working directory.
csv_path = 'dataset/GooglePlay_pion.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first rows to check that the CSV parsed into the expected columns.
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
# Look at five random rows for a less ordered view than head().
# random_state pins the draw so a fresh "Restart & Run All" shows the same rows every time.
data.sample(5, random_state=42)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
4825,Empire Z: Endless War,FAMILY,4.2,367951,76M,"5,000,000+",Free,0,Teen,Strategy,"July 16, 2018",2.2.8,4.0.3 and up
6616,BP Scanner Prank,FAMILY,4.1,157,7.0M,"50,000+",Free,0,Everyone,Entertainment,"September 20, 2016",1.2,4.0.3 and up
383,imo free video calls and chat,COMMUNICATION,4.3,4785988,11M,"500,000,000+",Free,0,Everyone,Communication,"June 8, 2018",9.8.000000010501,4.0 and up
5680,Alerte au gogol,FAMILY,,14,2.0M,"1,000+",Free,0,Everyone,Entertainment,"February 9, 2018",1.0.0.0,4.0 and up
4923,AC Remote for Midea - NOW FREE,TOOLS,4.0,448,26M,"100,000+",Free,0,Everyone,Tools,"July 30, 2018",6.1.7,4.2 and up


In [5]:
# Summarise column dtypes and non-null counts to spot missing values and mistyped columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8999 entries, 0 to 8998
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             8999 non-null   object 
 1   Category        8999 non-null   object 
 2   Rating          7957 non-null   float64
 3   Reviews         8999 non-null   int64  
 4   Size            8999 non-null   object 
 5   Installs        8999 non-null   object 
 6   Type            8999 non-null   object 
 7   Price           8999 non-null   object 
 8   Content Rating  8999 non-null   object 
 9   Genres          8999 non-null   object 
 10  Last Updated    8999 non-null   object 
 11  Current Ver     8992 non-null   object 
 12  Android Ver     8997 non-null   object 
dtypes: float64(1), int64(1), object(11)
memory usage: 914.1+ KB


In [7]:
# Frequency tables of the categorical-looking columns, kept as variables so they
# can be browsed in the VS Code variable explorer.
vc_category, vc_genres, vc_current_ver, vc_android_ver = (
    data[column_name].value_counts()
    for column_name in ('Category', 'Genres', 'Current Ver', 'Android Ver')
)

## Quick first observations
* _columnName {importance}{valuetype}{comment/action plan}_
### Columns: 
* app{-1}{__categorical__}{not needed to make a prediction, unless we are trying to figure out a good name for a highly installable app. Candidate for removal.}
* category{4}{__categorical__}{Encode it}
* rating {4}{__numerical__}{has NaN values, but setting them to 0 or any other number might yield wrong information. We could either drop the rows where it is NaN, or predict some of its values}
* size{1}{__categorical__}{candidate to be converted to numerical value, although, it doesn't feel as if it could give a lot of information.}
* installs{__target feature__}
* type {-1}{__categorical__}{can be inferred from the actual price column, candidate for removal}
* price {5}{__categorical__}{needs to be converted to numeric value}
* content rating {1}{__categorical__}{Encode it}
* genres {2}{__categorical__}{this seems to be a mix of base categories and subcategories. For instance, "action" is a category (meaning it can appear alone), but "action & adventure" always comes along with some other category. Explore more to define a strategy; first thoughts: either remove the column (since we have category) or create categories from "subcategories" like "action & adventure" or "creativity"}
* Last Updated {2}{__categorical__}{actually a date value, convert to date}
* Current ver {0}{__numerical__}{the count of "varies with device" is very high, and it normally shouldn't affect the prediction, as the nomenclature might change from developer to developer, candidate for removal}
* Android ver {1}{__categorical__}{actually could be treated as numerical, but according to [this](https://developer.android.com/guide/topics/manifest/uses-sdk-element) some values also convey extra information about whether it is a wearable or not. I am going to assume that _we are not interested in wearables_ => I think it should be safe to remove them. Some others show a range of Android versions => I think it should be safe to remove them. "varies with device" contributes a lot of counts; I think it should be safe to remove them. And, finally, "normalize" the data by versions. This can also lead to an analysis where it is treated either as a number or as a category, hence using one-hot encoding}

### Data prep: 1st round 

In [18]:
# Working dataset for the first cleaning round. `data` itself is left untouched:
# drop() and assign() both return new frames.
#   * App, Type and Current Ver are removed (see the column notes above the cell).
#   * Last Updated is parsed from text into datetime64 values.
pion = (
    data
    .drop(columns=["App", "Type", "Current Ver"])
    .assign(**{'Last Updated': lambda frame: pd.to_datetime(frame['Last Updated'])})
)

#### Column: Android Ver

In [128]:
# Keep only rows whose minimum Android version is one plain version number.
# Removed: wearable builds (contain 'W'), 'Varies with device', and version
# ranges (contain '-'). Rows with a missing 'Android Ver' are kept (na=False).
unsupported_version = pion['Android Ver'].str.contains('W|Varies|-', na=False)
pion = pion.loc[~unsupported_version].copy()

# Normalise the remaining labels to "major.minor" text.
# regex=False is explicit on every call: the search strings contain '.', which
# would act as a wildcard if treated as a regular expression, and the default
# of `regex` differs between pandas 1.x and 2.x.
android_ver_text = (
    pion['Android Ver']
    .str.replace('and up', '', regex=False)   # remove the 'and up' suffix
    .str.strip()                              # drop the space left before the suffix
    .str.replace('2.0.1', '2.0', regex=False) # approximate patch versions
    .str.replace('2.3.3', '2.3', regex=False)
    .str.replace('4.0.3', '4.0', regex=False)
)

# Convert to number; any label that is still not numeric raises here instead of
# slipping through silently.
pion['Android Ver'] = pd.to_numeric(android_ver_text)

#### Column: Price

In [135]:
# Strip the '$' sign and any ',' separators from the price text, then parse as float.
# The pattern is a raw string: '\$' inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
pion['Price'] = pion['Price'].replace(r'[\$,]', '', regex=True).astype(float)

In [136]:
# Re-check dtypes and non-null counts after the first cleaning round.
pion.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7728 entries, 0 to 8998
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Category        7728 non-null   object        
 1   Rating          6721 non-null   float64       
 2   Reviews         7728 non-null   int64         
 3   Size            7728 non-null   object        
 4   Installs        7728 non-null   object        
 5   Price           7728 non-null   float64       
 6   Content Rating  7728 non-null   object        
 7   Genres          7728 non-null   object        
 8   Last Updated    7728 non-null   datetime64[ns]
 9   Android Ver     7726 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(5)
memory usage: 984.1+ KB
