In [20]:
'''
Library declare
@Author: MinHyung Lee
@Since: 2022/05/22
Used library:
pandas
numpy
featuretools
matplot
sklearn

'''

import pandas as pd
import numpy as np
import featuretools as ft
import matplotlib.pyplot as plt

from woodwork.logical_types import Categorical, Boolean
from sklearn.preprocessing import OrdinalEncoder
from datetime import datetime


In [6]:


df = pd.read_csv('Google-Playstore(new).csv')

# 필요없는 컬럼 버리기
df = df.drop(['Unnamed: 0', 'App Name', 'App Id', 'Installs', 'Minimum Installs', 'Price', 'Currency', 'Developer Id', 'Developer Website', 'Developer Email', 'Privacy Policy', 'Last Updated', 'Editors Choice', 'Scraped Time', 'Free'], axis=1)

# 숫자로 되어 있지 않은 컬럼 숫자로 인코딩하기
encoding = OrdinalEncoder()
encoding.fit(df[['Category', 'Minimum Android', 'Content Rating', 'Ad Supported', 'In App Purchases']])
df[['Category', 'Minimum Android', 'Content Rating', 'Ad Supported', 'In App Purchases']] = encoding.transform(df[['Category', 'Minimum Android', 'Content Rating', 'Ad Supported', 'In App Purchases']])

# Size 컬럼 인코딩
A = df[df['Size'] == 'Varies with device'].index
df = df.drop(A,axis='index')
df = df.dropna()  # NaN가 포함된 데이터는 모두 drop
# 앱 크기 사이즈 계산
# 1. ','drop
# 2. M과 k drop
# 3. k값 M으로 치환
df['Size'] =  np.where(df['Size'].str.contains('M'), df['Size'].str.replace(',','').str[:-1].astype('float64') , df['Size'].str.replace(',','').str[:-1].astype('float64')/1000)

# 날짜 월만 남기고 버리고 인코딩
df['Released'] = pd.to_datetime(df['Released'])
df['Released'].dt.strftime('%b %d, %Y')
df['Released'] = df['Released'].dt.month

df.fillna(df.mean(), inplace=True)

df

Unnamed: 0,Category,Rating,Rating Count,Maximum Installs,Size,Minimum Android,Released,Content Rating,Ad Supported,In App Purchases
0,26.0,4.9,20.0,552,5.8,27.0,3,0.0,1.0,0.0
1,31.0,4.6,719.0,118989,12.0,21.0,3,0.0,1.0,0.0
2,24.0,0.0,0.0,85,8.9,22.0,12,0.0,0.0,0.0
3,11.0,0.0,0.0,723,27.0,27.0,4,0.0,1.0,0.0
4,33.0,0.0,0.0,153,6.0,27.0,11,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
19995,17.0,4.0,5.0,335,6.9,25.0,8,0.0,1.0,0.0
19996,13.0,0.0,0.0,900,8.9,21.0,8,0.0,0.0,0.0
19997,20.0,0.0,0.0,30,2.0,19.0,1,0.0,0.0,0.0
19998,2.0,0.0,0.0,10,32.0,27.0,9,0.0,1.0,0.0


In [5]:
es = ft.EntitySet(id='Store')
df["index"] = df.index

In [24]:
es.add_dataframe(dataframe_name="store_log",
                 dataframe=df,
                 index="index",
                 logical_types= {
                     "Ad Supported": Boolean,
                     "In App Purchases": Boolean
                 })



Entityset: Store
  DataFrames:
    store_log [Rows: 18893, Columns: 11]
  Relationships:
    No relationships

In [25]:
es["store_log"].ww.schema

Unnamed: 0_level_0,Logical Type,Semantic Tag(s)
Column,Unnamed: 1_level_1,Unnamed: 2_level_1
Category,Double,['numeric']
Rating,Double,['numeric']
Rating Count,Double,['numeric']
Maximum Installs,Integer,['numeric']
Size,Double,['numeric']
Minimum Android,Double,['numeric']
Released,Integer,['numeric']
Content Rating,Double,['numeric']
Ad Supported,Categorical,['category']
In App Purchases,Categorical,['category']


In [26]:
features, feature_names = ft.dfs(entityset=es,
                                 target_dataframe_name='store_log',
                                 agg_primitives=['mean', 'max', 'percent_true', 'last'],
                                 trans_primitives=['year', 'month', 'subtract_numeric_scalar','divide_numeric_scalar'])


  trans_primitives: ['month', 'year']
  agg_primitives: ['last', 'max', 'mean', 'percent_true']
This may be caused by a using a value of max_depth that is too small, not setting interesting values, or it may indicate no compatible columns for the primitive were found in the data.


In [28]:
feature_names

[<Feature: Category>,
 <Feature: Rating>,
 <Feature: Rating Count>,
 <Feature: Maximum Installs>,
 <Feature: Size>,
 <Feature: Minimum Android>,
 <Feature: Released>,
 <Feature: Content Rating>,
 <Feature: Ad Supported>,
 <Feature: In App Purchases>,
 <Feature: Category / 1>,
 <Feature: Content Rating / 1>,
 <Feature: Maximum Installs / 1>,
 <Feature: Minimum Android / 1>,
 <Feature: Rating / 1>,
 <Feature: Rating Count / 1>,
 <Feature: Released / 1>,
 <Feature: Size / 1>,
 <Feature: Category - 0>,
 <Feature: Content Rating - 0>,
 <Feature: Maximum Installs - 0>,
 <Feature: Minimum Android - 0>,
 <Feature: Rating - 0>,
 <Feature: Rating Count - 0>,
 <Feature: Released - 0>,
 <Feature: Size - 0>]