In [131]:
import pandas as pd
import numpy as np
import featuretools as ft
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.feature_selection import  r_regression, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from woodwork.logical_types import Categorical, Boolean
from math import  ceil, log10
'''
Add plt config
@Author: MinHyung Lee
@Since: 2022/05/24

'''

plt.style.use('ggplot')
rcParams['axes.spines.right'] = False
rcParams['axes.spines.top'] = False
rcParams['figure.figsize'] = [12, 9]
rcParams['font.size'] = 16
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
rcParams['figure.dpi'] = 600

pd.set_option('display.max_rows', 20)

In [140]:
class AutoML:
    def __init__(self, dataset, do_sampling):
        self.df = dataset
        self.do_sampling = do_sampling
        self.X = None
        self.y = None

    def sampling(self, sample_size=20000):
        if self.do_sampling:
            self.df = self.df.sample(n=sample_size, replace=True)

    def value_change(self, unused_column=[], string_column=[]):
        self.df.drop(unused_column, axis=1, inplace=True)

        # Encode string values
        encoding = OrdinalEncoder()
        encoding.fit(self.df[string_column])
        self.df[string_column] = encoding.transform(self.df[string_column])
        self.df.fillna(self.df.mean(), inplace=True)

    def export_to_csv(self, name):
        self.df.to_csv(name + ".csv")

    def corr_check(self):
        f,ax = plt.subplots(figsize=(12, 12))
        sns.heatmap(self.df.corr(), annot=True, linewidths=.5, fmt= '.1f',ax=ax)
        plt.show()

    def feature_selection(self, target, k=5):
        X = self.df.drop([target], axis = 1)
        self.y = self.df[[target]].to_numpy()


        # Select best feature using selectKBest and scoring option is r_regression
        select = SelectKBest(score_func=r_regression, k = k)
        self.X = select.fit_transform(X, self.y)

    def PCA(self, explained_var=0.9):

        self.X = StandardScaler().fit_transform(self.X)

        # Set variance ratio to 0.9
        pca = PCA(explained_var)
        self.X = pca.fit_transform(self.X)

    def make_subplot_layout(self, col_num=3):

        k = len(self.df.columns)
        row_num = ceil(k/col_num)
        for i in range(k) :
            plt.subplot(row_num,col_num,i+1)
            plt.hist(self.df.iloc[:,i])
            plt.title(self.df.columns[i])
        plt.tight_layout()
        plt.show()

    def linear_regression(self):
        X_train, X_test, y_train,  y_test = train_test_split(self.X, self.y, test_size = 0.2, random_state=7777)

        reg = LinearRegression()
        reg.fit(X_train, y_train)

        #Set Kfold for k == 10
        cv = KFold(n_splits=10)

        fold = 0
        for train_index, test_index in cv.split(X_test):

            '''

            @Author: MinHyung Lee
            @Since: 2022/05/31
            Validate each fold by using linear regression

            '''


            fold += 1
            train_X, test_X = X_train[train_index], X_test[test_index]
            train_y, test_y = y_train[train_index], y_test[test_index]

            reg.fit(train_X, train_y)

            print(f'Result of {fold} fold')
            print(f'Result => {mean_squared_error(y_test, reg.predict(X_test))}')


In [141]:
raw_df = pd.read_csv('Dataset/Google-Playstore(new).csv', index_col=[0])

In [142]:
model = AutoML(raw_df, do_sampling=True)

model.sampling(sample_size=20000)


In [143]:
string_col = ['Category', 'Minimum Android', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Installs']
unusable_col = ['App Name', 'App Id', 'Minimum Installs', 'Price', 'Currency', 'Developer Id', 'Developer Website', 'Developer Email', 'Privacy Policy', 'Last Updated', 'Editors Choice', 'Scraped Time', 'Free', 'Rating Count', 'Rating', 'Size', 'Released']

In [144]:
model.value_change(string_column=string_col, unused_column=unusable_col)

In [145]:
model.feature_selection(target="Maximum Installs")

In [146]:
model.PCA()

In [147]:
model.linear_regression()

Result of 1 fold
Result => 1886722380766.5764
Result of 2 fold
Result => 1866122986943.1355
Result of 3 fold
Result => 1800899817810.3965
Result of 4 fold
Result => 1884066032912.213
Result of 5 fold
Result => 1877116789385.685
Result of 6 fold
Result => 1852631898298.3574
Result of 7 fold
Result => 1866966584546.222
Result of 8 fold
Result => 1877171485390.5747
Result of 9 fold
Result => 1890103471709.827
Result of 10 fold
Result => 1877309789835.266
