In [16]:
!mkdir Dataset
!wget https://raw.githubusercontent.com/agtmwebtoon/automl-data-science/main/Dataset/sampled_dataset.csv -P Dataset

mkdir: Dataset: File exists
--2022-06-17 23:21:57--  https://raw.githubusercontent.com/agtmwebtoon/automl-data-science/main/Dataset/sampled_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6109441 (5.8M) [text/plain]
Saving to: ‘Dataset/sampled_dataset.csv.1’


2022-06-17 23:21:59 (11.7 MB/s) - ‘Dataset/sampled_dataset.csv.1’ saved [6109441/6109441]



In [1]:
import pandas as pd
import numpy as np
import featuretools as ft
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import itertools
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler, MinMaxScaler
from sklearn.feature_selection import  r_regression, SelectKBest
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

from woodwork.logical_types import Categorical, Boolean
from math import  ceil, log10, sqrt
'''
Add plt config
@Author: MinHyung Lee
@Since: 2022/05/24

'''

plt.style.use('ggplot')
rcParams['axes.spines.right'] = False
rcParams['axes.spines.top'] = False
rcParams['figure.figsize'] = [12, 9]
rcParams['font.size'] = 16
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
rcParams['figure.dpi'] = 600

pd.set_option('display.max_rows', 20)

In [10]:
class AutoML:

    '''
    AutoML
    @Author: MinHyung Lee
    @Since: 2022/06/02
    Find best parameter automatically
    Just decide which columns is unusable
    Find columns that consisted of String

    and call

    model.feed_input(raw_df, do_sampling=True, sample_size=20000)
    model.feature_cleaning(unusable_col, string_col)
    model.find_best_combination()


    Finally It will search best parameter for you
    '''


    def __init__(self):


        self.df = None
        self.X = None
        self.y = None
        self.best_param = None
        self.model = None
        self.score = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None

    def feed_input(self, dataset, do_sampling, sample_size):
        '''
        feed_input
        @Author: MinHyung Lee
        @Since: 2022/06/02
        @dataset: raw dataframe for machine learning
        @do_sampling: Flag that sampling the dataset
        @sample_size: Sample size for random sampling
        Set object's dataset
        Also can random sampling by do_sampling option

        '''


        self.df = dataset

        if do_sampling:
            self.sampling(sample_size)

    def feature_cleaning(self, unused_column=[], string_column=[]):

        '''
        feature_cleaning
        @Author: MinHyung Lee
        @Since: 2022/06/02
        @unused_column: Features considered useless <list>
        @string_column: Features consisted of String <list>
        Drop unused column
        Encode string value using ordinalEncoder
        Find Na value and fill it my df.mean

        '''


        self.df.drop(unused_column, axis=1, inplace=True)

        # Encode string values
        encoding = OrdinalEncoder()
        encoding.fit(self.df[string_column])
        self.df[string_column] = encoding.transform(self.df[string_column])
        self.df.fillna(self.df.mean(), inplace=True)

    def split_dataset(self, target):

        '''
        split_dataset
        @Author: MinHyung Lee
        @Since: 2022/06/04
        @target: Dataset <DataFrame>

        '''

        self.y = self.df[[target]].to_numpy()
        self.X = self.df.drop([target], axis = 1)

    def find_best_combination(self):

        # Do feature selection (k = 1 .. size of feature)
        # Do normalization (StandardScaler, RobustScaler, MinMaxScaler)
        # Do PCA (explained_variance = 0..1)
        # find best parameter by using different parameter

        '''
        find_best_combination
        @Author: MinHyung Lee
        @Since: 2022/06/02
        find best parameter and set best model
        '''


        self.split_dataset("Maximum Installs")
        param_k = np.arange(1, self.X.shape[1])
        param_scale_method = ['s', 'r', 'm']
        param_explained_var = np.linspace(0.01, 0.9999, 100)
        param_model_type = ['l', 'k']
        result = []
        param_list = list(itertools.product(param_k, param_scale_method, param_explained_var, param_model_type))

        for (k, scale_method, explained_var, param_model_type) in param_list:
            X = self.feature_selection(k)
            X = self.scaler(X, scale_method=scale_method)
            X = self.pca(X, explained_var=explained_var)

            X_train, X_test, y_train,  y_test = train_test_split(X, self.y, test_size = 0.2, random_state=7777)

            mt = self.model_type(param_model_type)
            _, score = mt(X_train, X_test, y_train, y_test)
            result.append(score)

        idx = np.argmin(result)
        self.score = np.min(result)
        self.best_param = param_list[idx]
        self.set_model(self.best_param)

    def set_model(self, best_param):

        '''
        set_model
        @Author: MinHyung Lee
        @Since: 2022/06/02
        @best_param: parameter set calculated by find_best_combination <list>
        Set best model by using best parameter

        '''


        (k, scale_method, explained_var, model_type) = best_param
        X = self.feature_selection(k)
        X = self.scaler(X, scale_method=scale_method)
        X = self.pca(X, explained_var=explained_var)
        self.X_train, self.X_test, self.y_train,  self.y_test = train_test_split(X, self.y, test_size = 0.2, random_state=7777)
        mt = self.model_type(model_type)
        reg, _ = mt(self.X_train, self.X_test, self.y_train, self.y_test)
        self.model = reg


    def sampling(self, sample_size=20000):
        # Do random sampling
        self.df = self.df.sample(n=sample_size, replace=True)

    def feature_selection(self, k=5):

        # Select best feature using selectKBest and scoring option is r_regression
        select = SelectKBest(score_func=r_regression, k = k)
        ret  = select.fit_transform(self.X, self.y)

        return ret

    def scaler(self, X, scale_method='s'):

        '''
        scaler
        @Author: MinHyung Lee
        @Since: 2022/06/02
        @scale_method:
        s for standard scaler
        r for robustscaler
        m for minmaxscaler
        Return Normalized feature

        '''


        if scale_method == 's':
            ret = StandardScaler().fit_transform(X)
        elif scale_method == 'r':
            ret = RobustScaler().fit_transform(X)
        elif scale_method == 'm':
            ret = MinMaxScaler().fit_transform(X)

        return ret

    def pca(self, X, explained_var=0.9):

        '''
        pca
        @Author: MinHyung Lee
        @Since: 2022/06/02
        Calculate PCA
        :return Feature reduction by PCA

        '''


        # Set variance ratio to 0.9
        pca = PCA(explained_var)
        ret = pca.fit_transform(X)

        return ret

    def model_type(self, train_method):
        '''
        model_type
        @Author: MinHyung Lee
        @Since: 2022/06/02
        @train_method:
        l for linear_regression
        k for KNN_regression
        Return train method

        '''

        if train_method == 'l':
            return self.linear_regression
        elif train_method == 'k':
            return self.KNN


    def make_subplot_layout(self, col_num=3):

        '''
        make_subplot_layout
        @Author: MinHyung Lee
        @Since: 2022/06/02
        Plot each features by subplots

        '''

        k = len(self.df.columns)
        row_num = ceil(k/col_num)
        for i in range(k) :
            plt.subplot(row_num,col_num,i+1)
            plt.hist(self.df.iloc[:,i])
            plt.title(self.df.columns[i])
        plt.tight_layout()
        plt.show()

    def linear_regression(self, X_train, X_test, y_train, y_test):

        '''
        linear_regression
        @Author: MinHyung Lee
        @Since: 2022/06/02
        Calculate MSE
        :return
        model: linearRegreesion model
        score: logscale MSE
        '''


        reg = LinearRegression()
        reg.fit(X_train, y_train)
        mean_error = 0
        #Set Kfold for k == 5
        cv = KFold(n_splits=5)

        fold = 0
        for train_index, test_index in cv.split(X_test):

            '''

            @Author: MinHyung Lee
            @Since: 2022/05/31
            Validate each fold by using linear regression

            '''

            fold += 1
            train_X, test_X = X_train[train_index], X_test[test_index]
            train_y, test_y = y_train[train_index], y_test[test_index]

            reg.fit(train_X, train_y)

            # Calculate mean error
            mean_error += log10(mean_squared_error(y_test, reg.predict(X_test)))

        return reg, mean_error / 5

    def KNN(self, X_train, X_test, y_train, y_test):

        '''
        KNN
        @Author: MinHyung Lee
        @Since: 2022/06/02
        Find best neighbors by using GridSearch
        Scoring method is negative MAE

        :return
        bestmodel // knn model
        score // log scaled MSE

        '''

        knn = KNeighborsRegressor()
        param_grid = dict(n_neighbors = np.arange(1, 30))

        #Hyperparameter tuning by grid searching
        grid_search = GridSearchCV(knn, param_grid, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        optimal_knn = grid_search.best_estimator_

        return optimal_knn, log10(mean_squared_error(y_test, optimal_knn.predict(X_test)))


In [11]:
raw_df = pd.read_csv('Dataset/sampled_dataset.csv', index_col=[0])

In [12]:
model = AutoML()

In [13]:
#Declare unusable columns for drop
#Declare columns that consist of string to encoding

string_col = ['Category', 'Minimum Android', 'Content Rating', 'Ad Supported', 'In App Purchases', 'Installs']
unusable_col = ['App Name', 'App Id', 'Minimum Installs', 'Price', 'Currency', 'Developer Id', 'Developer Website', 'Developer Email', 'Privacy Policy', 'Last Updated', 'Editors Choice', 'Scraped Time', 'Free', 'Rating Count', 'Rating', 'Size', 'Released']


In [14]:
# Feed raw dataset
# You can also random sampling
model.feed_input(raw_df, do_sampling=True, sample_size=20000)
model.feature_cleaning(unusable_col, string_col)

# Do feature selection (k = 1 .. size of feature)
# Do normalization (StandardScaler, RobustScaler, MinMaxScaler)
# Do PCA (explained_variance = 0..1)
# find best parameter by using different parameter

model.find_best_combination()

In [15]:
model.best_param
model.score

12.220636913574868