In [1]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from mcar import mcar_test
from preprocess import preprocess

In [3]:
df = pd.read_csv('../data/googleplaystore.csv')
df2 = pd.read_csv('../data/googleplaystore_user_reviews.csv')
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [4]:
df_temp = preprocess(df,df2)
mcar_test(df_temp[['Rating','Size']])

0.0

In [16]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from sklearn.model_selection import ParameterGrid
cont_frs = ['Reviews','Size','Price','Last Updated','Num_of_Characters','Rating']
cat_frs_onehot = ['Category','Current_Ver_truncated','Android_Ver_truncated']
cat_frs_ordinal = ['Type','Content Rating']

In [6]:
X = df_temp.drop(['Installs'], axis=1)
y = df_temp['Installs']
print(y.value_counts()/len(y))
classes, counts = np.unique(y,return_counts=True)
print('balance:',np.max(counts/len(y)))

0.0    0.324992
1.0    0.270293
3.0    0.205770
2.0    0.198945
Name: Installs, dtype: float64
balance: 0.32499224485575434


In [9]:
def preprocess_with_impute(X,y,random_state,n_folds):
    '''
    
    '''
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    kf = StratifiedKFold(n_splits=5,shuffle=True,random_state=random_state)
    imputer = IterativeImputer(estimator = RandomForestRegressor(),random_state=random_state)
    standard_transformer = Pipeline(steps=[('imputer', imputer),('standard', StandardScaler())])
    onehot_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])
    ordinal_transformer1 = Pipeline(steps=[('ordinal1', OrdinalEncoder(categories = [['Free','Paid']]))])
    ordinal_transformer2 = Pipeline(steps=[('ordinal2', OrdinalEncoder(categories = [['Everyone', 'Everyone 10+', 'Teen', 'Mature 17+', 'Adults only 18+','Unrated']]))])
    preprocessor = ColumnTransformer(
    transformers=[
        ('standard', standard_transformer, cont_frs),
        ('ordinal1',ordinal_transformer1,['Type']),
        ('ordinal2',ordinal_transformer2,['Content Rating']),
        ('onehot', onehot_transformer, cat_frs_onehot)])
    return X_other, X_test, y_other, y_test, kf, preprocessor

def ML_pipeline_rf_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state):
    rf = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', RandomForestClassifier())])
    param_grid = { 
    'classifier__max_features': ['auto'],
    'classifier__max_depth' : range(30,65,5),
    'classifier__min_samples_split' : range(2,6),
    'classifier__n_estimators' : [100],
    'classifier__random_state' : [random_state]}
    grid = GridSearchCV(rf, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                        cv=kf, return_train_score = True,iid=True)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

In [10]:
test_scores_rf = []
for i in range(8):
    random_state = (i + 1) * 42
    X_other, X_test, y_other, y_test, kf, preprocessor = preprocess_with_impute(X,y,random_state = random_state,n_folds=5)
    grid, test_score = ML_pipeline_rf_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state)
    print(grid.best_params_)
    print('test score:',test_score)
    test_scores_rf.append(test_score)
print('test accuracy:',np.around(np.mean(test_scores_rf),3),'+/-',np.around(np.std(test_scores_rf),3))

{'classifier__max_depth': 35, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 4, 'classifier__n_estimators': 100, 'classifier__random_state': 42}
test score: 0.8408268733850129
{'classifier__max_depth': 50, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 3, 'classifier__n_estimators': 100, 'classifier__random_state': 84}
test score: 0.8501291989664083
{'classifier__max_depth': 60, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100, 'classifier__random_state': 126}
test score: 0.841343669250646
{'classifier__max_depth': 45, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 4, 'classifier__n_estimators': 100, 'classifier__random_state': 168}
test score: 0.8434108527131783
{'classifier__max_depth': 50, 'classifier__max_features': 'auto', 'classifier__min_samples_split': 4, 'classifier__n_estimators': 100, 'classifier__random_state': 210}
test score: 0.8366925064599483
{'classi

In [11]:
def ML_pipeline_svc_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state):
    estimators = Pipeline([('reduce_dim', PCA()), ('clf', SVC())])
    svc = Pipeline(steps=[('preprocessor', preprocessor),
                          ('classifier', estimators)])
    Cs = np.logspace(3,7,5)
    γs = np.logspace(-4,2,3)
    param_grid = { 
    'classifier__reduce_dim__n_components': [25],
    'classifier__clf__C': Cs,
    'classifier__clf__gamma' : γs}
    grid = GridSearchCV(svc, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                        cv=kf, return_train_score = True,iid=True,n_jobs=-1)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

In [12]:
test_scores_svc = []
for i in range(5):
    random_state = (i + 1) * 42
    X_other, X_test, y_other, y_test, kf, preprocessor = preprocess_with_impute(X,y,random_state = random_state,n_folds=4)
    grid, test_score = ML_pipeline_svc_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state)
    print(grid.best_params_)
    print('test score:',test_score)
    test_scores_svc.append(test_score)
print('test accuracy:',np.around(np.mean(test_scores_svc),3),'+/-',np.around(np.std(test_scores_svc),3))

{'classifier__clf__C': 1000000.0, 'classifier__clf__gamma': 0.0001, 'classifier__reduce_dim__n_components': 25}
test score: 0.661498708010336
{'classifier__clf__C': 1000000.0, 'classifier__clf__gamma': 0.0001, 'classifier__reduce_dim__n_components': 25}
test score: 0.6723514211886304
{'classifier__clf__C': 1000000.0, 'classifier__clf__gamma': 0.0001, 'classifier__reduce_dim__n_components': 25}
test score: 0.6992248062015504
{'classifier__clf__C': 10000000.0, 'classifier__clf__gamma': 0.0001, 'classifier__reduce_dim__n_components': 25}
test score: 0.6589147286821705
{'classifier__clf__C': 10000000.0, 'classifier__clf__gamma': 0.0001, 'classifier__reduce_dim__n_components': 25}
test score: 0.6702842377260982
test accuracy: 0.672 +/- 0.014


In [13]:
def ML_pipeline_knn_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state):
    knn = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', KNeighborsClassifier())])
    param_grid = {'classifier__n_neighbors': [20,25,30,35,40,45]}
    grid = GridSearchCV(knn, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                    cv=kf, return_train_score = True,iid=True,n_jobs=-1)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

In [None]:
test_scores_knn = []
for i in range(8):
    random_state = (i + 1) * 42
    X_other, X_test, y_other, y_test, kf, preprocessor = preprocess_with_impute(X,y,random_state = random_state,n_folds=5)
    grid, test_score = ML_pipeline_knn_GridSearchCV(X_other, X_test, y_other, y_test, kf, preprocessor, random_state)
    print(grid.best_params_)
    print('test score:',test_score)
    test_scores_knn.append(test_score)
print('test accuracy:',np.around(np.mean(test_scores_knn),3),'+/-',np.around(np.std(test_scores_knn),3))

In [14]:
def ML_pipeline_xgb_GridSearchCV(X, y, random_state, n_folds):
    X_other, X_test, y_other, y_test = train_test_split(X, y, test_size=0.2, random_state = random_state,stratify=y)
    kf = StratifiedKFold(n_splits=5,shuffle=True,random_state = random_state)
    standard_transformer = Pipeline(steps=[('standard', StandardScaler())])
    onehot_transformer = Pipeline(steps=[('onehot', OneHotEncoder(sparse=False,handle_unknown='ignore'))])
    ordinal_transformer1 = Pipeline(steps=[('ordinal1', OrdinalEncoder(categories = [['Free','Paid']]))])
    ordinal_transformer2 = Pipeline(steps=[('ordinal2', OrdinalEncoder(categories = [['Everyone', 'Everyone 10+', 'Teen', 'Mature 17+', 'Adults only 18+','Unrated']]))])                              
    XGB = xgboost.XGBClassifier()
    preprocessor = ColumnTransformer(
    transformers=[
        ('standard', standard_transformer, cont_frs),
        ('ordinal1',ordinal_transformer1,['Type']),
        ('ordinal2',ordinal_transformer2,['Content Rating']),
        ('onehot', onehot_transformer, cat_frs_onehot)])

    param_grid = {'classifier__learning_rate': [0.03],
              'classifier__n_estimators': [100],
              'classifier__random_state': [random_state],
              'classifier__missing': [np.nan], 
              'classifier__max_depth': [20,30,40,50],
              'classifier__colsample_bytree': [0.75],              
              'classifier__subsample': [0.66]}

    xgb = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', XGB)])

    grid = GridSearchCV(xgb, param_grid=param_grid,scoring = make_scorer(accuracy_score),
                    cv=kf, return_train_score = True,iid=True,n_jobs=-1)
    grid.fit(X_other, y_other)
    return grid, grid.score(X_test, y_test)

In [17]:
test_scores_xgb = []
for i in range(8):
    random_state = (i + 1) * 42
    grid, test_score = ML_pipeline_xgb_GridSearchCV(X, y, random_state, 5)
    print(grid.best_params_)
    print('test score:',test_score)
    test_scores_xgb.append(test_score)
print('test accuracy:',np.around(np.mean(test_scores_xgb),3),'+/-',np.around(np.std(test_scores_xgb),3))

{'classifier__colsample_bytree': 0.75, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 50, 'classifier__missing': nan, 'classifier__n_estimators': 100, 'classifier__random_state': 42, 'classifier__subsample': 0.66}
test score: 0.8542635658914729
{'classifier__colsample_bytree': 0.75, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 30, 'classifier__missing': nan, 'classifier__n_estimators': 100, 'classifier__random_state': 84, 'classifier__subsample': 0.66}
test score: 0.8630490956072352
{'classifier__colsample_bytree': 0.75, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 20, 'classifier__missing': nan, 'classifier__n_estimators': 100, 'classifier__random_state': 126, 'classifier__subsample': 0.66}
test score: 0.8583979328165374
{'classifier__colsample_bytree': 0.75, 'classifier__learning_rate': 0.03, 'classifier__max_depth': 40, 'classifier__missing': nan, 'classifier__n_estimators': 100, 'classifier__random_state': 168, 'classifier__subsample': 0.6