The aim of the project is to identify the accuracy of R2 score for different tree-based models, covering:
- Decision Tree
- Random Forest
- XG Boost
- LightGBM

Meanwhile, the cross validation will be also adopted for model evaluation

The use of data cleaning is firstly adopted for the input of regression model. 



In [11]:
import re
import sys
import time
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import numpy as np
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn import linear_model
import statsmodels.api as sm
import sklearn.model_selection as ms
from sklearn import neighbors
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn.neighbors import KDTree
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,cross_val_score, ShuffleSplit
from sklearn.model_selection import StratifiedKFold,KFold,GridSearchCV,RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import r2_score,mean_squared_error,confusion_matrix

from xgboost import XGBRegressor 
from lightgbm import LGBMRegressor 

from tensorflow import keras
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense

database = pd.read_csv(r"../input/google-play-store-apps/googleplaystore.csv")# store wine type as an attribute



#############################################################  
######Data Cleaning

i = database[database['Category'] == '1.9'].index
database.loc[i]
database = database.drop(i)

database = database[pd.notnull(database['Last Updated'])]
database = database[pd.notnull(database['Content Rating'])]


CategoryList = database['Category'].unique().tolist() 
CategoryList = ['cat_' + word for word in CategoryList]
database = pd.concat([database, pd.get_dummies(database['Category'], prefix='cat')], axis=1)


database['Rating'] = database['Rating'].fillna(database['Rating'].median())
database['Installs'] = database['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
database['Type'] = pd.get_dummies(database['Type'])
database['Price'] = database['Price'].apply(lambda x : x.strip('$'))
database['Last Updated'] = database['Last Updated'].apply(lambda x : time.mktime(datetime.datetime.strptime(x, '%B %d, %Y').timetuple()))


#######################################################
###### Encoding

LE = preprocessing.LabelEncoder()
database['App'] = LE.fit_transform(database['App'])
database['Genres'] = LE.fit_transform(database['Genres'])
database['Content Rating'] = LE.fit_transform(database['Content Rating'])


########################################################
###### Size


k_indices = database['Size'].loc[database['Size'].str.contains('k')].index.tolist()
converter = pd.DataFrame(database.loc[k_indices, 'Size'].apply(lambda x: x.strip('k')).astype(float).apply(lambda x: x / 1024).apply(lambda x: round(x, 3)).astype(str))
database.loc[k_indices,'Size'] = converter

database['Size'] = database['Size'].apply(lambda x: x.strip('M'))
database[database['Size'] == 'Varies with device'] = 0
database['Size'] = database['Size'].astype(float)


After the shuffled the database, the features of the dataset are selected and listed below:

- App 
- Reviews
- Size
- Installs
- Type 
- Price
- Content Rating 
- Genres, 
- Last Updated

The output I want to evaluate is Rating 

In [12]:
########################################################
###### Feature Selection

shuffled_database = database.reindex(np.random.permutation(database.index))


features = ['App', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated']
shuffled_database[features]=shuffled_database[features].astype(float)
X = shuffled_database[features]
y = shuffled_database['Rating']


#######################################################
##### Train Test Split


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state=10)


In [13]:
###################################################################
### DecisionTreeRegressor

DT_Regression = tree.DecisionTreeRegressor(criterion='mae', max_depth=5, min_samples_leaf=5, random_state=42)
DT_Regression.fit(X_train,y_train)
y_DT_pred=DT_Regression.predict(X_test)
DT_Regression_score=DT_Regression.score(X_test,y_test)


print("with train test split_DecisionTreeRegression", DT_Regression_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_DT_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_DT_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_DT_pred)), '\n')

with train test split_DecisionTreeRegression 0.9204473450232281
Mean Absolute Error: 0.25510455104551044
Mean Squared Error: 0.2044218942189422
Root Mean Squared Error: 0.45213039515049436 



In [14]:
###################################################################
### RandomForestRegressor

RF_Regression= RandomForestRegressor(random_state=20)
RF_Regression.fit(X_train,y_train)
y_RF_pred=RF_Regression.predict(X_test)
RF_Regression_score=RF_Regression.score(X_test,y_test)

print("with train test split_RandomForestRegression", RF_Regression_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_RF_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_RF_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_RF_pred)), '\n')


with train test split_RandomForestRegression 0.9300828559826425
Mean Absolute Error: 0.2539384993849937
Mean Squared Error: 0.17966207441574408
Root Mean Squared Error: 0.42386563250132003 



In [15]:
###################################################################
### XGBRegressor


XGB_Regression= XGBRegressor(random_state=20)
XGB_Regression.fit(X_train,y_train)
y_XBG_pred=XGB_Regression.predict(X_test)
RF_Regression_score=XGB_Regression.score(X_test,y_test)

print("with train test split_XGBRegression", RF_Regression_score)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_XBG_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_XBG_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_XBG_pred)), '\n')


with train test split_XGBRegression 0.9245566313261124
Mean Absolute Error: 0.26835015173107934
Mean Squared Error: 0.19386249692203422
Root Mean Squared Error: 0.4402981909138785 



In [16]:
###################################################################
### LightGBM


LGBM_Regression = LGBMRegressor(random_state=20)
LGBM_Regression.fit(X_train,y_train)
y_LGBM_pred=LGBM_Regression.predict(X_test)
LGBM_Regression_socre=LGBM_Regression.score(X_test,y_test)

print("with train test split_LGBM_Regression", LGBM_Regression_socre)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_LGBM_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_LGBM_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_LGBM_pred)), '\n')

with train test split_LGBM_Regression 0.9302825505053431
Mean Absolute Error: 0.25793118637779705
Mean Squared Error: 0.17914893085557576
Root Mean Squared Error: 0.4232598857151192 



The cross validation is performed in order to evaluate the model with multiple train-test splits. But it is more time consuming compared to train test split

In [17]:
####################################################
## Result with Cross Validation Score and Prediction

score_CV_XGB_Regression = cross_val_score(XGB_Regression,X,y,cv=5 ,scoring='r2')
print("with CV XGB_Regression", score_CV_XGB_Regression.mean())

with CV XGB_Regression 0.9255910217946182
