In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.ml
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler,StringIndexer

import sys; sys.path.append("../")
from DataPreprocessing.DataPreprocessing import read_data,encode_categ_features,remove_useless_col


# Spark session

In [2]:
sc = SparkContext(master='local')
spark = SparkSession.builder.appName("InstallationsPrediction").getOrCreate()

# Read the data

In [3]:
df= read_data(spark, features='all')
df.dtypes

Detecting outliers...
Number of outliers in Rating: 0 (0.00%)
Number of outliers in Rating Count: 388994 (16.82%)
Number of outliers in Minimum Installs: 260554 (11.27%)
Number of outliers in Maximum Installs: 407678 (17.63%)
Number of outliers in Price: 44943 (1.94%)
Number of rows before removing outliers: 2312944
Number of rows after removing those having more than 1 outlier in its columns: 1961130
Removing useless columns...
Handling missing values...
Total Number of rows : 1961130
Number of rows after dropping nulls: 1954764
Total Number of rows : 1954764
Total Number of rows : 1954764
Converting size to bytes...
Converted all sizes to Bytes.


[('App Name', 'string'),
 ('App Id', 'string'),
 ('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'string'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

In [4]:
'''
App Name, App Id, Rating, Rating Count, Released, Last Updated, Content Rating are not interesting here for max installations prediction,
so we can remove them safely
'''

useless_cols= ['App Name','App Id','Rating','Rating Count',\
               'Released','Last Updated','Content Rating','Installs','Minimum Installs']
df= remove_useless_col(df,useless_cols)
df.dtypes

[('Category', 'string'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

# Features


In [5]:
interesting_categ_cols= ['Category', 'Ad Supported', 'In App Purchases', 'Editors Choice',\
                  'Free', 'Size', "Minimum Android","Developer Id","Developer Email"]

df= encode_categ_features(df,interesting_categ_cols)

In [6]:
# Rearrange to make label the last column
df = df.select('Price','Category', 'Ad Supported', 'In App Purchases', 'Editors Choice',\
            'Free', 'Size', "Minimum Android","Developer Id","Developer Email",\
            'Maximum Installs')

In [7]:
required_features = df.columns[:-1]      #all except the target variable which is at the last index
print(required_features)

['Price', 'Category', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Free', 'Size', 'Minimum Android', 'Developer Id', 'Developer Email']


In [8]:
vec_assembler = VectorAssembler(inputCols=required_features,outputCol='Features')
vec_df = vec_assembler.setHandleInvalid("skip").transform(df)

# Splitting the data 

In [9]:
train_df,test_df = vec_df.select(['Features', 'Maximum Installs']).randomSplit([0.7,0.3])

# Model Building
## Linear Regression

In [10]:
lr = LinearRegression(featuresCol='Features',labelCol='Maximum Installs')

In [11]:
lr_model = lr.fit(train_df)

#just some training metrics
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 5789.517479
r2: 0.033304


In [12]:
y_pred = lr_model.transform(test_df)
y_pred.show(5)

+--------------------+----------------+------------------+
|            Features|Maximum Installs|        prediction|
+--------------------+----------------+------------------+
|(10,[0,1,5,8,9],[...|               8|   68.136875719925|
|(10,[0,1,5,8,9],[...|               2|103.00643060375387|
|(10,[0,1,5,8,9],[...|             475| 85.52440582609961|
|(10,[0,1,5,8,9],[...|             422| 85.47926251734839|
|(10,[0,5,6,8,9],[...|              14| 37.53743873067924|
+--------------------+----------------+------------------+
only showing top 5 rows



## Model Evaluation

In [13]:
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="Maximum Installs",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(y_pred))

R Squared (R2) on test data = 0.0319308


In [14]:
test_result = lr_model.evaluate(test_df)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 5812


## Decision Tree Regression

In [17]:
from pyspark.ml.regression import DecisionTreeRegressor
dt = DecisionTreeRegressor(featuresCol ='Features', labelCol = 'Maximum Installs')   #setting a larger maxbins(default=32) is an ad-hoc step just to solve an error I dont understand :)
dt_model = dt.fit(train_df)
dt_predictions = dt_model.transform(test_df)
dt_evaluator = RegressionEvaluator(
    labelCol="Maximum Installs", predictionCol="prediction", metricName="rmse")
rmse = dt_evaluator.evaluate(dt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 9 has 810026 values. Consider removing this and other categorical features with a large number of values, or add more training examples.

In [None]:
#Feature Importance
dt_model.featureImportances

#notice that Feature at index 0 has higher importance, this feature is the 'Rating Count'

## Gradient-Boosted Decision Tree Regressor

In [18]:
from pyspark.ml.regression import GBTRegressor
gbt = GBTRegressor(featuresCol = 'Features', labelCol = 'Maximum Installs', maxIter=10, maxBins=48) #same as above, maxbins is set just to avoid an error I dont understand
# maxIter is just a hyperparameter we put by hand
gbt_model = gbt.fit(train_df)
gbt_predictions = gbt_model.transform(test_df)
gbt_predictions.select('prediction', 'Maximum Installs', 'Features').show(5)

IllegalArgumentException: requirement failed: DecisionTree requires maxBins (= 48) to be at least as large as the number of values in each categorical feature, but categorical feature 9 has 810026 values. Consider removing this and other categorical features with a large number of values, or add more training examples.

In [None]:
gbt_evaluator = RegressionEvaluator(
    labelCol="Maximum Installs", predictionCol="prediction", metricName="rmse")
rmse = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

### Saving Model


In [None]:
# #this is giving me errors I still cannot resolve

# from pyspark.ml.pipeline import PipelineModel
# gbt_model.save("D:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model")
# #to overwrite an existing model
# gbt_model.write().overwrite().save("D:/CLASSSWORKKK/BD/BD_PROJECT/GBT_Model")

### Loading Model

In [None]:
# from pyspark.ml.pipeline import PipelineModel
# LoadedModel = PipelineModel.load("./GBT_Model")