In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.ml
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.sql.functions import col

import sys; sys.path.append("../../")
from DataPreprocessing.DataPreprocessing import read_data,encode_categ_features,remove_useless_col


# Spark session

In [2]:
# Build (or reuse) the local Spark session. Going through the builder's
# getOrCreate() keeps this cell safe to re-run: constructing SparkContext
# directly a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
spark = (
    SparkSession.builder
    .master("local")
    .appName("InstallationsPrediction")
    .getOrCreate()
)

# Keep `sc` available for any code that expects the raw SparkContext.
sc = spark.sparkContext

# Read the data

In [3]:
# Load the dataset through the project helper, requesting all features.
df = read_data(spark, features="all")

# Cell output: the (column name, dtype) pairs of the loaded DataFrame.
df.dtypes

[('App Name', 'string'),
 ('App Id', 'string'),
 ('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'string'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

In [4]:
# Only the identifier columns 'App Name' and 'App Id' are dropped here; they
# are treated as not interesting for max-installations prediction.
# The remaining columns (Rating, Rating Count, Released, Last Updated,
# Content Rating, ...) are kept and are used as features further down.
useless_cols = ['App Name', 'App Id']
df = remove_useless_col(df, useless_cols)

# Cell output: the schema after the removal.
df.dtypes

[('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'string'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

# Features


In [5]:
# Columns handed to the project's categorical encoder.
# NOTE(review): 'Rating', 'Rating Count' and 'Minimum Installs' are listed as
# numeric in the dtypes output above yet are encoded as categorical here —
# confirm this is intended.
# NOTE(review): 'Installs' and 'Minimum Installs' look closely tied to the
# 'Maximum Installs' target — confirm they should be used as features.
interesting_categ_cols = [
    'Category',
    'Ad Supported',
    'In App Purchases',
    'Editors Choice',
    'Free',
    'Size',
    "Minimum Android",
    "Developer Id",
    "Developer Email",
    'Rating',
    'Rating Count',
    'Released',
    'Last Updated',
    'Content Rating',
    'Installs',
    'Minimum Installs',
]

df = encode_categ_features(df, interesting_categ_cols)

In [11]:
# Name of the target column.
label = "Maximum Installs"

# Every column except the target, in their current order.
column_names = [name for name in df.columns if name != label]

# Target goes last, after all feature columns.
new_column_order = column_names + [label]

# A plain select() is enough to reorder the columns; the former
# withColumn(label, col(label)) replaced the column with itself and did nothing.
df = df.select(*new_column_order)

In [12]:
# Feature columns = every column except the target. Excluding the target by
# name (instead of slicing off the last position) keeps the label out of the
# feature vector even if the columns were not reordered beforehand.
required_features = [name for name in df.columns if name != label]
print(required_features)

['Price', 'Category', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Free', 'Size', 'Minimum Android', 'Developer Id', 'Developer Email', 'Rating', 'Rating Count', 'Released', 'Last Updated', 'Content Rating', 'Installs', 'Minimum Installs']


In [13]:
# Pack the feature columns into a single vector column called 'Features'.
# handleInvalid="skip" is configured on the assembler itself, so rows with
# invalid values are filtered out during transform().
vec_assembler = VectorAssembler(
    inputCols=required_features,
    outputCol='Features',
    handleInvalid="skip",
)
vec_df = vec_assembler.transform(df)

# Splitting the data 

In [14]:
# 70/30 train/test split over the assembled features and the target.
# A fixed seed is passed so the split is repeatable across runs on the same
# input data instead of changing on every execution.
train_df, test_df = (
    vec_df
    .select(['Features', 'Maximum Installs'])
    .randomSplit([0.7, 0.3], seed=42)
)

# Model Building
## Linear Regression

In [15]:
# Linear regression estimator reading the assembled 'Features' vector and
# predicting 'Maximum Installs', with a regularisation parameter of 0.3.
lr = LinearRegression(
    featuresCol='Features',
    labelCol='Maximum Installs',
    regParam=0.3,
)

In [16]:
# Fit the estimator on the training split.
lr_model = lr.fit(train_df)

# Metrics computed on the training data (not a measure of generalisation).
trainingSummary = lr_model.summary
train_rmse = trainingSummary.rootMeanSquaredError
train_r2 = trainingSummary.r2
print("RMSE: %f" % train_rmse)
print("r2: %f" % train_r2)

RMSE: 4747.646864
r2: 0.349121


In [17]:
# Score the held-out split; the result carries a 'prediction' column next to
# the features and the true 'Maximum Installs' value.
y_pred = lr_model.transform(test_df)

# Peek at the first five scored rows.
y_pred.show(n=5)

+--------------------+----------------+-------------------+
|            Features|Maximum Installs|         prediction|
+--------------------+----------------+-------------------+
|(17,[0,1,2,5,6,7,...|             238| -842.1016698885486|
|(17,[0,1,2,5,6,7,...|             218| -573.1449446455225|
|(17,[0,1,2,5,6,7,...|             130|-2381.4550009059467|
|(17,[0,1,2,5,6,7,...|             314| -2159.499181247639|
|(17,[0,1,2,5,6,7,...|             214| -943.7404244715199|
+--------------------+----------------+-------------------+
only showing top 5 rows



## Model Evaluation

In [18]:
# R^2 of the linear model on the held-out predictions.
# RegressionEvaluator is imported in the notebook's top import cell.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Maximum Installs",
    metricName="r2",
)
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(y_pred))

R Squared (R2) on test data = 0.349139


In [19]:
# Evaluate the fitted linear model on the held-out split and report its RMSE.
test_result = lr_model.evaluate(test_df)
test_rmse = test_result.rootMeanSquaredError
print("Root Mean Squared Error (RMSE) on test data = %g" % test_rmse)

Root Mean Squared Error (RMSE) on test data = 4772.44


## Tree-Based Regression (Decision Tree, Random Forest, Gradient-Boosted Trees)

In [20]:
# NOTE: this whole cell is disabled (commented out).
# When it was run, all three tree-based models failed with
#   "DecisionTree requires maxBins (= 32) to be at least as large as the
#    number of values in each categorical feature, but categorical feature 9
#    has 809942 values"
# (see the recorded output below this cell).
# NOTE(review): index 9 in the printed feature list is 'Developer Email' —
# confirm, and consider dropping such very high-cardinality columns before
# re-enabling this cell.
# NOTE(review): `<model>.evaluate(test_df)` is the call used for the linear
# regression model; verify that the tree models expose it as well, otherwise
# compute RMSE with RegressionEvaluator(metricName="rmse").
# from pyspark.ml.regression import DecisionTreeRegressor, RandomForestRegressor, GBTRegressor
# from pyspark.ml.evaluation import RegressionEvaluator

# # Decision Tree Regressor
# try:
#     dt= DecisionTreeRegressor(featuresCol='Features',labelCol='Maximum Installs', maxDepth=3)
#     dt_model = dt.fit(train_df)
#     y_pred = dt_model.transform(test_df)
#     y_pred.show(5)
#     dt_evaluator = RegressionEvaluator(predictionCol="prediction", \
#                     labelCol="Maximum Installs",metricName="r2")
#     print("R Squared (R2) on test data = %g" % dt_evaluator.evaluate(y_pred))
#     test_result = dt_model.evaluate(test_df)
#     print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)
# except Exception as e:
#     print("Error occurred while training or evaluating Decision Tree Regressor:", e)

# # Random Forest Regressor
# try:
#     rf = RandomForestRegressor(featuresCol='Features',labelCol='Maximum Installs', numTrees=100)
#     rf_model = rf.fit(train_df)
#     y_pred = rf_model.transform(test_df)
#     y_pred.show(5)
#     rf_evaluator = RegressionEvaluator(predictionCol="prediction", \
#                     labelCol="Maximum Installs",metricName="r2")
#     print("R Squared (R2) on test data = %g" % rf_evaluator.evaluate(y_pred))
#     test_result = rf_model.evaluate(test_df)
#     print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)
# except Exception as e:
#     print("Error occurred while training or evaluating Random Forest Regressor:", e)

# # Gradient Boosted Tree Regressor
# try:
#     gbt = GBTRegressor(featuresCol='Features',labelCol='Maximum Installs', maxIter=10)
#     gbt_model = gbt.fit(train_df)
#     y_pred = gbt_model.transform(test_df)
#     y_pred.show(5)
#     gbt_evaluator = RegressionEvaluator(predictionCol="prediction", \
#                     labelCol="Maximum Installs",metricName="r2")
#     print("R Squared (R2) on test data = %g" % gbt_evaluator.evaluate(y_pred))
#     test_result = gbt_model.evaluate(test_df)
#     print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)
# except Exception as e:
#     print("Error occurred while training or evaluating Gradient Boosted Tree Regressor:", e)
# # from pyspark.ml.regression import AFTSurvivalRegression

Error occurred while training or evaluating Decision Tree Regressor: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 9 has 809942 values. Consider removing this and other categorical features with a large number of values, or add more training examples.
Error occurred while training or evaluating Random Forest Regressor: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 9 has 809942 values. Consider removing this and other categorical features with a large number of values, or add more training examples.
Error occurred while training or evaluating Gradient Boosted Tree Regressor: requirement failed: DecisionTree requires maxBins (= 32) to be at least as large as the number of values in each categorical feature, but categorical feature 9 has 809942 values. Consider removin

In [21]:
# Disabled: `dt_model` is only defined inside the commented-out tree-model
# cell, so this cannot run until that cell is re-enabled.
# #Feature Importance
# dt_model.featureImportances

# NOTE(review): the remark below looks stale — in the feature list printed
# earlier, index 0 is 'Price', not 'Rating Count'. Confirm against the actual
# featureImportances output before relying on it.
# #notice that Feature at index 0 has higher importance, this feature is the 'Rating Count'

## Gradient-Boosted Decision Tree Regressor

In [None]:
# Disabled gradient-boosted tree experiment.
# NOTE(review): the error that `maxBins` is meant to avoid is presumably the
# "DecisionTree requires maxBins ... to be at least as large as the number of
# values in each categorical feature" requirement recorded in the output of
# the earlier tree-model cell. maxBins=48 would still be far below the 809942
# values reported there for categorical feature 9 — confirm before re-enabling.
# from pyspark.ml.regression import GBTRegressor
# gbt = GBTRegressor(featuresCol = 'Features', labelCol = 'Maximum Installs', maxIter=10, maxBins=48) #same as above, maxbins is set just to avoid an error I dont understand
# # maxIter is just a hyperparameter we put by hand
# gbt_model = gbt.fit(train_df)
# gbt_predictions = gbt_model.transform(test_df)
# gbt_predictions.select('prediction', 'Maximum Installs', 'Features').show(5)

In [None]:
# Disabled: RMSE of the gradient-boosted tree predictions on the test split.
# Depends on `gbt_predictions`, which is only produced by the (also disabled)
# GBT training cell, and on RegressionEvaluator being imported.
# gbt_evaluator = RegressionEvaluator(
#     labelCol="Maximum Installs", predictionCol="prediction", metricName="rmse")
# rmse = gbt_evaluator.evaluate(gbt_predictions)
# print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)