In [1]:
import sys; sys.path.append("../../")

from pyspark import SparkContext
from pyspark.sql import SparkSession
import pyspark.ml
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.sql.functions import col

from DataPreprocessing.DataPreprocessing import read_data,encode_categ_features,remove_useless_col


# Spark session

In [2]:
# Build (or reuse) a local Spark session in one step. getOrCreate() keeps this
# cell safe to re-run in a live kernel, whereas constructing SparkContext(...)
# directly fails when a context is already running.
spark = (
    SparkSession.builder
    .master("local")
    .appName("InstallationsPrediction")
    .getOrCreate()
)

# Keep `sc` defined for any code that needs the underlying SparkContext.
sc = spark.sparkContext

# Read the data

In [3]:
# Load the dataset through the project helper, requesting every feature.
df = read_data(spark, features="all")

# Show the (column name, Spark type) pairs of the loaded frame.
df.dtypes

[('App Name', 'string'),
 ('App Id', 'string'),
 ('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'string'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

In [4]:
# Only 'App Name' and 'App Id' are dropped in this cell.
# All remaining columns (including Rating, Rating Count, Released,
# Last Updated and Content Rating) are kept and are used as features later on.
useless_cols = ['App Name', 'App Id']
df = remove_useless_col(df, useless_cols)

# Display the schema after the removal.
df.dtypes

[('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'string'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'string'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

# Features


In [5]:
# Columns handed to the project's categorical encoder.
# NOTE(review): presumably these are turned into numeric codes so that they can
# be fed to VectorAssembler further down; confirm in encode_categ_features.
interesting_categ_cols = [
    'Category', 'Ad Supported', 'In App Purchases', 'Editors Choice',
    'Free', 'Size', "Minimum Android", "Developer Id", "Developer Email",
    'Released', 'Last Updated', 'Content Rating',
]

df = encode_categ_features(df, interesting_categ_cols)

In [11]:
label = "Maximum Installs"

# Every column except the target, in their current order.
column_names = [name for name in df.columns if name != label]

# Target goes last so that the next cell can take "all columns but the last"
# as the feature list.
new_column_order = column_names + [label]

# A single select() is enough to reorder the columns; re-adding the label with
# withColumn(label, col(label)) would only replace the column with itself.
df = df.select(*new_column_order)

In [12]:
# Feature columns = every column except the last one, which holds the target
# ("Maximum Installs" was moved to the final position by the reordering cell).
# NOTE(review): the printed list contains 'Installs' and 'Minimum Installs',
# which look closely tied to the target; confirm this is not target leakage.
required_features = df.columns[:-1]
print(required_features)

['Price', 'Category', 'Ad Supported', 'In App Purchases', 'Editors Choice', 'Free', 'Size', 'Minimum Android', 'Developer Id', 'Developer Email', 'Rating', 'Rating Count', 'Released', 'Last Updated', 'Content Rating', 'Installs', 'Minimum Installs']


In [13]:
# Pack all feature columns into a single vector column named 'Features'.
# handleInvalid="skip" makes the assembler filter out rows containing invalid
# values instead of raising an error.
vec_assembler = VectorAssembler(
    inputCols=required_features,
    outputCol='Features',
    handleInvalid="skip",
)
vec_df = vec_assembler.transform(df)

# Splitting the data 

In [14]:
# 80/20 train/test split on the assembled features and the target.
# A fixed seed keeps the split (and every metric computed from it) reproducible
# across kernel restarts.
train_df, test_df = (
    vec_df
    .select(['Features', 'Maximum Installs'])
    .randomSplit([0.8, 0.2], seed=42)
)

# Model Building
## Linear Regression

In [22]:
# Linear regression estimator with default hyper-parameters, reading the
# assembled vector column and predicting the "Maximum Installs" target.
lr = LinearRegression(
    featuresCol='Features',
    labelCol='Maximum Installs',
)

In [23]:
# Fit the estimator on the training split.
lr_model = lr.fit(train_df)

# Metrics below are taken from the model's training summary, i.e. they describe
# the fit on the training data, not on held-out data.
trainingSummary = lr_model.summary
train_rmse = trainingSummary.rootMeanSquaredError
train_r2 = trainingSummary.r2
print("RMSE: %f" % train_rmse)
print("r2: %f" % train_r2)

RMSE: 4747.646684
r2: 0.349121


In [17]:
# Score the held-out split; the result carries an added `prediction` column.
# NOTE(review): the sample output shows negative predictions for an install
# count; an unconstrained linear model can produce those.
y_pred = lr_model.transform(test_df)
y_pred.show(n=5)

+--------------------+----------------+-------------------+
|            Features|Maximum Installs|         prediction|
+--------------------+----------------+-------------------+
|(17,[0,1,2,5,6,7,...|             238| -842.1016698885486|
|(17,[0,1,2,5,6,7,...|             218| -573.1449446455225|
|(17,[0,1,2,5,6,7,...|             130|-2381.4550009059467|
|(17,[0,1,2,5,6,7,...|             314| -2159.499181247639|
|(17,[0,1,2,5,6,7,...|             214| -943.7404244715199|
+--------------------+----------------+-------------------+
only showing top 5 rows



## Model Evaluation

In [18]:
# R-squared of the predictions on the test split.
# RegressionEvaluator is imported with the other pyspark imports in the first
# cell, so all dependencies are declared in one place.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Maximum Installs",
    metricName="r2",
)
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(y_pred))

R Squared (R2) on test data = 0.349139


In [19]:
# Evaluate the fitted model on the test split and report its RMSE.
test_result = lr_model.evaluate(test_df)
test_rmse = test_result.rootMeanSquaredError
print("Root Mean Squared Error (RMSE) on test data = %g" % test_rmse)

Root Mean Squared Error (RMSE) on test data = 4772.44
