In [1]:
import pyspark.ml
from pyspark import SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

import sys; sys.path.append("../../")
from DataPreprocessing.DataPreprocessing import read_data,encode_categ_features,remove_useless_col


# Spark session

In [2]:
# Build (or reuse) one local SparkSession. Setting master and app name on the
# builder keeps this cell safe to re-run: getOrCreate() returns the active
# session instead of raising, whereas constructing SparkContext(...) directly
# fails when a context is already running. It also means the app name is part
# of the configuration the context is created with.
spark = (
    SparkSession.builder
    .master("local")
    .appName("InstallationsPrediction")
    .getOrCreate()
)

# Low-level context backing the session, kept under the name `sc`.
sc = spark.sparkContext

# Read the data

In [3]:
# Load the dataset through the project's reader, requesting every feature,
# then display the resulting schema as (column, type) pairs.
df = read_data(spark, features="all")
df.dtypes

[('App Name', 'string'),
 ('App Id', 'string'),
 ('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'int'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Price', 'float'),
 ('Size', 'double'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

In [4]:
# 'App Name' and 'App Id' are treated as uninformative for this task and dropped.
# 'Price' is dropped as well: its correlation with the max installations is
# about -0.004.
useless_cols = ['App Name', 'App Id', 'Price']

df = remove_useless_col(df, useless_cols)
df.dtypes

[('Category', 'string'),
 ('Rating', 'float'),
 ('Rating Count', 'int'),
 ('Installs', 'int'),
 ('Minimum Installs', 'int'),
 ('Maximum Installs', 'int'),
 ('Free', 'string'),
 ('Size', 'double'),
 ('Minimum Android', 'string'),
 ('Developer Id', 'string'),
 ('Developer Email', 'string'),
 ('Released', 'string'),
 ('Last Updated', 'string'),
 ('Content Rating', 'string'),
 ('Ad Supported', 'string'),
 ('In App Purchases', 'string'),
 ('Editors Choice', 'string')]

# Features


In [5]:
# String-typed columns handed to the project's categorical encoder.
# The order is kept as-is, since it may determine the output column order.
interesting_categ_cols = [
    'Category',
    'Ad Supported',
    'In App Purchases',
    'Editors Choice',
    'Free',
    "Minimum Android",
    "Developer Id",
    "Developer Email",
    'Released',
    'Last Updated',
    'Content Rating',
]

# NOTE(review): encode_categ_features is defined outside this notebook;
# presumably it replaces each listed column with a numeric encoding - confirm.
df = encode_categ_features(df, interesting_categ_cols)

In [6]:
# Target column of the regression.
label = "Maximum Installs"

# Every column except the target, in the current order. Built as a new list
# rather than by mutating one in place.
column_names = [name for name in df.columns if name != label]

# Target goes last, so the feature columns are simply "all but the last one".
new_column_order = column_names + [label]

# A plain select reorders the columns; no withColumn call is needed for that.
df = df.select(*new_column_order)

In [7]:
# Feature columns are everything except the final column, which holds the target.
# NOTE(review): 'Installs' and 'Minimum Installs' are still among these columns;
# they look closely tied to 'Maximum Installs' - confirm this is not target leakage.
required_features = df.columns[:-1]

# Pack the feature columns into one vector column named 'Features'.
# handleInvalid="skip" filters out rows with invalid (null/NaN) feature values
# instead of raising.
vec_assembler = VectorAssembler(
    inputCols=required_features,
    outputCol='Features',
    handleInvalid="skip",
)
vec_df = vec_assembler.transform(df)

# Splitting the data 

In [8]:
# Roughly 80% of the rows go to training and 20% to testing (randomSplit
# weights are approximate). The fixed seed makes re-running the notebook on
# the same data produce the same split, so the reported metric is comparable
# between runs.
SPLIT_SEED = 42

train_df, test_df = (
    vec_df
    .select(['Features', 'Maximum Installs'])
    .randomSplit([0.8, 0.2], seed=SPLIT_SEED)
)

# Linear Regression

In [9]:
# Fit a linear regression (default hyperparameters) on the training split.
lr = LinearRegression(featuresCol='Features', labelCol='Maximum Installs')
lr_model = lr.fit(train_df)

# Score the held-out rows; the evaluator below reads the "prediction" column.
lr_predictions = lr_model.transform(test_df)

# Coefficient of determination (r2) of the predictions on the test split.
lr_evaluator = RegressionEvaluator(
    labelCol="Maximum Installs",
    predictionCol="prediction",
    metricName="r2",
)
r2 = lr_evaluator.evaluate(lr_predictions)
print("r2 on test data = %g" % r2)

r2 on test data = 0.791566
