In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import svm
from scipy.stats import loguniform
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import sys
sys.path.append('../../DataPreprocessing')
sys.path.append('../Classification')
import DataPreprocessing
from utils import read_data, hyperparameter_search, encode, decode, floor_col

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[1]") \
    .appName("aymon") \
    .getOrCreate()


In [3]:
df = DataPreprocessing.read_data(spark,file_name='../../Google-Playstore.csv', absolute_csv_path=True)
df= DataPreprocessing.convert_size_to_bytes(df)
df = DataPreprocessing.remove_useless_col(df, ['Scraped time', 'App Name', 'App Id', 'Minimum Installs', 'Maximum Installs', 'Currency', 'Developer Email'\
                                               , 'Developer Id'])
#developer Id might be beneficial, but it is a nominal categorized feature, we cannot grasp its potential, rather we can make another feature
#as number of apps developed for each developer


In [4]:
#DataPreprocessing.show_nulls(df)

In [5]:

uninteresting_cols= ['Minimum Android','Size','Installs','Price','Ad Supported','In App Purchases', 'Released', 'Last Updated']
df=DataPreprocessing.handle_missing_values(df,cols=uninteresting_cols)
interesting_num_cols=['Rating','Rating Count', 'Installs']
df= DataPreprocessing.handle_missing_values(df, handling_method='mean', cols=interesting_num_cols)
df = DataPreprocessing.binarize_col(df, cols=['Privacy Policy', 'Developer Website'])
df = DataPreprocessing.convert_binary_pyspark(df, cols=['Ad Supported', 'In App Purchases', 'Editors Choice', 'Free'])
df = df.filter("Rating>=1")
df = df.filter(col('Rating Count') >=200)
df = df.filter(col('Price') >= 0)
df = df.drop('Free')
df.show(10)

Total Number of rows : 2237972
Number of rows after dropping nulls: 2183417
Total Number of rows : 2183417
+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+------------+----------------+--------------+
|         Category|Rating|Rating Count|Installs|Price| Size|Minimum Android|    Released|Last Updated|Content Rating|Privacy Policy|Developer Website|Ad Supported|In App Purchases|Editors Choice|
+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+------------+----------------+--------------+
|  Personalization|   4.7|         820|   50000|  0.0|  3.5|     4.1 and up|Sep 22, 2019|Oct 07, 2020|      Everyone|          true|             true|        true|           false|         false|
|   Travel & Local|   3.7|        1572|   10000|  0.0|2.9E7|     4.2 and up| Sep 5, 2018|May 30, 2020|      E

In [6]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Rating Count: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Size: float (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: boolean (nullable = false)
 |-- Developer Website: boolean (nullable = false)
 |-- Ad Supported: boolean (nullable = false)
 |-- In App Purchases: boolean (nullable = false)
 |-- Editors Choice: boolean (nullable = false)



In [7]:
df = DataPreprocessing.convert_Last_Updated_to_Year(df)
df.show(10)

Total Number of rows : 267147
Number of rows after dropping nulls: 267141
+------------+------+
|Last Updated| count|
+------------+------+
|        2021|107641|
|        2020| 70608|
|        2019| 32809|
|        2018| 20270|
|        2017| 13441|
|        2016|  8978|
|        2015|  6490|
|        2014|  4109|
|        2013|  1913|
|        2012|   586|
|        2011|   221|
|        2010|    73|
|        2009|     2|
+------------+------+

+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+------------+----------------+--------------+
|         Category|Rating|Rating Count|Installs|Price| Size|Minimum Android|    Released|Last Updated|Content Rating|Privacy Policy|Developer Website|Ad Supported|In App Purchases|Editors Choice|
+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+---------

In [8]:
#DataPreprocessing.get_info(df)

In [9]:
df = encode(df, columns=['Category', 'Minimum Android', 'Content Rating'])
#df = floor_col(df, 'Rating')
df.show(10)

+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+------------+----------------+--------------+------------+-------------------+------------------+
|         Category|Rating|Rating Count|Installs|Price| Size|Minimum Android|    Released|Last Updated|Content Rating|Privacy Policy|Developer Website|Ad Supported|In App Purchases|Editors Choice|Category_enc|Minimum Android_enc|Content Rating_enc|
+-----------------+------+------------+--------+-----+-----+---------------+------------+------------+--------------+--------------+-----------------+------------+----------------+--------------+------------+-------------------+------------------+
|  Personalization|   4.7|         820|   50000|  0.0|  3.5|     4.1 and up|Sep 22, 2019|        2020|      Everyone|          true|             true|        true|           false|         false|           3|                  0|                 0|
|   Trav

In [10]:
df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Rating Count: integer (nullable = true)
 |-- Installs: integer (nullable = true)
 |-- Price: float (nullable = true)
 |-- Size: float (nullable = true)
 |-- Minimum Android: string (nullable = true)
 |-- Released: string (nullable = true)
 |-- Last Updated: integer (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Privacy Policy: boolean (nullable = false)
 |-- Developer Website: boolean (nullable = false)
 |-- Ad Supported: boolean (nullable = false)
 |-- In App Purchases: boolean (nullable = false)
 |-- Editors Choice: boolean (nullable = false)
 |-- Category_enc: integer (nullable = true)
 |-- Minimum Android_enc: integer (nullable = true)
 |-- Content Rating_enc: integer (nullable = true)



In [11]:
inputColums=['Rating','Rating Count','Installs',\
             'Size','Last Updated','Privacy Policy','Developer Website',\
                'Ad Supported','In App Purchases','Editors Choice','Category_enc','Minimum Android_enc','Content Rating_enc']
assembler = VectorAssembler(inputCols=inputColums, outputCol='features')

traindf = assembler.transform(df).select(['features', 'Price'])
traindf.show(10)

+--------------------+-----+
|            features|Price|
+--------------------+-----+
|[4.69999980926513...|  0.0|
|[3.70000004768371...|  0.0|
|(13,[0,1,2,3,4,5,...|  0.0|
|[4.40000009536743...|  0.0|
|[3.79999995231628...|  0.0|
|[2.29999995231628...|  0.0|
|[2.70000004768371...|  0.0|
|[4.40000009536743...|  0.0|
|[4.30000019073486...|  0.0|
|[4.09999990463256...|  0.0|
+--------------------+-----+
only showing top 10 rows



In [14]:
# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = traindf.randomSplit([0.7, 0.3])
# Train a GBT model.
gbt = GBTRegressor(featuresCol="features", labelCol='Price',maxIter=160)

# Train model.  This also runs the indexer.
model = gbt.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "Price", "features").show(5)
predictionAndTarget = predictions.select(['Price', 'prediction'])
# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="Price", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictionAndTarget)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)


evaluator = RegressionEvaluator(
    labelCol="Price", predictionCol="prediction", metricName="r2")
rmse = evaluator.evaluate(predictionAndTarget)
print("R2 Error (r2) on test data = %g" % rmse)


+--------------------+-----+--------------------+
|          prediction|Price|            features|
+--------------------+-----+--------------------+
| 0.11020852400554654|  0.0|(13,[0,1,2,3,4],[...|
| 0.07866405698763794|  0.0|(13,[0,1,2,3,4],[...|
|0.010835049221353879|  0.0|(13,[0,1,2,3,4],[...|
|0.037273392174358884|  0.0|(13,[0,1,2,3,4],[...|
| 0.09633439803424768|  0.0|(13,[0,1,2,3,4],[...|
+--------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 1.7384
R2 Error (r2) on test data = -0.158066
