
## Overview

This notebook shows you how to create and query a table or DataFrame from a file that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is the Databricks File System, which lets you store data for querying inside of Databricks. This notebook assumes that you already have a file in DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# Storage format of the uploaded dataset and its path inside DBFS
file_type = "csv"
file_location = "/FileStore/tables/PySpark_tips.csv"

In [0]:
# Load 'PySpark_tips.csv' into the DataFrame df_org.
# The reader format is taken from file_type (declared next to file_location),
# so the configured file type is what actually drives the load.
#   header      -> use the first line of the file as column names
#   inferSchema -> let Spark derive column types from the data
df_org = (
    spark.read.format(file_type)
    .option("header", True)
    .option("inferSchema", True)
    .load(file_location)
)

In [0]:
# Preview df_org: show() prints only the first 20 rows, not the whole data frame
df_org.show(n=20)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [0]:
# List the column names of the df_org PySpark data frame
df_org.columns

Out[20]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [0]:
# Import StringIndexer to deal with categorical variables
from pyspark.ml.feature import StringIndexer


In [0]:
# Index-encode the categorical 'sex' column into a numeric 'sex_indexed' column
indexer1 = StringIndexer(inputCol='sex', outputCol='sex_indexed')
# Fit on df_org to learn the label -> index mapping, then apply it to the same frame
sex_indexer_model = indexer1.fit(df_org)
df = sex_indexer_model.transform(df_org)
df.show()

# Mapping seen in the output: 'Male' -> 0.0 and 'Female' -> 1.0

+----------+----+------+------+---+------+----+-----------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|
+----------+----+------+------+---+------+----+-----------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|        0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|        0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|        0.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|        0.0|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|        1.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|        0.0|
|     18.43| 3.0|  Male|    No|Sun|Dinne

In [0]:
# Index-encode the remaining categorical columns with a single StringIndexer.
# Each output column is the input name plus an '_indexed' suffix, i.e.
# 'smoker_indexed', 'day_indexed' and 'time_indexed'.
categorical_cols = ['smoker', 'day', 'time']
indexed_cols = [f'{name}_indexed' for name in categorical_cols]
indexer2 = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)
df = indexer2.fit(df).transform(df)
df.show()

# Resulting mapping:
# non-smoker -> 0, smoker -> 1
# Sunday -> 1
# dinner time -> 0

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [0]:
# Keep only numeric columns in df: remove the original string columns now that
# each one has an index-encoded counterpart produced by StringIndexer.
# Note: the '*_indexed' columns hold label indices, not one-hot vectors.
string_cols = ['sex', 'smoker', 'day', 'time']
df = df.drop(*string_cols)
df.show()

+----------+----+----+-----------+--------------+-----------+------------+
|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+----+-----------+--------------+-----------+------------+
|     16.99|1.01|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|   2|        0.0|           0.0|        1.0|         0.0|
|     26.88|3.12|   4|        0.0|           0.0|        1.0|         0.0|
|     15.04|1.96|   2|        0.0|           0.0|        1.0|         0.0|
|     14.78|3.23|   2|        0.0|           0.0|        1.0|         0.0|
|     10.27|1.71|   2|   

In [0]:
# List the column names left in df after the string columns were dropped
df.columns

Out[30]: ['total_bill',
 'tip',
 'size',
 'sex_indexed',
 'smoker_indexed',
 'day_indexed',
 'time_indexed']

In [0]:
# Import VectorAssembler to group the independent variables together for building ML
from pyspark.ml.feature import VectorAssembler

In [0]:
# Pack the predictor columns into one vector column named 'IndF'
feature_cols = ['tip', 'sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed']
# NOTE(review): 'size' is not among the inputs — confirm that is intentional
features = VectorAssembler(inputCols=feature_cols, outputCol='IndF')
# transform() returns df with the assembled 'IndF' column appended
output_df = features.transform(df)
output_df.show()

+----------+----+----+-----------+--------------+-----------+------------+--------------------+
|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|                IndF|
+----------+----+----+-----------+--------------+-----------+------------+--------------------+
|     16.99|1.01|   2|        1.0|           0.0|        1.0|         0.0|[1.01,1.0,0.0,1.0...|
|     10.34|1.66|   3|        0.0|           0.0|        1.0|         0.0|(5,[0,3],[1.66,1.0])|
|     21.01| 3.5|   3|        0.0|           0.0|        1.0|         0.0| (5,[0,3],[3.5,1.0])|
|     23.68|3.31|   2|        0.0|           0.0|        1.0|         0.0|(5,[0,3],[3.31,1.0])|
|     24.59|3.61|   4|        1.0|           0.0|        1.0|         0.0|[3.61,1.0,0.0,1.0...|
|     25.29|4.71|   4|        0.0|           0.0|        1.0|         0.0|(5,[0,3],[4.71,1.0])|
|      8.77| 2.0|   2|        0.0|           0.0|        1.0|         0.0| (5,[0,3],[2.0,1.0])|
|     26.88|3.12|   4|        0.0|      

In [0]:
# Reduce to the two columns the regression uses:
# the label 'total_bill' and the assembled feature vector 'IndF'
final_df = output_df.select('total_bill', 'IndF')
final_df.show()

+----------+--------------------+
|total_bill|                IndF|
+----------+--------------------+
|     16.99|[1.01,1.0,0.0,1.0...|
|     10.34|(5,[0,3],[1.66,1.0])|
|     21.01| (5,[0,3],[3.5,1.0])|
|     23.68|(5,[0,3],[3.31,1.0])|
|     24.59|[3.61,1.0,0.0,1.0...|
|     25.29|(5,[0,3],[4.71,1.0])|
|      8.77| (5,[0,3],[2.0,1.0])|
|     26.88|(5,[0,3],[3.12,1.0])|
|     15.04|(5,[0,3],[1.96,1.0])|
|     14.78|(5,[0,3],[3.23,1.0])|
|     10.27|(5,[0,3],[1.71,1.0])|
|     35.26|[5.0,1.0,0.0,1.0,...|
|     15.42|(5,[0,3],[1.57,1.0])|
|     18.43| (5,[0,3],[3.0,1.0])|
|     14.83|[3.02,1.0,0.0,1.0...|
|     21.58|(5,[0,3],[3.92,1.0])|
|     10.33|[1.67,1.0,0.0,1.0...|
|     16.29|(5,[0,3],[3.71,1.0])|
|     16.97|[3.5,1.0,0.0,1.0,...|
|     20.65|      (5,[0],[3.35])|
+----------+--------------------+
only showing top 20 rows



In [0]:
# Split final_df into a training part and a test part, stored in train and test.
# The weights [0.75, 0.25] request roughly a 75% / 25% split; randomSplit is
# probabilistic, so the exact row counts are not guaranteed.
# A fixed seed keeps the split repeatable across runs on the same input data,
# so the fitted coefficients and evaluation metrics do not change on every re-run.
train, test = final_df.randomSplit([0.75, 0.25], seed=42)

In [0]:
# Import the LinearRegression object from the ml.regression module of pyspark
from pyspark.ml.regression import LinearRegression

In [0]:
# Configure the estimator: features come from 'IndF', the label from 'total_bill'
lr_estimator = LinearRegression(featuresCol='IndF', labelCol='total_bill')

# Train on the training split; regressor is the fitted model returned by .fit()
regressor = lr_estimator.fit(train)

In [0]:
# Get the slopes (coefficients) of the fitted model, one per component of the 'IndF' feature vector
regressor.coefficients

Out[37]: DenseVector([4.4388, -1.1839, 1.6141, -0.5068, -0.6902])

In [0]:
# Get the intercept term of the fitted linear regression model
regressor.intercept

Out[39]: 7.014345028137183

In [0]:
# Evaluate the fitted model on the test data.
# Despite its name, `predictions` is the object returned by evaluate(); later cells
# read .predictions, .r2, .meanAbsoluteError and .meanSquaredError from it.
predictions = regressor.evaluate(test)

In [0]:
# Show the test rows together with the model's 'prediction' column
predictions.predictions.show()

+----------+--------------------+------------------+
|total_bill|                IndF|        prediction|
+----------+--------------------+------------------+
|      7.25|[5.15,0.0,1.0,1.0...| 30.98145146723623|
|      7.51|[2.0,0.0,0.0,2.0,...|14.188122838053774|
|      8.58|[1.92,0.0,1.0,3.0...|14.940289817819002|
|       9.6|[4.0,1.0,1.0,1.0,...|24.692953597036524|
|     10.27|(5,[0,3],[1.71,1.0])|14.097870778408504|
|     10.63|[2.0,1.0,1.0,0.0,...| 16.32217498955549|
|     12.74|[2.01,1.0,1.0,2.0...|  14.6627349172399|
|      13.0|[2.0,1.0,1.0,2.0,...|14.618346887680946|
|     13.37|       (5,[0],[2.0])|15.891950939928318|
|     13.51|[2.0,0.0,1.0,2.0,...|15.802221358600743|
|     13.81| (5,[0,3],[2.0,1.0])|15.385123635618218|
|     14.07| (5,[0,3],[2.5,1.0])|   17.604525113566|
|     14.15|[2.0,1.0,0.0,2.0,...|13.004248367133975|
|     14.31|[4.0,1.0,1.0,0.0,...| 25.19978090134662|
|     14.52|[2.0,1.0,0.0,2.0,...|13.004248367133975|
|     15.42|(5,[0,3],[1.57,1.0])|13.4764383645

In [0]:
# Report the test-set metrics (R2, MAE and MSE) used to judge the model
metric_values = (
    ('R2', predictions.r2),
    ('MAE', predictions.meanAbsoluteError),
    ('MSE', predictions.meanSquaredError),
)
for metric_name, metric_value in metric_values:
    print(f'The {metric_name} value for the model is:', metric_value)

The R2 value for the model is: 0.42476916588121993
The MAE value for the model is: 4.810615026055834
The MSE value for the model is: 47.144383159992714
