In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the SparkSession for this notebook (application name 'abc');
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)

In [4]:
# Evaluate the session as the cell's final expression so the notebook displays it
spark

In [5]:
# One-column DataFrame named "number" holding the integers 0 through 999
myRange = spark.range(0, 1000).toDF("number")

In [6]:
# A bare DataFrame expression displays only its schema, not its rows
myRange

DataFrame[number: bigint]

In [7]:
# show() prints the rows as a text table; here the output is capped at the first 20 rows
myRange.show()

+------+
|number|
+------+
|     0|
|     1|
|     2|
|     3|
|     4|
|     5|
|     6|
|     7|
|     8|
|     9|
|    10|
|    11|
|    12|
|    13|
|    14|
|    15|
|    16|
|    17|
|    18|
|    19|
+------+
only showing top 20 rows



### Reading CSV

In [8]:
# Load tips.csv, using its first line as the column names.
# No type inference is requested here, so every column is read as a string.
f_path = 'tips.csv'
tips = spark.read.option("header", True).csv(f_path)
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

### Check schema

In [9]:
# Print each column's name, type and nullability; at this point every column is a string
tips.printSchema()

root
 |-- total_bill: string (nullable = true)
 |-- tip: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: string (nullable = true)



In [10]:
# Re-read the same file, this time asking Spark to infer column types from the data
tips = spark.read.options(header=True, inferSchema=True).csv(f_path)

In [11]:
# With inferSchema=True the numeric columns are typed (double / integer) instead of string
tips.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [12]:
# Python type of the `tips` object
type(tips)

pyspark.sql.dataframe.DataFrame

In [13]:
# Listing columns: a plain Python list of the column names, in schema order
tips.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

In [14]:
# Selecting a single column by name (a projection; no rows are filtered out)
tips.select('sex').show()

+------+
|   sex|
+------+
|Female|
|  Male|
|  Male|
|  Male|
|Female|
|  Male|
|  Male|
|  Male|
|  Male|
|  Male|
|  Male|
|Female|
|  Male|
|  Male|
|Female|
|  Male|
|Female|
|  Male|
|Female|
|  Male|
+------+
only showing top 20 rows



In [15]:
# Selecting several columns: names may be passed as separate arguments
tips.select('sex', 'tip').show()

+------+----+
|   sex| tip|
+------+----+
|Female|1.01|
|  Male|1.66|
|  Male| 3.5|
|  Male|3.31|
|Female|3.61|
|  Male|4.71|
|  Male| 2.0|
|  Male|3.12|
|  Male|1.96|
|  Male|3.23|
|  Male|1.71|
|Female| 5.0|
|  Male|1.57|
|  Male| 3.0|
|Female|3.02|
|  Male|3.92|
|Female|1.67|
|  Male|3.71|
|Female| 3.5|
|  Male|3.35|
+------+----+
only showing top 20 rows



In [16]:
# Adding a column: '2 extra tips' is the tip plus 2.
# The result is only displayed, not assigned, so `tips` itself is left unchanged.
tips.withColumn('2 extra tips', tips.tip + 2).show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|      2 extra tips|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|              3.01|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|              3.66|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|               5.5|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|5.3100000000000005|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 5.609999999999999|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|              6.71|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|               4.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|              5.12|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|              3.96|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|              5.23|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|              3.71|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|               

In [17]:
# Dropping column
# `tips` never received '2 extra tips' (the earlier withColumn result was only
# displayed, not assigned), and drop() on a column that is not in the schema is
# a silent no-op. Add the column in the same chain so the drop removes something.
tips.withColumn('2 extra tips', tips['tip'] + 2).drop('2 extra tips').show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [18]:
# Renaming a column: 'sex' is shown as 'gender'.
# The renamed DataFrame is only displayed; `tips` keeps its original column name.
tips.withColumnRenamed(existing='sex', new='gender').show()

+----------+----+------+------+---+------+----+
|total_bill| tip|gender|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [19]:
# Datatypes: a list of (column name, Spark type name) pairs
tips.dtypes

[('total_bill', 'double'),
 ('tip', 'double'),
 ('sex', 'string'),
 ('smoker', 'string'),
 ('day', 'string'),
 ('time', 'string'),
 ('size', 'int')]

In [20]:
# Describe: count / mean / stddev / min / max per column.
# mean and stddev come back null for the string columns.
tips.describe().show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950824|2.9982786885245902|  null|  null|null|  null| 2.569672131147541|
| stddev| 8.902411954856857|1.3836381890011815|  null|  null|null|  null|0.9510998047322347|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



In [21]:
# Show the rows that remain after discarding any row containing a null/NaN value.
# The result is not assigned, so `tips` is unchanged.
tips.dropna().show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [22]:
# Register the DataFrame as a temporary view named "tips" so it can be queried through spark.sql
tips.createOrReplaceTempView("tips")

In [23]:
# SQL text: every column of the "tips" view, limited to two rows
query = "SELECT * FROM tips LIMIT 2"

In [24]:
# Run the SQL text against the session; the result is itself a DataFrame
tips2 = spark.sql(query)
tips2.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+



In [25]:
# SQL text: number of rows per (day, sex) pair, sorted by day.
# NOTE(review): ORDER BY names only `day`, so the relative order of the two
# sexes within a day is not pinned by the query.
query = """
SELECT day, sex, COUNT(*) as N FROM tips
GROUP BY day, sex
ORDER BY day
"""

In [26]:
# Execute the grouped count query and display the per-(day, sex) totals
tip_count = spark.sql(query)
tip_count.show()

+----+------+---+
| day|   sex|  N|
+----+------+---+
| Fri|  Male| 10|
| Fri|Female|  9|
| Sat|  Male| 59|
| Sat|Female| 28|
| Sun|  Male| 58|
| Sun|Female| 18|
|Thur|  Male| 30|
|Thur|Female| 32|
+----+------+---+



### Converting into pandas dataframe

In [27]:
# toPandas() brings the whole result into local memory as a pandas DataFrame;
# that is cheap here because the aggregate has only a handful of rows.
pd_tip_count = tip_count.toPandas()
pd_tip_count

Unnamed: 0,day,sex,N
0,Fri,Male,10
1,Fri,Female,9
2,Sat,Male,59
3,Sat,Female,28
4,Sun,Male,58
5,Sun,Female,18
6,Thur,Male,30
7,Thur,Female,32


### Imputation

In [28]:
# Small dataset containing missing values, used for the imputation demo
f_path = 'test.csv'
test = spark.read.options(header=True, inferSchema=True).csv(f_path)

In [29]:
# Inspect the raw rows; `age` and `score` contain nulls that are imputed next
test.show()

+----+----+-----+
|Name| age|score|
+----+----+-----+
|  as|  23|   97|
|  ds|  54|   45|
|  av|null| null|
|  bl|null|   87|
+----+----+-----+



In [30]:
# Median imputation: fill missing values in each listed column with that
# column's median (the strategy configured here is "median", not the mean).
from pyspark.ml.feature import Imputer

# Columns to impute; each gets a companion "<name>_imputed" output column.
impute_cols = ['age', 'score']

imputer = Imputer(
    strategy="median",
    inputCols=impute_cols,
    outputCols=["{}_imputed".format(c) for c in impute_cols],
)

In [31]:
# Fit learns the per-column fill values from `test`; transform then appends the
# *_imputed columns with the nulls replaced.
imputer_model = imputer.fit(test)
imputer_model.transform(test).show()

+----+----+-----+-----------+-------------+
|Name| age|score|age_imputed|score_imputed|
+----+----+-----+-----------+-------------+
|  as|  23|   97|         23|           97|
|  ds|  54|   45|         54|           45|
|  av|null| null|         23|           87|
|  bl|null|   87|         23|           87|
+----+----+-----+-----------+-------------+



### Filters

In [50]:
# Reload the tips data with typed columns for the filter examples
f_path = 'tips.csv'
tips = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(f_path)
)

In [51]:
# Two boolean column expressions, combined so that only rows matching both remain
filterA = tips['sex'] == "Female"
filterB = tips['day'] == "Sun"
tip_filter = tips.filter(filterA & filterB)

In [52]:
# Every displayed row should have sex == Female and day == Sun
tip_filter.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2|
|     34.81| 5.2|Female|    No|Sun|Dinner|   4|
|     25.71| 4.0|Female|    No|Sun|Dinner|   3|
|     17.31| 3.5|Female|    No|Sun|Dinner|   2|
|     29.85|5.14|Female|    No|Sun|Dinner|   5|
|      25.0|3.75|Female|    No|Sun|Dinner|   4|
|     13.39|2.61|Female|    No|Sun|Dinner|   2|
|     16.21| 2.0|Female|    No|Sun|Dinner|   3|
|     17.51| 3.0|Female|   Yes|Sun|Dinner|   2|
|       9.6| 4.0|Female|   Yes|Sun|Dinner|   2|
|      20.9| 3.5|Female|   Yes|Sun|Dinner|   3|
|     18.15| 3.5|Female|   Yes|Sun|Dinne

In [53]:
# Method 1 (SQL expression string): rows whose tip is greater than 5
tips.where("tip>5.00").show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|     39.42|7.58|  Male|    No| Sat|Dinner|   4|
|      30.4| 5.6|  Male|    No| Sun|Dinner|   4|
|      32.4| 6.0|  Male|    No| Sun|Dinner|   4|
|     34.81| 5.2|Female|    No| Sun|Dinner|   4|
|     48.27|6.73|  Male|    No| Sat|Dinner|   4|
|     34.83|5.17|Female|    No|Thur| Lunch|   4|
|     24.71|5.85|  Male|    No|Thur| Lunch|   2|
|     29.93|5.07|  Male|    No| Sun|Dinner|   4|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|
|     29.85|5.14|Female|    No| Sun|Dinner|   5|
|     50.81|10.0|  Male|   Yes| Sat|Dinner|   3|
|      7.25|5.15|  Male|   Yes| Sun|Dinner|   2|
|     23.33|5.65|  Male|   Yes| Sun|Dinner|   2|
|     23.17| 6.5|  Male|   Yes| Sun|Dinner|   4|
|     25.89|5.16|  Male|   Yes| Sat|Dinner|   4|
|     48.33| 9.0|  Male|    No| Sat|Dinner|   4|
|     28.17| 6.5|Female|   Yes| Sat|Dinner|   3|
|     29.03|5.92|  M

In [54]:
# Method 2 (Column expression): rows whose tip is greater than 5
tips.where(tips.tip > 5.00).show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|     39.42|7.58|  Male|    No| Sat|Dinner|   4|
|      30.4| 5.6|  Male|    No| Sun|Dinner|   4|
|      32.4| 6.0|  Male|    No| Sun|Dinner|   4|
|     34.81| 5.2|Female|    No| Sun|Dinner|   4|
|     48.27|6.73|  Male|    No| Sat|Dinner|   4|
|     34.83|5.17|Female|    No|Thur| Lunch|   4|
|     24.71|5.85|  Male|    No|Thur| Lunch|   2|
|     29.93|5.07|  Male|    No| Sun|Dinner|   4|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|
|     29.85|5.14|Female|    No| Sun|Dinner|   5|
|     50.81|10.0|  Male|   Yes| Sat|Dinner|   3|
|      7.25|5.15|  Male|   Yes| Sun|Dinner|   2|
|     23.33|5.65|  Male|   Yes| Sun|Dinner|   2|
|     23.17| 6.5|  Male|   Yes| Sun|Dinner|   4|
|     25.89|5.16|  Male|   Yes| Sat|Dinner|   4|
|     48.33| 9.0|  Male|    No| Sat|Dinner|   4|
|     28.17| 6.5|Female|   Yes| Sat|Dinner|   3|
|     29.03|5.92|  M

In [55]:
# Method 1 (SQL expression string): two conditions combined with `and`,
# then keep only the sex and time columns
(
    tips
    .filter("tip>5.00 and smoker=='No'")
    .select('sex', 'time')
    .show()
)

+------+------+
|   sex|  time|
+------+------+
|  Male|Dinner|
|  Male|Dinner|
|  Male|Dinner|
|Female|Dinner|
|  Male|Dinner|
|Female| Lunch|
|  Male| Lunch|
|  Male|Dinner|
|  Male| Lunch|
|Female|Dinner|
|  Male|Dinner|
|  Male|Dinner|
+------+------+



In [56]:
# Method 2 (Column expressions): each comparison needs its own parentheses
# because `&` binds tighter than `>` and `==` in Python
(
    tips
    .filter((tips['tip'] > 5.00) & (tips['smoker'] == 'No'))
    .select('sex', 'time')
    .show()
)

+------+------+
|   sex|  time|
+------+------+
|  Male|Dinner|
|  Male|Dinner|
|  Male|Dinner|
|Female|Dinner|
|  Male|Dinner|
|Female| Lunch|
|  Male| Lunch|
|  Male|Dinner|
|  Male| Lunch|
|Female|Dinner|
|  Male|Dinner|
|  Male|Dinner|
+------+------+



### Group By and Aggregate Function

In [59]:
# Group by sex; sum() without arguments totals every numeric column
(
    tips
    .groupBy('sex')
    .sum()
    .show()
)

+------+------------------+-----------------+---------+
|   sex|   sum(total_bill)|         sum(tip)|sum(size)|
+------+------------------+-----------------+---------+
|Female|1570.9499999999998|           246.51|      214|
|  Male|3256.8200000000024|485.0700000000001|      413|
+------+------------------+-----------------+---------+



In [60]:
# Group by sex and total a single column (tip)
tips.groupBy('sex').agg({'tip': 'sum'}).show()

+------+-----------------+
|   sex|         sum(tip)|
+------+-----------------+
|Female|           246.51|
|  Male|485.0700000000001|
+------+-----------------+



In [61]:
# Total of the tip column over the whole DataFrame (a grouping with no keys)
tips.groupBy().sum('tip').show()

+--------+
|sum(tip)|
+--------+
|  731.58|
+--------+



### PySpark ML

In [62]:
# Salary dataset for the regression example; header row used, column types inferred
training = spark.read.options(header=True, inferSchema=True).csv('Salary.csv')

In [63]:
# Inspect the training rows (Name, Age, Experience, Salary)
training.show()

+--------+---+----------+------+
|    Name|Age|Experience|Salary|
+--------+---+----------+------+
| Bharath| 31|        10| 30000|
|   Emlie| 30|         8| 25000|
|Florence| 29|         4| 20000|
|  Sydney| 24|         3| 20000|
|    Matt| 21|         1| 15000|
| Cillian| 23|         2| 18000|
+--------+---+----------+------+



In [64]:
from pyspark.ml.feature import VectorAssembler

# Pack the two numeric predictors into one vector column, "IndependentFeatures"
featureassembler = VectorAssembler(
    inputCols=['Age', 'Experience'],
    outputCol="IndependentFeatures",
)

In [65]:
# Append the assembled feature-vector column to the training rows
output = featureassembler.transform(training)

In [66]:
# Each row now carries [Age, Experience] as a vector in IndependentFeatures
output.show()

+--------+---+----------+------+-------------------+
|    Name|Age|Experience|Salary|IndependentFeatures|
+--------+---+----------+------+-------------------+
| Bharath| 31|        10| 30000|        [31.0,10.0]|
|   Emlie| 30|         8| 25000|         [30.0,8.0]|
|Florence| 29|         4| 20000|         [29.0,4.0]|
|  Sydney| 24|         3| 20000|         [24.0,3.0]|
|    Matt| 21|         1| 15000|         [21.0,1.0]|
| Cillian| 23|         2| 18000|         [23.0,2.0]|
+--------+---+----------+------+-------------------+



In [67]:
# Keep just what the regressor consumes: the feature vector and the label
finalized_data = output.select(["IndependentFeatures", "Salary"])
finalized_data.show()

+-------------------+------+
|IndependentFeatures|Salary|
+-------------------+------+
|        [31.0,10.0]| 30000|
|         [30.0,8.0]| 25000|
|         [29.0,4.0]| 20000|
|         [24.0,3.0]| 20000|
|         [21.0,1.0]| 15000|
|         [23.0,2.0]| 18000|
+-------------------+------+



In [68]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly a quarter of the rows for evaluation. The seed makes the
# random split repeatable across runs instead of changing on every execution.
# NOTE(review): the dataset has only six rows, so either side of the split can
# be very small; treat the metrics as illustrative.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `regressor` is the fitted
# model that the following cells inspect and evaluate.
salary_lr = LinearRegression(featuresCol='IndependentFeatures', labelCol='Salary')
regressor = salary_lr.fit(train_data)

In [69]:
# Fitted weights, one per entry of IndependentFeatures (assembled from Age, Experience in that order)
regressor.coefficients

DenseVector([-258.8832, 1670.0508])

In [70]:
# Fitted intercept (constant term) of the linear model
regressor.intercept

20543.147208121274

In [71]:
# Score the fitted model on the held-out rows; the returned summary exposes
# the predictions and the error metrics used in the next cells
pred_results = regressor.evaluate(test_data)

In [72]:
# Held-out rows with the model's `prediction` column next to the true Salary
pred_results.predictions.show()

+-------------------+------+------------------+
|IndependentFeatures|Salary|        prediction|
+-------------------+------+------------------+
|         [21.0,1.0]| 15000|16776.649746192812|
|         [24.0,3.0]| 20000|19340.101522842593|
+-------------------+------+------------------+





In [73]:
# Error on the held-out rows: (mean absolute error, mean squared error)
(
    pred_results.meanAbsoluteError,
    pred_results.meanSquaredError,
)

(1218.2741116751095, 1795975.1604008244)

### Linear Regression

In [76]:
# Reload the tips data with typed columns for the regression example
f_path = 'tips.csv'
tips = spark.read.options(header=True, inferSchema=True).csv(f_path)

In [77]:
# Quick look at the rows: sex, smoker, day and time are string-valued categories
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [78]:
from pyspark.ml.feature import StringIndexer

In [85]:
# Handling categorical values: map each string label to a numeric index.
# NOTE(review): in the displayed output the more common label gets the lower
# index (Male -> 0.0, Female -> 1.0), so the numbers reflect frequency rather
# than any natural order. Feeding them straight into a linear model treats
# `day` as if it were ordered; consider one-hot encoding instead.
indexer = StringIndexer(
    inputCols=['sex', 'smoker', 'day', 'time'],
    outputCols=['sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed'],
)

In [86]:
# Learn the label-to-index mappings from `tips`, then append the *_indexed columns
indexer_model = indexer.fit(tips)
tips_2 = indexer_model.transform(tips)
tips_2.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


In [87]:
from pyspark.ml.feature import VectorAssembler

# Combine the numeric columns and the indexed categoricals into one feature vector
featureAssembler = VectorAssembler(
    inputCols=[
        'tip',
        'size',
        'sex_indexed',
        'smoker_indexed',
        'day_indexed',
        'time_indexed',
    ],
    outputCol="Independent_Features",
)
output = featureAssembler.transform(tips_2)

In [88]:
# Keep just the feature vector and the regression target (total_bill)
finalized_data = output.select(["Independent_Features", "total_bill"])
finalized_data.show()

+--------------------+----------+
|Independent_Features|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
|[4.71,4.0,0.0,0.0...|     25.29|
|[2.0,2.0,0.0,0.0,...|      8.77|
|[3.12,4.0,0.0,0.0...|     26.88|
|[1.96,2.0,0.0,0.0...|     15.04|
|[3.23,2.0,0.0,0.0...|     14.78|
|[1.71,2.0,0.0,0.0...|     10.27|
|[5.0,4.0,1.0,0.0,...|     35.26|
|[1.57,2.0,0.0,0.0...|     15.42|
|[3.0,4.0,0.0,0.0,...|     18.43|
|[3.02,2.0,1.0,0.0...|     14.83|
|[3.92,2.0,0.0,0.0...|     21.58|
|[1.67,3.0,1.0,0.0...|     10.33|
|[3.71,3.0,0.0,0.0...|     16.29|
|[3.5,3.0,1.0,0.0,...|     16.97|
|(6,[0,1],[3.35,3.0])|     20.65|
+--------------------+----------+
only showing top 20 rows



In [89]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly a quarter of the rows for evaluation. The seed makes the
# random split repeatable across runs instead of changing on every execution.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name; `regressor` is the fitted
# model that the following cells inspect and evaluate.
bill_lr = LinearRegression(featuresCol='Independent_Features', labelCol='total_bill')
regressor = bill_lr.fit(train_data)

In [90]:
# Fitted weights, one per entry of Independent_Features (same order as the assembler's inputCols)
regressor.coefficients

DenseVector([3.0675, 3.1701, -1.7096, 1.9556, 0.0829, -1.6518])

In [91]:
# Fitted intercept (constant term) of the linear model
regressor.intercept

2.609481858663409

In [93]:
# Prediction: score the fitted model on the held-out rows; the returned summary
# exposes the predictions and the metrics read in the next cells
pred_res = regressor.evaluate(test_data)

In [94]:
# Held-out rows with the model's `prediction` column next to the true total_bill
pred_res.predictions.show()

+--------------------+----------+------------------+
|Independent_Features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.45,2.0])|      9.55|13.397607946970869|
|(6,[0,1],[1.75,2.0])|     17.82| 14.31786107786833|
|(6,[0,1],[1.97,2.0])|     12.02|  14.9927133738598|
|(6,[0,1],[2.24,3.0])|     16.04|18.991059169485712|
| (6,[0,1],[3.0,2.0])|      14.0|18.152249123274412|
| (6,[0,1],[3.0,4.0])|     20.45| 24.49248507891081|
|(6,[0,1],[6.73,4.0])|     48.27| 35.93429900640257|
|(6,[0,1],[7.58,4.0])|     39.42| 38.54168287727871|
|[1.0,2.0,1.0,1.0,...|      5.75|12.511739227919783|
|[1.32,2.0,0.0,0.0...|      9.68| 13.08168741242842|
|[1.36,3.0,1.0,0.0...|     18.64|13.095909258370925|
|[1.5,2.0,0.0,0.0,...|     12.46|13.799550935326465|
|[1.5,2.0,0.0,1.0,...|     11.59|15.506571954664349|
|[1.83,1.0,1.0,0.0...|     10.07| 8.197403207807211|
|[1.98,2.0,0.0,1.0...|     11.02|16.978976964100287|
|[2.0,2.0,0.0,0.0,...|      7.51|13.5986429349



In [95]:
# Fit quality on the held-out rows: (R^2, mean absolute error, mean squared error)
(
    pred_res.r2,
    pred_res.meanAbsoluteError,
    pred_res.meanSquaredError,
)

(0.5352515033959393, 4.35359568329469, 36.17349705467136)