In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName('multiModel')
    .getOrCreate()
)

In [3]:
# Show the session object as the cell output to confirm it was created.
spark

In [4]:
# Load the tips dataset: the first row holds column names, and Spark is asked
# to infer column types instead of reading everything as strings.
df = spark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)

# Preview the first rows to sanity-check the parse.
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [5]:
from pyspark.ml.feature import StringIndexer

# Convert each categorical string column into a numeric label-index column.
categorical_cols = ['sex', 'smoker', 'day', 'time']
indexed_cols = ['sex_index', 'smoker_index', 'day_index', 'time_index']

indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

# Learn the label-to-index mapping from the full frame, then apply it.
indexer_model = indexer.fit(df)
df_encoded = indexer_model.transform(df)

In [6]:
from pyspark.ml.feature import OneHotEncoder

# One-hot encode the label indices so the linear model does not treat the
# category indices as ordered numeric values.
index_cols = ['sex_index', 'smoker_index', 'day_index', 'time_index']
onehot_cols = ['sex_onehot', 'smoker_onehot', 'day_onehot', 'time_onehot']

onehot = OneHotEncoder(inputCols=index_cols, outputCols=onehot_cols)

# Fit the encoder on the indexed frame, then append the vector columns.
onehot_model = onehot.fit(df_encoded)
df_onehot = onehot_model.transform(df_encoded)

In [7]:
# Inspect the schema to confirm the *_index and *_onehot columns were added.
df_onehot.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)
 |-- sex_index: double (nullable = false)
 |-- smoker_index: double (nullable = false)
 |-- day_index: double (nullable = false)
 |-- time_index: double (nullable = false)
 |-- sex_onehot: vector (nullable = true)
 |-- smoker_onehot: vector (nullable = true)
 |-- day_onehot: vector (nullable = true)
 |-- time_onehot: vector (nullable = true)



In [8]:
# List the column names; handy when picking inputs for the feature assembler.
df_onehot.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_index',
 'smoker_index',
 'day_index',
 'time_index',
 'sex_onehot',
 'smoker_onehot',
 'day_onehot',
 'time_onehot']

In [9]:
from pyspark.ml.feature import VectorAssembler

# Inputs for the model: the two numeric columns as-is, followed by the
# one-hot vectors for each categorical column. `tip` is the label, so it is
# deliberately left out.
feature_inputs = [
    'total_bill',
    'size',
    'sex_onehot',
    'smoker_onehot',
    'day_onehot',
    'time_onehot',
]

# Pack all inputs into the single vector column the regressor reads.
assembler = VectorAssembler(inputCols=feature_inputs, outputCol='Independent features')
output = assembler.transform(df_onehot)

In [10]:
# Preview only the assembled feature vectors.
feature_preview = output.select('Independent features')
feature_preview.show()

+--------------------+
|Independent features|
+--------------------+
|[16.99,2.0,0.0,1....|
|[10.34,3.0,1.0,1....|
|[21.01,3.0,1.0,1....|
|[23.68,2.0,1.0,1....|
|[24.59,4.0,0.0,1....|
|[25.29,4.0,1.0,1....|
|[8.77,2.0,1.0,1.0...|
|[26.88,4.0,1.0,1....|
|[15.04,2.0,1.0,1....|
|[14.78,2.0,1.0,1....|
|[10.27,2.0,1.0,1....|
|[35.26,4.0,0.0,1....|
|[15.42,2.0,1.0,1....|
|[18.43,4.0,1.0,1....|
|[14.83,2.0,0.0,1....|
|[21.58,2.0,1.0,1....|
|[10.33,3.0,0.0,1....|
|[16.29,3.0,1.0,1....|
|[16.97,3.0,0.0,1....|
|[20.65,3.0,1.0,1....|
+--------------------+
only showing top 20 rows



In [11]:
# Keep only what the regressor needs: the assembled feature vector and the label.
model_columns = ('Independent features', 'tip')
finalized_data = output.select(*model_columns)

In [12]:
from pyspark.ml.regression import LinearRegression

# Unfitted linear-regression estimator: predicts `tip` from the assembled
# feature vector column.
regr = LinearRegression(
    featuresCol='Independent features',
    labelCol='tip',
)

In [13]:
# Hold out roughly a quarter of the rows for evaluation. Without an explicit
# seed, randomSplit picks a new random seed on every call, so the fitted
# coefficients and the test metrics would change on every Restart & Run All.
# Pinning the seed keeps the split, and everything downstream, reproducible.
SPLIT_SEED = 42
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

In [14]:
# Fit the estimator on the training split.
# NOTE(review): this rebinds `regr` from the unfitted estimator to the fitted
# model. Later cells read `regr.coefficients`, `regr.intercept` and call
# `regr.evaluate`, so the name is kept here. Re-running this cell on its own
# will likely fail, since `regr` then refers to the fitted model rather than the
# estimator - re-run the cell that constructs `regr` first.
regr=regr.fit(train_data)

In [15]:
# Learned weights, one per slot of the assembled feature vector.
regr.coefficients

DenseVector([0.0795, 0.2782, 0.104, 0.1406, -0.5185, -0.2089, -0.1727, 0.162])

In [16]:
# Learned bias term of the fitted linear model.
regr.intercept

0.7718044489786926

In [18]:
# Evaluate the fitted model on the held-out split.
# NOTE(review): despite its name, `predictions` is used below as an evaluation
# summary (it exposes `.predictions` plus metric attributes such as
# `meanAbsoluteError`), not as the predicted rows themselves.
predictions= regr.evaluate(test_data)

In [19]:
# Pull the scored rows (features, actual tip, predicted tip) out of the
# evaluation summary and preview them.
scored_test = predictions.predictions
scored_test.show()

+--------------------+----+------------------+
|Independent features| tip|        prediction|
+--------------------+----+------------------+
|(8,[0,1,2],[12.16...| 2.2|2.3984645091540697|
|(8,[0,1,2,6],[10....| 2.0|2.0811585372635255|
|(8,[0,1,2,6],[16....| 2.0| 2.530881769036042|
|(8,[0,1,2,6],[19....| 3.0|2.8042117897599734|
|(8,[0,1,2,7],[12....| 1.5|2.5501019179366997|
|(8,[0,1,2,7],[21....| 3.0|3.2636204022683586|
|(8,[0,1,2,7],[28....| 3.0| 3.896093357199317|
|(8,[0,1,3],[15.98...| 3.0| 3.016816670841422|
|(8,[0,1,3,6],[8.5...|1.25| 1.972337863329255|
|(8,[0,1,3,6],[13....|1.68|2.3624687940718436|
|(8,[0,1,3,6],[14....| 2.0| 2.420471967074306|
|(8,[0,1,3,6],[17....| 3.5|2.6842672196334494|
|(8,[0,1,3,6],[18....|1.36| 3.055475377102974|
|(8,[0,1,3,7],[22....|3.25|  3.43845908123654|
|(8,[0,1,4,7],[14....| 4.0| 2.108766885710804|
|(8,[0,1,4,7],[22....|2.88| 2.729321380435777|
|(8,[0,1,4,7],[22....|3.48| 2.753158300847748|
|(8,[0,1,4,7],[26....|3.14|3.1059447229449155|
|(8,[0,1,4,7]

In [20]:
# Held-out error metrics read off the evaluation summary.
mae = predictions.meanAbsoluteError
mse = predictions.meanSquaredError
adj_r2 = predictions.r2adj

# Displayed as (MAE, MSE, adjusted R^2).
(mae, mse, adj_r2)

(0.7985874322281933, 1.2656114935381209, 0.2862391893783953)