In [3]:
from pyspark.sql import SparkSession
# Start a Spark session for this notebook, or reuse one that is already running;
# every later cell reads data through this handle.
spark = (
    SparkSession.builder
    .appName('abc')
    .getOrCreate()
)


In [4]:
# Load the tips data set. The first line of the file supplies the column
# names and Spark infers each column's type from the values.
df = spark.read.csv(
    'tips.csv',
    header=True,
    inferSchema=True,
)

# Preview the first two rows to confirm the file parsed as expected.
df.show(n=2)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 2 rows



In [7]:
from pyspark.ml.feature import StringIndexer
# Encode the four string-valued columns as numeric label indices so they can
# be fed to the regression later. Each input column gets a matching output
# column with a trailing "q".
si = StringIndexer(
    inputCols=['sex', 'smoker', 'day', 'time'],
    outputCols=['sexq', 'smokerq', 'dayq', 'timeq'],
)

# Fitting learns the label -> index mapping from df; transform then appends
# the index columns and leaves the original columns in place.
indexer_model = si.fit(df)
df1 = indexer_model.transform(df)


DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: int, sexq: double, smokerq: double, dayq: double, timeq: double]

In [8]:
# Preview two rows to check the new index columns next to their string sources.
df1.show(n=2)

+----------+----+------+------+---+------+----+----+-------+----+-----+
|total_bill| tip|   sex|smoker|day|  time|size|sexq|smokerq|dayq|timeq|
+----------+----+------+------+---+------+----+----+-------+----+-----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 1.0|    0.0| 1.0|  0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.0|    0.0| 1.0|  0.0|
+----------+----+------+------+---+------+----+----+-------+----+-----+
only showing top 2 rows



In [9]:
# List every column name after indexing; the four names ending in "q" are the
# index columns added by the StringIndexer step.
df1.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sexq',
 'smokerq',
 'dayq',
 'timeq']

In [11]:
from pyspark.ml.feature import VectorAssembler
# Predictors for the regression: the two numeric columns plus the four
# label-index columns produced by the StringIndexer step.
# NOTE(review): dayq (and the other *q columns) are label indices fed in as
# plain numbers; for a linear model a one-hot encoding may be more
# appropriate for multi-valued categories such as day -- confirm intent.
feature_cols = ['tip', 'size', 'sexq', 'smokerq', 'dayq', 'timeq']

# Pack the predictors into a single vector column named 'all_input'.
va = VectorAssembler(inputCols=feature_cols, outputCol='all_input')
df2 = va.transform(df1)

In [13]:
# Keep only what the regression needs: the assembled feature vector and the
# label column (total_bill).
df_final = df2.select(
    'all_input',
    'total_bill',
)

# Preview three rows of the modelling frame.
df_final.show(n=3)

+--------------------+----------+
|           all_input|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
+--------------------+----------+
only showing top 3 rows



In [19]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly 20% of the rows for evaluation. randomSplit samples at
# random, so without a fixed seed the split -- and therefore the fitted
# coefficients and predictions shown below -- would change on every
# Restart & Run All. (Assumes the input frame is read the same way each run,
# which holds for a single local CSV -- confirm if the source changes.)
train, test = df_final.randomSplit([0.80, 0.20], seed=42)

# Keep the unfitted estimator under its own name so that `model` only ever
# refers to the fitted model used by the cells below.
lr = LinearRegression(featuresCol='all_input', labelCol='total_bill')
model = lr.fit(train)


In [20]:
# Fitted weights, one per entry of the all_input vector, in the order the
# assembler was given its inputs: tip, size, sexq, smokerq, dayq, timeq.
model.coefficients

DenseVector([2.9627, 3.4183, -1.1303, 1.6915, 0.1884, -1.8407])

In [21]:
# Fitted intercept (constant term) of the linear model.
model.intercept

2.166718784631114

In [22]:
# Score the held-out rows. evaluate() returns a summary object; its
# `predictions` attribute is the test frame with a `prediction` column added.
pred = model.evaluate(test)

# Compare actual total_bill against the model's prediction for ten rows.
pred.predictions.show(n=10)

+--------------------+----------+------------------+
|           all_input|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.25,2.0])|     10.07| 12.70655144000765|
|(6,[0,1],[1.25,2.0])|     10.51| 12.70655144000765|
|(6,[0,1],[1.97,2.0])|     12.02|14.839666850218515|
| (6,[0,1],[2.0,3.0])|     16.31| 18.34680030419086|
|(6,[0,1],[2.34,4.0])|     17.81| 22.77235844867066|
| (6,[0,1],[2.5,4.0])|     18.35|23.246384095384187|
| (6,[0,1],[3.0,4.0])|     20.45|24.727714241363955|
| (6,[0,1],[5.0,3.0])|     31.27|27.234781180069476|
|[1.0,1.0,1.0,0.0,...|      7.25| 7.417315858192289|
|[1.0,2.0,0.0,1.0,...|      12.6|13.657344908619063|
+--------------------+----------+------------------+
only showing top 10 rows

