# Linear regression with PySpark - solutions

In [2]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

## Your turn 1

In [4]:
# Load the grades table: space-separated values, first row used as the
# header, column types inferred from the data.
grades_path = "/FileStore/tables/m7yuthhq1490310580121/grades.txt"
grades = spark.read.csv(grades_path, sep=' ', header=True, inferSchema=True)

# The 'id' column is removed before any modelling
grades = grades.drop('id')

# Quick look at the first rows
grades.show(5)

### Part I

In [6]:
# Fit one single-feature linear regression per subject and report the RMSE
# of each. The estimator and the evaluator do not depend on the subject,
# so they are instantiated once, outside the loop.

# Instantiating the estimator
SAT_lr = LinearRegression(featuresCol='features',
                          labelCol='SAT',
                          predictionCol='predicted SAT')

# Instantiating the evaluator
SAT_evaluator = RegressionEvaluator(predictionCol='predicted SAT',
                                    labelCol='SAT',
                                    metricName='rmse')

for subject in ['Math', 'English', 'Literature']:
    # Preparing the data: the single grade column becomes the feature vector
    grades2 = VectorAssembler(inputCols=[subject],
                              outputCol='features')\
                    .transform(grades)

    # Fitting the model and using it for prediction (on the same rows it
    # was fitted on)
    grades2 = SAT_lr.fit(grades2).transform(grades2)

    # Evaluating the model
    rmse = SAT_evaluator.evaluate(grades2)

    # print(...) with a single argument gives the same output on
    # Python 2 and Python 3, unlike the print statement.
    print("The RMSE of a model based on the {:^10} grade alone is {:.2f}"
          .format(subject, rmse))

### Part II

In [8]:
# Fit a single linear regression on all three grades together and report
# its RMSE.

# Preparing the data: the three grade columns are packed into one
# feature vector
grades2 = VectorAssembler(inputCols=['Math', 'English', 'Literature'],
                          outputCol='features')\
                .transform(grades)

# Instantiating the estimator
SAT_lr = LinearRegression(featuresCol='features',
                          labelCol='SAT',
                          predictionCol='predicted SAT')

# Fitting the model and using it for prediction (on the same rows it was
# fitted on)
grades2 = SAT_lr.fit(grades2).transform(grades2)

# Evaluating the model
rmse = RegressionEvaluator(predictionCol='predicted SAT',
                           labelCol='SAT',
                           metricName='rmse')\
            .evaluate(grades2)

# print(...) with a single argument gives the same output on Python 2 and
# Python 3, unlike the print statement.
print("The RMSE of a model based on all the grades is {:.2f}"
      .format(rmse))

## Your turn 2

In [10]:
# Load the prices table: first row used as the header, column types
# inferred from the data, double quote as the quoting character.
prices_path = "/FileStore/tables/7frd8efu1490310817439/prices.csv"
prices = spark.read.csv(prices_path, header=True, inferSchema=True, quote='"')\
              .withColumnRenamed('Air.conditioner', 'AC')
# NOTE(review): the rename presumably avoids the dot in the column name,
# which is awkward to reference in Spark column expressions - confirm.

# Quick look at the first rows
prices.show(5)

We index and one-hot encode every categorical feature (_City_, _Balcony_, _Parking_ and _AC_); _Rooms_ is left untouched for now.

In [12]:
# For each categorical column, add an indexed column (<name>_ix) and a
# one-hot encoded column (<name>_ohe) to `prices`.
# NOTE(review): calling OneHotEncoder.transform directly matches the
# Spark 2.x API; on Spark 3.x OneHotEncoder is an Estimator and needs
# .fit() first - confirm the target Spark version.
categorical_features = ['City', 'Balcony', 'Parking', 'AC']

for feature in categorical_features:
    ix_col = feature + '_ix'
    ohe_col = feature + '_ohe'

    # String labels -> numeric category indices
    si = StringIndexer(inputCol=feature, outputCol=ix_col)
    prices = si.fit(prices).transform(prices)

    # Category indices -> one-hot vectors
    ohe = OneHotEncoder(inputCol=ix_col, outputCol=ohe_col)
    prices = ohe.transform(prices)

prices.show(5)

### Part I

In [14]:
# Fit a linear regression for Price using the one-hot encoded categorical
# columns plus the raw Rooms column, and report its RMSE.

# Preparing the data: Rooms enters the feature vector as-is, next to the
# one-hot encoded columns
va = VectorAssembler(inputCols=['City_ohe', 'Balcony_ohe', 'Parking_ohe', 'AC_ohe', 'Rooms'],
                     outputCol='features')
prices_int_rooms = va.transform(prices)

# Instantiating the estimator
prices_lr = LinearRegression(featuresCol='features',
                             labelCol='Price',
                             predictionCol='predicted Price')

# Fitting the model and using it for prediction (on the same rows it was
# fitted on)
prices_int_rooms = prices_lr.fit(prices_int_rooms).transform(prices_int_rooms)

# Evaluating the model
rmse = RegressionEvaluator(predictionCol='predicted Price',
                           labelCol='Price',
                           metricName='rmse')\
            .evaluate(prices_int_rooms)

# The message used to talk about "grades" (copy-paste from the SAT
# exercise); it now describes this model. print(...) with a single
# argument gives the same output on Python 2 and Python 3.
print("The RMSE of a model with Rooms as a numeric feature is {:.2f}"
      .format(rmse))

### Part II

In [16]:
# Treat Rooms as a categorical feature: add an indexed column (Rooms_ix)
# and a one-hot encoded column (Rooms_ohe) on a separate DataFrame, so
# `prices` itself is not modified.
# NOTE(review): calling OneHotEncoder.transform directly matches the
# Spark 2.x API; on Spark 3.x OneHotEncoder is an Estimator and needs
# .fit() first - confirm the target Spark version.
rooms_ix_col = 'Rooms_ix'

# Rooms values -> numeric category indices
si = StringIndexer(inputCol='Rooms', outputCol=rooms_ix_col)
rooms_indexer_model = si.fit(prices)
prices_cat_rooms = rooms_indexer_model.transform(prices)

# Category indices -> one-hot vectors
ohe = OneHotEncoder(inputCol=rooms_ix_col, outputCol='Rooms_ohe')
prices_cat_rooms = ohe.transform(prices_cat_rooms)

prices_cat_rooms.show(5)

In [17]:
# Fit a linear regression for Price using only one-hot encoded columns,
# including the one-hot encoded Rooms column, and report its RMSE.

# Preparing the data: Rooms enters the feature vector through Rooms_ohe
va = VectorAssembler(inputCols=['City_ohe', 'Balcony_ohe', 'Parking_ohe', 'AC_ohe', 'Rooms_ohe'],
                     outputCol='features')
prices_cat_rooms = va.transform(prices_cat_rooms)

# Instantiating the estimator
prices_lr = LinearRegression(featuresCol='features',
                             labelCol='Price',
                             predictionCol='predicted Price')

# Fitting the model and using it for prediction (on the same rows it was
# fitted on)
prices_cat_rooms = prices_lr.fit(prices_cat_rooms).transform(prices_cat_rooms)

# Evaluating the model
rmse = RegressionEvaluator(predictionCol='predicted Price',
                           labelCol='Price',
                           metricName='rmse')\
            .evaluate(prices_cat_rooms)

# The message used to talk about "grades" (copy-paste from the SAT
# exercise); it now describes this model. print(...) with a single
# argument gives the same output on Python 2 and Python 3.
print("The RMSE of a model with Rooms as a categorical feature is {:.2f}"
      .format(rmse))