# County Gross Rent Prediction
Regressin - Linear Regression using pyspark

In [1]:
# Import Spark SQL and Spark ML libraries
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [2]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [3]:
from pyspark import SparkContext 
from pyspark.sql import SQLContext 
import pandas as pd  

In [6]:
sqlc=SQLContext(sc)

In [11]:
df=pd.read_csv(r'C:\Users\bhargavi\Final Project\train\train_values_mydata3.csv')

In [12]:
sdf=sqlc.createDataFrame(df)
sdf.show()

+------+----------+--------------------------+-------------------+---------+------------------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------------+-------------------+--------------------+-------------------+-------------------+-------------------+-------------------+----------------------+-----------------------+-------------------+-------------------------+---------------------------+------------------------------------------+-----------------------------------+----------------------------+------------------------------+----------+
|row_id|population|renter_occupied_households|pct_renter_occupied|evictions|       rent_burden|          pct_white|           pct_af_am|        pct_hispanic|          pct_am_ind|           pct_asian|           pct_nh_pi|        pct_multiple|           pct_other|      poverty_rate| pct_civilian_labor|    pct_unemployment|  pc

In [15]:
data = sdf.select("row_id", "population", "renter_occupied_households", "pct_renter_occupied", "evictions", "rent_burden", "pct_white", "pct_af_am", "pct_hispanic", "pct_am_ind", "pct_asian", "pct_nh_pi", "pct_multiple", "pct_other", "poverty_rate", "pct_civilian_labor", "pct_unemployment", "pct_adult_obesity", "pct_adult_smoking", "pct_diabetes", "pct_low_birthweight", "pct_excessive_drinking", "pct_physical_inactivity", "pct_female", "pct_below_18_years_of_age", "pct_aged_65_years_and_older", "pct_adults_less_than_a_high_school_diploma", "pct_adults_with_high_school_diploma", "pct_adults_with_some_college", "pct_adults_bachelors_or_higher", col("gross_rent").alias("label"))

In [16]:
# Split the data
splits = data.randomSplit([0.75, 0.25])
train = splits[0]
test = splits[1].withColumnRenamed("label", "trueLabel")

In [17]:
assembler = VectorAssembler(inputCols = ["row_id", "population", "renter_occupied_households", "pct_renter_occupied", "evictions", "rent_burden", "pct_white", "pct_af_am", "pct_hispanic", "pct_am_ind", "pct_asian", "pct_nh_pi", "pct_multiple", "pct_other", "poverty_rate", "pct_civilian_labor", "pct_unemployment", "pct_adult_obesity", "pct_adult_smoking", "pct_diabetes", "pct_low_birthweight", "pct_excessive_drinking", "pct_physical_inactivity", "pct_female", "pct_below_18_years_of_age", "pct_aged_65_years_and_older", "pct_adults_less_than_a_high_school_diploma", "pct_adults_with_high_school_diploma", "pct_adults_with_some_college", "pct_adults_bachelors_or_higher"], outputCol="features")
lr = LinearRegression(labelCol="label",featuresCol="features", maxIter=7, regParam=0.3)
pipeline = Pipeline(stages=[assembler, lr])

In [18]:
piplineModel = pipeline.fit(train)

In [19]:
prediction = piplineModel.transform(test)
predicted = prediction.select("features", "prediction", "trueLabel")
predicted.show()

+--------------------+------------------+---------+
|            features|        prediction|trueLabel|
+--------------------+------------------+---------+
|[2.0,27023.0,2927...| 687.9427991085346|      700|
|[3.0,8735.0,1039....|   585.52966539623|      592|
|[5.0,8540.0,751.0...| 607.2463443190838|      531|
|[17.0,87762.0,114...| 726.4344753336368|      694|
|[21.0,7816.0,970....| 635.1054831197239|      691|
|[23.0,2318272.0,4...|1452.9568381141546|     1367|
|[25.0,537958.0,76...| 830.8014862391816|      744|
|[35.0,3302.0,276....|1201.0991233531731|      930|
|[36.0,60011.0,730...| 544.3315624583856|      579|
|[40.0,20981.0,243...| 685.6520196042825|      558|
|[45.0,325767.0,58...| 841.0790669854372|      869|
|[46.0,7306.0,1135...| 577.5145121130297|      592|
|[51.0,18303.0,172...| 726.3013645113789|      713|
|[55.0,5114.0,576....| 588.6704733029253|      659|
|[62.0,313101.0,48...|1205.2782174783329|     1187|
|[65.0,3213.0,403....| 566.4342203411275|      582|
|[68.0,5741.