In [20]:
import pandas as pd
from pyspark.ml.regression import LinearRegression
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
import six
sqlContext = SQLContext(sc)

In [21]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('../data/dataset1.csv')

In [22]:
df.cache()
df.printSchema()

root
 |-- year: integer (nullable = true)
 |-- Money_printed: integer (nullable = true)
 |-- GDP: integer (nullable = true)
 |-- Interest_RATE: double (nullable = true)
 |-- WPI: double (nullable = true)



Descriptive statistics for dataframe

In [23]:
df.describe().toPandas().transpose()

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
year,35,1987.0,10.246950765959598,1970,2004
Money_printed,35,146641.82857142857,176494.31999767112,7374,647495
GDP,35,1079313.4857142856,560456.1953579775,474131,2389660
Interest_RATE,35,9.785714285714286,1.9599790450774868,5.375,13.0
WPI,35,77.37142857142858,54.1085541319679,14.3,187.3


In [24]:
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to Money_Printed for ", i, df.stat.corr('Money_printed',i))

Correlation to Money_Printed for  year 0.8743167817297022
Correlation to Money_Printed for  Money_printed 1.0
Correlation to Money_Printed for  GDP 0.976212687611339
Correlation to Money_Printed for  Interest_RATE -0.2991204055878383
Correlation to Money_Printed for  WPI 0.9546405759266342


# Model 1
Separate DataFrame into features and target for Model 1

In [25]:
vAssembler = VectorAssembler(inputCols=['year','GDP','Interest_RATE','WPI'],outputCol = 'features')
v_df = vAssembler.transform(df)
v_df = v_df.select(['features','Money_Printed'])
v_df.show(10)

+--------------------+-------------+
|            features|Money_Printed|
+--------------------+-------------+
|[1970.0,474131.0,...|         7374|
|[1971.0,478918.0,...|         8323|
|[1972.0,477392.0,...|         9700|
|[1973.0,499120.0,...|        11200|
|[1974.0,504914.0,...|        11975|
|[1975.0,550379.0,...|        13325|
|[1976.0,557258.0,...|        16024|
|[1977.0,598885.0,...|        14388|
|[1978.0,631839.0,...|        17292|
|[1979.0,598974.0,...|        20000|
+--------------------+-------------+
only showing top 10 rows



Setting up the regression model

In [26]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(featuresCol = 'features',labelCol = 'Money_Printed')
lr_model = lr.fit(v_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-8849.943441049678,0.46695301166639036,-4920.335722554735,-74.16652385867204]
Intercept: 17281378.132724542


In [32]:
m1_summary = lr_model.summary
print("r2: %f" % m1_summary.r2)

r2: 0.992933


# Model 2


In [29]:
vAssembler = VectorAssembler(inputCols=['year','GDP','Interest_RATE'],outputCol = 'features')
v_df = vAssembler.transform(df)
v_df = v_df.select(['features','Money_Printed'])
v_df.show(10)

+--------------------+-------------+
|            features|Money_Printed|
+--------------------+-------------+
|[1970.0,474131.0,...|         7374|
|[1971.0,478918.0,...|         8323|
|[1972.0,477392.0,...|         9700|
|[1973.0,499120.0,...|        11200|
|[1974.0,504914.0,...|        11975|
|[1975.0,550379.0,...|        13325|
|[1976.0,557258.0,...|        16024|
|[1977.0,598885.0,...|        14388|
|[1978.0,631839.0,...|        17292|
|[1979.0,598974.0,...|        20000|
+--------------------+-------------+
only showing top 10 rows



In [30]:
lr = LinearRegression(featuresCol = 'features',labelCol = 'Money_Printed')
lr_model = lr.fit(v_df)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

Coefficients: [-8844.925443609924,0.45967465096523946,-5100.486324043806]
Intercept: 17275287.537081845


In [36]:
m2_summary = lr_model.summary
print("model 1 r2: %f" % m1_summary.r2)
print("model 2 r2: %f" % m2_summary.r2)

model 1 r2: 0.992933
model 2 r2: 0.992933
