In [1]:
import pyspark
import matplotlib.pyplot as plt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

#MLlib
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
#from pyspark.ml.classification import 

In [2]:
# Reuse an already-running SparkContext when there is one, so that re-running
# this cell in the same kernel does not fail because a context is still active.
# A new local context using all available cores is created otherwise.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))
# SQLContext is the entry point used below to read the CSV into a DataFrame.
sqlContext = pyspark.SQLContext(sc)

In [3]:
def print_train_info(model):
    """Print a fitted linear model's parameters and its training metrics.

    Args:
        model: object exposing ``coefficients``, ``intercept`` and ``summary``.
            The summary must provide ``totalIterations``,
            ``rootMeanSquaredError``, ``meanAbsoluteError`` and ``r2``.
            If the model exposes a falsy ``hasSummary`` attribute, the
            training metrics are skipped and a notice is printed instead of
            failing while reading ``summary``.

    Returns:
        None. Everything is written to stdout.
    """
    print("Model:")
    print("Coefficients: %s" % str(model.coefficients))
    print("Intercept: %s" % str(model.intercept))
    print("  ")
    print("Model info")
    # Objects without a ``hasSummary`` attribute are treated as having a
    # summary, which keeps the previous behaviour for them.
    if not getattr(model, "hasSummary", True):
        print("No training summary available for this model")
        return
    trainingSummary = model.summary
    print("numIterations: %d" % trainingSummary.totalIterations)
    print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
    print("MAE: %f" % trainingSummary.meanAbsoluteError)
    print("r2: %f" % trainingSummary.r2)

def print_eval_info(model_eval):
    """Print the error metrics held by an evaluation summary.

    Args:
        model_eval: object exposing ``rootMeanSquaredError``,
            ``meanAbsoluteError`` and ``r2``.

    Returns:
        None. Everything is written to stdout.
    """
    print("Eval info:")
    # (printed label, attribute name) pairs, in output order.
    metrics = (
        ("RMSE", "rootMeanSquaredError"),
        ("MAE", "meanAbsoluteError"),
        ("r2", "r2"),
    )
    for label, attribute in metrics:
        print("%s: %f" % (label, getattr(model_eval, attribute)))

In [4]:
# Read the prediction dataset: the first row holds the column names and the
# column types are inferred from the data.
csv_reader = sqlContext.read.format("csv").options(header="true", inferSchema="true")
df = csv_reader.load("data/curitiba/prediction_data.csv")

# Add an integer copy of the 'date' column under the name 'date_timestamp'.
date_as_int = df['date'].cast('Integer')
df = df.withColumn('date_timestamp', date_as_int)

In [5]:
# printSchema() and show() write to stdout themselves and return None, so call
# them directly: printing their return value only adds a stray "None" line to
# the output (and the bare `print x` statement form is Python 2-only).
df.printSchema()
df.show(1)

root
 |-- route: string (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- departure: integer (nullable = true)
 |-- arrival: integer (nullable = true)
 |-- totalpassengers: integer (nullable = true)
 |-- week_day: string (nullable = true)
 |-- group_15_minutes: integer (nullable = true)
 |-- duration: double (nullable = true)
 |-- difference_previous_schedule: string (nullable = true)
 |-- difference_next_schedule: string (nullable = true)
 |-- date_timestamp: integer (nullable = true)

None
+-----+--------------------+---------+-------+---------------+--------+----------------+----------------+----------------------------+------------------------+--------------+
|route|                date|departure|arrival|totalpassengers|week_day|group_15_minutes|        duration|difference_previous_schedule|difference_next_schedule|date_timestamp|
+-----+--------------------+---------+-------+---------------+--------+----------------+----------------+----------------------------+-------

In [6]:
# Columns read as strings; each one gets a numeric "<column>_index" twin.
string_columns = ["route", "week_day", "difference_previous_schedule", "difference_next_schedule"]

# Fit one StringIndexer per string column on the full DataFrame.
indexers = []
for column in string_columns:
    indexer = StringIndexer(inputCol=column, outputCol=column+"_index")
    indexers.append(indexer.fit(df))

# Chain the fitted indexers and apply them to obtain the indexed DataFrame.
pipeline = Pipeline(stages=indexers)
pipeline_model = pipeline.fit(df)
df_r = pipeline_model.transform(df)

In [7]:
# Input columns for the models: the "_index" columns produced by the string
# indexers plus the columns that are already numeric.
features = [
    "route_index",
    "date_timestamp",
    "week_day_index",
    "group_15_minutes",
    "difference_next_schedule_index",
    "difference_previous_schedule_index",
]

# Pack the input columns into a single vector column named 'features'.
assembler = VectorAssembler(inputCols=features, outputCol='features')
assembled_df = assembler.transform(df_r)

In [8]:
# Random 60% / 40% split into training and test sets, with a fixed seed.
split_weights = [0.6, 0.4]
train, test = assembled_df.randomSplit(split_weights, seed=0)

## Trip Duration model

### Lasso regression

In [9]:
# Linear regression predicting 'duration' from the assembled 'features' vector.
# elasticNetParam=1.0 is the Lasso setting named in this section's heading.
duration_lr = LinearRegression(
    featuresCol="features",
    labelCol="duration",
    maxIter=10,
    regParam=0.01,
    elasticNetParam=1.0,
)
duration_lrModel = duration_lr.fit(train)

In [10]:
# Show the fitted duration model's coefficients, intercept and training metrics.
print_train_info(duration_lrModel)

Model:
Coefficients: [-0.0700377141198,3.0096734781e-07,-0.531780519936,-2.41339299044e-05,-0.000596150175759,-0.000530622183158]
Intercept: -401.380450276
  
Model info
numIterations: 11
RMSE: 20.849891
MAE: 13.726247
r2: 0.054802


In [11]:
# Evaluate the duration model on the held-out test split.
duration_model_eval = duration_lrModel.evaluate(test)

In [12]:
# Report RMSE, MAE and r2 of the duration model on the test split.
print_eval_info(duration_model_eval)

Eval info:
RMSE: 20.260875
MAE: 13.681778
r2: 0.053832


Saving model:

In [29]:
# Persist the fitted duration model; overwrite() replaces a model previously
# saved at the same path.
duration_lrModel.write().overwrite().save("data/models/duration_lasso_model")

Loading model:

In [30]:
# Read the duration model back from the path it was saved to.
duration_lrModel_loaded = LinearRegressionModel.load("data/models/duration_lasso_model")

Evaluating loaded model:

In [31]:
# Evaluate the reloaded duration model on the same test split, so its metrics
# can be compared with those of the in-memory model.
duration_model_loaded_eval = duration_lrModel_loaded.evaluate(test)

print_eval_info(duration_model_loaded_eval)

Eval info:
RMSE: 20.260875
MAE: 13.681778
r2: 0.053832


### SVM regression

In [26]:
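# NOTE: this experiment is disabled. When it was run it failed with
# "TypeError: __init__() got an unexpected keyword argument 'maxIter'".
#duration_svm = SVMModel(maxIter=10).setLabelCol("duration").setFeaturesCol("features")

#duration_svmModel = duration_svm.fit(train)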

TypeError: __init__() got an unexpected keyword argument 'maxIter'

## Trip Crowdedness model

In [32]:
# Linear regression predicting 'totalpassengers' from the assembled 'features'
# vector, using the same elasticNetParam setting as the duration model.
crowdedness_lr = LinearRegression(
    featuresCol="features",
    labelCol="totalpassengers",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=1,
)
crowdedness_lrModel = crowdedness_lr.fit(train)

In [33]:
# Show the fitted crowdedness model's coefficients, intercept and training metrics.
print_train_info(crowdedness_lrModel)

Model:
Coefficients: [0.00915223769293,0.0,-1.58388101213,-0.000145362432562,4.31352086852e-06,0.000313375439836]
Intercept: 21.1096148239
  
Model info
numIterations: 11
RMSE: 14.368144
MAE: 9.658412
r2: 0.093718


In [34]:
# Evaluate the crowdedness model on the held-out test split.
crowdedness_model_eval = crowdedness_lrModel.evaluate(test)

In [35]:
# Report RMSE, MAE and r2 of the crowdedness model on the test split.
print_eval_info(crowdedness_model_eval)

Eval info:
RMSE: 14.005498
MAE: 9.558157
r2: 0.095711


Saving model:

In [36]:
# Persist the fitted crowdedness model; overwrite() replaces a model previously
# saved at the same path.
crowdedness_lrModel.write().overwrite().save("data/models/crowdedness_lasso_model")

Loading model:

In [37]:
# Read the crowdedness model back from the path it was saved to.
crowdedness_lrModel_loaded = LinearRegressionModel.load("data/models/crowdedness_lasso_model")

Evaluating loaded model:

In [38]:
# Evaluate the reloaded crowdedness model on the same test split, so its
# metrics can be compared with those of the in-memory model.
crowdedness_model_loaded_eval = crowdedness_lrModel_loaded.evaluate(test)

print_eval_info(crowdedness_model_loaded_eval)

Eval info:
RMSE: 14.005498
MAE: 9.558157
r2: 0.095711


In [None]:
# Shut down the SparkContext created at the top of the notebook.
sc.stop()