In [43]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

## setup
conf = SparkConf().setAppName("final_project")
sc = SparkContext.getOrCreate()
ss = SparkSession.builder.getOrCreate()

def toIntegerSafe(v):
    try:
        return float(v)
    except ValueError:
        return str(v) #if it is not a float type return as a string.

### 1) Load Cleaned Data

In [44]:
# load and convert data
filename = "./../data/311_Cases_small.csv"

data_raw = sc.textFile(filename)\
             .map(lambda x: x.split(","))

data_raw = data_raw.map(lambda row:  [toIntegerSafe(x) for x in row])

In [45]:
data_raw.collect()

[[0.0,
  'Cathedral Hill',
  'Encampments',
  'NORTHERN',
  'Duplicate Case Hold Queue'],
 [23.0,
  'South of Market',
  'Street and Sidewalk Cleaning',
  'SOUTHERN',
  'DPW Ops Queue'],
 [0.0, 'South of Market', 'Homeless Concerns', 'SOUTHERN', 'DPW Ops Queue'],
 [0.0, 'South of Market', 'Graffiti', 'SOUTHERN', 'DPW Ops Queue'],
 [1.0, 'Outer Richmond', 'Graffiti', 'RICHMOND', 'DPW Ops Queue'],
 [0.0,
  'Mission',
  'Street and Sidewalk Cleaning',
  'MISSION',
  'Recology_Abandoned'],
 [1.0, 'Civic Center', 'Damaged Property', 'TENDERLOIN', 'DPW BSM Queue'],
 [0.0,
  'Western Addition',
  'Street and Sidewalk Cleaning',
  'PARK',
  'Recology_Abandoned'],
 [0.0,
  'Oceanview',
  'Street and Sidewalk Cleaning',
  'TARAVAL',
  'Recology_Abandoned'],
 [0.0,
  'Lower Pacific Heights',
  'Street and Sidewalk Cleaning',
  'NORTHERN',
  'DPW Ops Queue'],
 [0.0,
  'Eureka Valley',
  'Street and Sidewalk Cleaning',
  'MISSION',
  'Recology_Abandoned'],
 [6.0, 'Oceanview', 'Graffiti', 'TARAVAL',

### 2) Create Data Frame

In [51]:
# define schema
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import Row

schema = StructType([
    StructField("closing_time", FloatType(),False),
     StructField("neighborhood", StringType(),False),
     StructField("category", StringType(),False),
     StructField("police_district", StringType(),False),
    StructField("responsible_agency", StringType(), False)
])

df = ss.createDataFrame(data_raw.map(lambda x : Row(x[0],x[1],x[2],x[3],x[4])), schema)

In [49]:
#df.show()
df.dtypes

[('closing_time', 'float'),
 ('neighborhood', 'string'),
 ('category', 'string'),
 ('police_district', 'string'),
 ('responsible_agency', 'string')]

### 3) Numericalize Categorical Variables

In [52]:
# String to Numbers
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    #variable newdf will be updated several times
    newdf = df
    
    for c in cols:
        #For each given colum, fits StringIndexerModel.
        si = StringIndexer(inputCol=c, outputCol=c+"-num")
        sm = si.fit(newdf)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-num" 
        #and then drops the original columns.
        #and drop the "-num" suffix. 
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-num", c)
    return newdf

In [53]:
dfnumeric = indexStringColumns(df, ['closing_time', "neighborhood", "category", "police_district", "responsible_agency"]) 

In [54]:
dfnumeric.show() # very nice (in borat voice)

+------------+------------+--------+---------------+------------------+
|closing_time|neighborhood|category|police_district|responsible_agency|
+------------+------------+--------+---------------+------------------+
|         0.0|        33.0|     2.0|            3.0|              14.0|
|        30.0|         1.0|     0.0|            1.0|               0.0|
|         0.0|         1.0|     4.0|            1.0|               0.0|
|         0.0|         1.0|     1.0|            1.0|               0.0|
|         1.0|        16.0|     1.0|            8.0|               0.0|
|         0.0|         0.0|     0.0|            0.0|               1.0|
|         1.0|         5.0|    13.0|            9.0|               2.0|
|         0.0|        22.0|     0.0|            6.0|               1.0|
|         0.0|        57.0|     0.0|            7.0|               1.0|
|         0.0|        48.0|     0.0|            3.0|               0.0|
|         0.0|        42.0|     0.0|            0.0|            

In [55]:
# One-hot Encoding

from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        #For each given colum, create OneHotEncoder. 
        #dropLast : Whether to drop the last category in the encoded vector (default: true)
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        #Creates a DataFame by putting the transformed values in the new colum with suffix "-onehot" 
        #and then drops the original columns.
        #and drop the "-onehot" suffix. 
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf

dfhot = oneHotEncodeColumns(dfnumeric, ["neighborhood", "category", "police_district", "responsible_agency"]) 

In [56]:
dfhot.show()

+------------+----------------+---------------+---------------+------------------+
|closing_time|    neighborhood|       category|police_district|responsible_agency|
+------------+----------------+---------------+---------------+------------------+
|         0.0|(117,[33],[1.0])| (34,[2],[1.0])| (10,[3],[1.0])|  (154,[14],[1.0])|
|        30.0| (117,[1],[1.0])| (34,[0],[1.0])| (10,[1],[1.0])|   (154,[0],[1.0])|
|         0.0| (117,[1],[1.0])| (34,[4],[1.0])| (10,[1],[1.0])|   (154,[0],[1.0])|
|         0.0| (117,[1],[1.0])| (34,[1],[1.0])| (10,[1],[1.0])|   (154,[0],[1.0])|
|         1.0|(117,[16],[1.0])| (34,[1],[1.0])| (10,[8],[1.0])|   (154,[0],[1.0])|
|         0.0| (117,[0],[1.0])| (34,[0],[1.0])| (10,[0],[1.0])|   (154,[1],[1.0])|
|         1.0| (117,[5],[1.0])|(34,[13],[1.0])| (10,[9],[1.0])|   (154,[2],[1.0])|
|         0.0|(117,[22],[1.0])| (34,[0],[1.0])| (10,[6],[1.0])|   (154,[1],[1.0])|
|         0.0|(117,[57],[1.0])| (34,[0],[1.0])| (10,[7],[1.0])|   (154,[1],[1.0])|
|   

### 4) Create a Feature Vector

In [57]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
input_cols= ["neighborhood", "category", "police_district", "responsible_agency"]

#VectorAssembler takes a number of collumn names(inputCols) and output column name (outputCol)
#and transforms a DataFrame to assemble the values in inputCols into one single vector with outputCol.
va = VectorAssembler(outputCol="features", inputCols=input_cols)
#lpoints - labeled data.
lpoints = va.transform(dfhot).select("features", "closing_time").withColumnRenamed("closing_time", "label")

In [58]:
lpoints.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(315,[33,119,154,...|  0.0|
|(315,[1,117,152,1...| 30.0|
|(315,[1,121,152,1...|  0.0|
|(315,[1,118,152,1...|  0.0|
|(315,[16,118,159,...|  1.0|
|(315,[0,117,151,1...|  0.0|
|(315,[5,130,160,1...|  1.0|
|(315,[22,117,157,...|  0.0|
|(315,[57,117,158,...|  0.0|
|(315,[48,117,154,...|  0.0|
|(315,[42,117,151,...|  0.0|
|(315,[57,118,158,...|  6.0|
|(315,[0,118,151,1...|  3.0|
|(315,[1,118,152,1...|  0.0|
|(315,[1,117,152,1...|  0.0|
|(315,[10,117,153,...|  0.0|
|(315,[1,118,160,1...|  6.0|
|(315,[20,117,154,...|  0.0|
|(315,[13,120,155,...|  1.0|
|(315,[14,117,153,...|  0.0|
+--------------------+-----+
only showing top 20 rows



### 5) Train Model

In [59]:
# ******************* TEMP ******************* # 
# (later, divide the dataset by time, and have two files reading in,
#  instead of a train test split)

# Train test split

splits = lpoints.randomSplit([0.8, 0.2])

adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

In [61]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(regParam=0.01, maxIter=10, fitIntercept=True)
lrmodel = lr.fit(adulttrain)

### 6) Evaluate Model

In [63]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

predictions = lrmodel.transform(adultvalid)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")
f1 = evaluator.evaluate(predictions)
print("F1 = {:.4f}".format(f1))

F1 = 0.3866
