In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

## setup
# Hand the configuration to both entry points so the application name is
# actually applied. If a SparkContext is already running, getOrCreate() returns
# it unchanged and the configuration is ignored, exactly as before.
conf = SparkConf().setAppName("final_project")
sc = SparkContext.getOrCreate(conf=conf)
ss = SparkSession.builder.config(conf=conf).getOrCreate()

def toFloatSafe(v):
    """Convert ``v`` to a float when possible, otherwise return it as a string.

    Args:
        v: Any value; typically one raw text field from a CSV line.

    Returns:
        ``float(v)`` when the conversion succeeds, else ``str(v)``.
    """
    try:
        return float(v)
    except (ValueError, TypeError):
        # ValueError: text that is not a number (e.g. "Mission").
        # TypeError: objects float() cannot accept at all (e.g. None).
        return str(v)

### 1) Load Cleaned Data

In [2]:
# load and convert data
filename = "./../data/311_Cases_small.csv"

# Each line is split on commas (naive parsing: assumes no quoted field contains
# a comma). Only the first field (closing_time) is numeric according to the
# schema used below; the remaining fields are categorical and must stay strings,
# so a numeric-looking category such as "12" is not turned into 12.0.
data_raw = (
    sc.textFile(filename)
      .map(lambda line: line.split(","))
      .map(lambda row: [toFloatSafe(row[0])] + row[1:])
)

### 2) Create Data Frame

In [4]:
# define schema
from pyspark.sql import Row
from pyspark.sql.types import *

# closing_time is the single numeric column; the other five columns are
# categorical strings. Every field is declared non-nullable.
schema = StructType(
    [StructField("closing_time", FloatType(), False)]
    + [
        StructField(column, StringType(), False)
        for column in ("neighborhood", "category", "police_district", "responsible_agency", "source")
    ]
)

# Build one Row per record from its first six fields, in schema order.
df = ss.createDataFrame(
    data_raw.map(lambda fields: Row(*[fields[i] for i in range(6)])),
    schema,
)

In [5]:
# Inspect the column names and types produced by the schema.
df.dtypes

[('closing_time', 'float'),
 ('neighborhood', 'string'),
 ('category', 'string'),
 ('police_district', 'string'),
 ('responsible_agency', 'string'),
 ('source', 'string')]

### 3) Numericalize Categorical Variables

In [6]:
# String to Numbers
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column in ``cols`` with its StringIndexer-encoded version.

    Args:
        df: Input DataFrame.
        cols: Names of the columns to index, processed in order.

    Returns:
        A DataFrame in which every listed column holds the indexed values
        under its original name. ``df`` itself is returned when ``cols`` is
        empty.
    """
    indexed = df
    for name in cols:
        tmp_name = name + "-num"
        # Fit on the current frame so columns indexed earlier are already replaced.
        model = StringIndexer(inputCol=name, outputCol=tmp_name).fit(indexed)
        # Write the indices to a temporary "-num" column, drop the original,
        # then give the indexed column the original name back.
        indexed = (
            model.transform(indexed)
                 .drop(name)
                 .withColumnRenamed(tmp_name, name)
        )
    return indexed

In [7]:
# Index every column, the closing_time label included.
# NOTE(review): closing_time is declared FloatType in the schema; indexing it
# replaces each value with a category index, so the downstream "label" is that
# index rather than the original closing time - confirm this is intended.
dfnumeric = indexStringColumns(
    df,
    ['closing_time', "neighborhood", "category", "police_district", "responsible_agency", "source"],
)

In [8]:
# One-hot Encoding

from pyspark.ml.feature import OneHotEncoder
def oneHotEncodeColumns(df, cols):
    """Replace each indexed column in ``cols`` with its one-hot encoded vector.

    Works with both flavours of ``OneHotEncoder``: on Spark 2.x it is a plain
    Transformer, while on Spark 3.x it is an Estimator that must be fitted
    before it can transform.

    Args:
        df: Input DataFrame whose listed columns hold category indices.
        cols: Names of the columns to encode, processed in order.

    Returns:
        A DataFrame in which every listed column holds the one-hot vector
        under its original name. ``df`` itself is returned when ``cols`` is
        empty.
    """
    newdf = df
    for c in cols:
        # dropLast=False keeps one vector slot per category (the default drops the last).
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c + "-onehot", dropLast=False)
        # Estimator API (has fit) -> fit to obtain the model; Transformer API -> use as is.
        encoder = onehotenc.fit(newdf) if hasattr(onehotenc, "fit") else onehotenc
        # Write the vectors to a temporary "-onehot" column, drop the original,
        # then give the encoded column the original name back.
        newdf = encoder.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-onehot", c)
    return newdf

# One-hot encode the five categorical feature columns; closing_time is left as is.
dfhot = oneHotEncodeColumns(
    dfnumeric,
    ["neighborhood", "category", "police_district", "responsible_agency", "source"],
)

In [9]:
# Preview the encoded frame: closing_time plus one sparse vector per categorical column.
dfhot.show()

+------------+----------------+---------------+---------------+------------------+-------------+
|closing_time|    neighborhood|       category|police_district|responsible_agency|       source|
+------------+----------------+---------------+---------------+------------------+-------------+
|         3.0|(117,[36],[1.0])| (42,[0],[1.0])| (10,[1],[1.0])|   (208,[0],[1.0])|(8,[0],[1.0])|
|        30.0|(117,[23],[1.0])| (42,[1],[1.0])| (10,[3],[1.0])|   (208,[0],[1.0])|(8,[0],[1.0])|
|         3.0| (117,[6],[1.0])| (42,[9],[1.0])| (10,[3],[1.0])|   (208,[2],[1.0])|(8,[0],[1.0])|
|         2.0|(117,[36],[1.0])| (42,[1],[1.0])| (10,[1],[1.0])|   (208,[6],[1.0])|(8,[0],[1.0])|
|         3.0|(117,[10],[1.0])| (42,[1],[1.0])| (10,[8],[1.0])|   (208,[0],[1.0])|(8,[0],[1.0])|
|        24.0|(117,[35],[1.0])|(42,[14],[1.0])| (10,[0],[1.0])|  (208,[52],[1.0])|(8,[0],[1.0])|
|         0.0| (117,[0],[1.0])| (42,[1],[1.0])| (10,[0],[1.0])|   (208,[0],[1.0])|(8,[3],[1.0])|
|         0.0| (117,[2],[1.0])

### 4) Create a Feature Vector

In [10]:
# Merging the data with Vector Assembler.
from pyspark.ml.feature import VectorAssembler
input_cols = ["neighborhood", "category", "police_district", "responsible_agency", "source"]

# VectorAssembler takes a list of column names (inputCols) and an output column
# name (outputCol), and transforms a DataFrame by concatenating the values of
# the input columns into one single vector stored in the output column.
va = VectorAssembler(inputCols=input_cols, outputCol="features")

# lpoints - labeled data: the assembled feature vector plus closing_time renamed to "label".
lpoints = (
    va.transform(dfhot)
      .select("features", "closing_time")
      .withColumnRenamed("closing_time", "label")
)

In [11]:
# Preview the assembled (features, label) pairs.
lpoints.show()

+--------------------+------+
|            features| label|
+--------------------+------+
|(385,[36,117,160,...|   3.0|
|(385,[23,118,162,...|  30.0|
|(385,[6,126,162,1...|   3.0|
|(385,[36,118,160,...|   2.0|
|(385,[10,118,167,...|   3.0|
|(385,[35,131,159,...|  24.0|
|(385,[0,118,159,1...|   0.0|
|(385,[2,117,159,1...|   0.0|
|(385,[12,117,161,...|   2.0|
|(385,[15,127,163,...|  11.0|
|(385,[87,129,162,...|1089.0|
|(385,[22,118,161,...|   1.0|
|(385,[22,117,161,...|   0.0|
|(385,[4,124,161,1...|   0.0|
|(385,[9,119,165,1...|   1.0|
|(385,[0,117,159,1...|  41.0|
|(385,[39,117,159,...|   0.0|
|(385,[20,118,160,...|   5.0|
|(385,[3,117,159,1...|   0.0|
|(385,[18,122,159,...|  23.0|
+--------------------+------+
only showing top 20 rows



### 5) Train Model

In [12]:
# ******************* TEMP ******************* # (currently under construction, waiting for data)
# (later, divide the dataset by time, and have two files reading in,
#  instead of a train test split)

# Train test split (80% train / 20% validation).
# A fixed seed keeps the split - and therefore the reported metric - reproducible
# across "Restart & Run All".
splits = lpoints.randomSplit([0.8, 0.2], seed=42)

# Both subsets are cached because they are reused for fitting and evaluation.
adulttrain = splits[0].cache()
adultvalid = splits[1].cache()

In [13]:
from pyspark.ml.classification import LogisticRegression

# Regularised logistic regression fitted on the training split. The column
# names are spelled out for readability; they match the estimator's defaults.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
    fitIntercept=True,
)
lrmodel = lr.fit(adulttrain)

### 6) Evaluate Model

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out validation split with the fitted model.
predictions = lrmodel.transform(adultvalid)

# F1 computed from the "label" and "prediction" columns.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1 = evaluator.evaluate(predictions)
print("F1 = {:.4f}".format(f1))  # a previous run printed: F1 = 0.3843

F1 = 0.3843
