In [None]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

## setup
# Hand the configuration to getOrCreate() so the application name is applied
# when this cell is the one that creates the context. If a context already
# exists in the kernel it is returned as-is.
conf = SparkConf().setAppName("final_project")
sc = SparkContext.getOrCreate(conf)
ss = SparkSession.builder.getOrCreate()

def toIntegerSafe(v):
    """Convert `v` to an int when possible, otherwise return it as a string.

    Args:
        v: Raw field value, e.g. one comma-separated token from a text line.

    Returns:
        int(v) if the conversion succeeds, else str(v).
    """
    try:
        return int(v)
    except (ValueError, TypeError):
        # ValueError: text that is not an integer literal (e.g. "abc", "3.5").
        # TypeError: objects int() cannot accept at all (e.g. None).
        return str(v)

### 1) Load Cleaned Data

In [None]:
# Read the file as plain text, split every line on commas, then convert each
# field that parses as an integer; all other fields are kept as strings.
filename = "./../data/311_Cases_small.csv"

data_raw = (
    sc.textFile(filename)
      .map(lambda line: line.split(","))
      .map(lambda fields: [toIntegerSafe(field) for field in fields])
)

In [None]:
# Preview a handful of parsed rows; collect() would pull the entire RDD
# onto the driver and dump it into the cell output.
data_raw.take(5)

### 2) Create Data Frame

In [None]:
# define schema
from pyspark.sql import Row
from pyspark.sql.types import *

# One non-nullable integer column followed by four non-nullable string columns,
# in the same order as the fields of each parsed row.
string_columns = ["neighborhood", "category", "police_district", "responsible_agency"]
schema = StructType(
    [StructField("closing_time", IntegerType(), False)]
    + [StructField(column, StringType(), False) for column in string_columns]
)

# Keep only the first five fields of every parsed row, positionally matched to the schema.
df = ss.createDataFrame(
    data_raw.map(lambda fields: Row(fields[0], fields[1], fields[2], fields[3], fields[4])),
    schema,
)

In [None]:
# Print the first 20 rows, truncating long cell values (the show() defaults).
df.show(n=20, truncate=True)

### 3) Numericalize Categorical Variables

In [21]:
from pyspark.ml.feature import StringIndexer

def indexStringColumns(df, cols):
    """Replace each column in `cols` with its StringIndexer-encoded version.

    For every column name, a StringIndexer is fitted on the current DataFrame,
    its output is written to a temporary "<name>-num" column, the original
    column is dropped, and the temporary column is renamed back to `<name>`.

    Args:
        df: Input DataFrame; it is not modified.
        cols: Iterable of column names to encode.

    Returns:
        A new DataFrame in which every listed column holds the indexer output
        under its original name.
    """
    indexed = df
    for name in cols:
        encoded_name = name + "-num"
        # Fit on the running result so earlier replacements are already applied.
        model = StringIndexer(inputCol=name, outputCol=encoded_name).fit(indexed)
        indexed = (
            model.transform(indexed)
                 .drop(name)
                 .withColumnRenamed(encoded_name, name)
        )
    return indexed

In [26]:
# Encode the four string columns; closing_time is left untouched.
dfnumeric = indexStringColumns(
    df,
    ["neighborhood", "category", "police_district", "responsible_agency"],
)

In [27]:
# Inspect the encoded frame: first 20 rows, long values truncated (the show() defaults).
dfnumeric.show(n=20, truncate=True)

+------------+------------+--------+---------------+------------------+
|closing_time|neighborhood|category|police_district|responsible_agency|
+------------+------------+--------+---------------+------------------+
|           0|        33.0|     2.0|            3.0|              14.0|
|          23|         1.0|     0.0|            1.0|               0.0|
|           0|         1.0|     4.0|            1.0|               0.0|
|           0|         1.0|     1.0|            1.0|               0.0|
|           1|        14.0|     1.0|            8.0|               0.0|
|           0|         0.0|     0.0|            0.0|               1.0|
|           1|         5.0|    13.0|            9.0|               2.0|
|           0|        23.0|     0.0|            6.0|               1.0|
|           0|        60.0|     0.0|            7.0|               1.0|
|           0|        46.0|     0.0|            3.0|               0.0|
|           0|        43.0|     0.0|            0.0|            