In [1]:
#INPUT: train.csv
#Output: 
"""

1. Clean up wrong X and Y values (very few of them)

2. Visualize data.

3. Parse input to get features (e.g. date, time, year, month, etc.).

4. Select and generate features.

5. Remove outliers.

6. Do PCA.

Output: train dataframe with features and labels column

        test dataframe with features and labels column

        visuals to provide insights on data that help select, and tune the models.       

 a toolbox list to choose from:

         Typical graphical techniques used in EDA are


Box plot

Histogram

Multi-vari chart

Run chart

Pareto chart

Scatter plot

Stem-and-leaf plot

Parallel coordinates

Odds ratio

Targeted projection pursuit

Glyph-based visualization methods such as PhenoPlot[8] and Chernoff faces

Projection methods such as grand tour, guided tour and manual tour

Interactive versions of these plots

        Dimensionality reduction:

Multidimensional scaling

Principal component analysis (PCA)

Multilinear PCA

Nonlinear dimensionality reduction (NLDR)

        Typical quantitative techniques are:
Median polish

Trimean

Ordination

History

        
"""

In [2]:
# Read the training CSV into a Spark DataFrame, taking column names from the
# header row and letting Spark infer each column's type.
filename = "/FileStore/tables/train.csv"
data = (
  spark.read
  .option("header", True)
  .option("inferSchema", True)
  .csv(filename)
)

# Quick size check: number of rows, then number of columns.
print(data.count())
print(len(data.columns))

# Show the inferred schema so the column types can be verified by eye.
data.printSchema()

In [3]:
"""
Dates - timestamp of the crime incident
Category - category of the crime incident (only in train.csv). This is the target variable you are going to predict.
Descript - detailed description of the crime incident (only in train.csv)
DayOfWeek - the day of the week
PdDistrict - name of the Police Department District
Resolution - how the crime incident was resolved (only in train.csv)
Address - the approximate street address of the crime incident 
X - Longitude
Y - Latitude
"""

In [4]:
#Parsing the time column to generate features, year, month, day, hour, season
from pyspark.sql.functions import col, hour, minute, second, year, month, dayofmonth, date_format

def season(month):
  """Return the season name for a calendar month number.

  Months 12, 1, 2 map to 'winter'; 3-5 to 'spring'; 6-8 to 'summer';
  9-11 to 'autumn'. Any other value (including None) yields "NA".
  """
  months_by_season = (
    ('winter', (12, 1, 2)),
    ('spring', (3, 4, 5)),
    ('summer', (6, 7, 8)),
    ('autumn', (9, 10, 11)),
  )
  # Flatten the grouping into a month -> season lookup table.
  lookup = {
    month_number: season_name
    for season_name, month_numbers in months_by_season
    for month_number in month_numbers
  }
  return lookup.get(month, "NA")

# `udf` is imported explicitly: the cell previously used it without any import
# in this notebook, so it only ran where the environment pre-defines the name.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# Spark UDF wrapping the pure-Python season() lookup; returns a string column.
season_udf_string = udf(season, StringType())

# Expand the "Dates" timestamp into separate calendar features, then drop the
# raw timestamp. "season" is derived from the freshly created "month" column,
# so it must come after it in the chain.
# NOTE(review): the column notes list an existing DayOfWeek column; "weekday"
# may simply duplicate it -- confirm before keeping both as features.
data = (
  data
  .withColumn("hour", hour(col("Dates")))
  .withColumn("minute", minute(col("Dates")))
  .withColumn("dayOfMonth", dayofmonth(col("Dates")))
  .withColumn("year", year(col("Dates")))
  .withColumn("month", month(col("Dates")))
  .withColumn("weekday", date_format(col("Dates"), "EEEE"))
  .withColumn("season", season_udf_string(col("month")))
  .drop(col("Dates"))
)

data.printSchema()

In [5]:
#generate more features here... get either numerical or string valued features. 

In [7]:
# 1. Data wrangling: audit the dataset before cleaning it.
# Step 1 - split the columns by Spark dtype into categorical (string) and
# numerical (int / double) groups and report how many of each there are.
cat_cols = [name for name, dtype in data.dtypes if dtype.startswith('string')]
print('{}  categorical features'.format(len(cat_cols)))

num_var = [name for name, dtype in data.dtypes if dtype in ('int', 'double')]
print('{}  numerical features'.format(len(num_var)))

In [8]:
# Count the NULL entries in every column and print them as a one-row table.
from pyspark.sql.functions import isnan, when, count, col

data.select(*[
  count(when(col(name).isNull(), name)).alias(name)
  for name in data.columns
]).show()
# Conclusion: the table above suggests the data is clean, with no missing values.
# Only SQL NULLs are counted here; isnan is imported but not applied, so NaN
# values in numeric columns would not show up in these counts.

In [9]:
#Visualizations:

import matplotlib.pyplot as plt
import pandas as pd
# Collect the Spark DataFrame to pandas on the driver so it can be plotted.
data_pd = data.toPandas()

# Scatter the X column against the Y column to eyeball the spatial
# distribution and spot obviously wrong points.
plt.clf()
data_pd.plot.scatter(x="X", y="Y")
display(plt.show())

In [10]:
#we can see an outlier in the dataset, removing it.

In [11]:
# Keep only rows whose X value is below -122; this discards the outlying
# point(s) seen in the previous scatter plot.
data = data.filter(data["X"] < -122)

# Re-plot the filtered coordinates to confirm the outlier is gone.
data_pd = data.toPandas()
plt.clf()
data_pd.plot.scatter(x="X", y="Y")
display(plt.show())

In [12]:
#More preprocessing to the features:

#1. Standardize every numerical feature to mean 0 and stddev 1.
from pyspark.sql.functions import mean, stddev

# Gather every mean and standard deviation in ONE aggregation (a single Spark
# job) rather than launching two separate jobs per column.
stat_exprs = (
  [mean(c).alias("mean_" + c) for c in num_var] +
  [stddev(c).alias("stddev_" + c) for c in num_var]
)
# agg() rejects an empty expression list, so skip it when there is nothing to scale.
stats_row = data.agg(*stat_exprs).first() if num_var else None

# data_stats keeps its original shape: {column name: [mean, stddev]}.
data_stats = {c: [stats_row["mean_" + c], stats_row["stddev_" + c]] for c in num_var}

for c in num_var:
  col_mean, col_stddev = data_stats[c]
  # Dividing by a zero stddev would null out (or, under ANSI mode, fail on) the
  # whole column; a constant or undefined-stddev column is centred but not scaled.
  divisor = col_stddev if col_stddev else 1.0
  data = data.withColumn(c, (data[c] - col_mean) / divisor)

  

In [13]:
# Choose the string columns that will be indexed and one-hot encoded as features.
#
# Excluded on purpose:
#   - Category: the label/target itself.
#   - Descript, Resolution: the column notes state these exist only in
#     train.csv. They describe the incident outcome, so using them leaks the
#     target, and a pipeline fitted on them cannot transform the test set.
# NOTE(review): Address is a street address and is presumably high-cardinality;
# one-hot encoding it may produce a very wide feature vector -- confirm it is wanted.
non_feature_cols = {'Category', 'Descript', 'Resolution'}
encoding_var = [
  name for name, dtype in data.dtypes
  if dtype == 'string' and name not in non_feature_cols
]

print(encoding_var)

In [14]:
# Build one StringIndexer per categorical feature column. Each indexer writes
# its result to an 'IDX_<column>' column; handleInvalid='keep' gives values
# unseen at fit time their own extra index instead of raising an error.
from pyspark.ml.feature import StringIndexer

string_indexes = [
  StringIndexer(
    inputCol=column_name,
    outputCol='IDX_{}'.format(column_name),
    handleInvalid='keep',
  )
  for column_name in encoding_var
]
string_indexes

In [15]:
#ONE Hot Encoding 
from pyspark.ml.feature import OneHotEncoderEstimator
onehot_indexes = [OneHotEncoderEstimator(inputCols = ['IDX_' + c], outputCols = ['OHE_' + c]) for c in encoding_var]
onehot_indexes

In [16]:
# Index the 'Category' target column into a numeric 'label' column for the classifier.
label_indexes = StringIndexer(
  inputCol='Category',
  outputCol='label',
  handleInvalid='keep',
)

In [17]:
from pyspark.ml.feature import VectorAssembler

# Combine the numerical columns and the one-hot encoded categorical columns
# into a single "features" vector column.
assembler = VectorAssembler(
  inputCols=num_var + ['OHE_{}'.format(name) for name in encoding_var],
  outputCol="features",
)

In [18]:
from pyspark.ml.classification import RandomForestClassifier

# Random forest classifier reading the assembled "features" vector and the
# indexed "label" column; the seed is fixed so runs are repeatable.
rf = RandomForestClassifier(
  featuresCol="features",
  labelCol="label",
  numTrees=10,
  subsamplingRate=0.7,
  cacheNodeIds=True,
  seed=8464,
)

In [19]:
from pyspark.ml import Pipeline

# Chain every stage in execution order: index the categorical columns,
# one-hot encode them, assemble the feature vector, index the label, and
# finally train the random forest.
pipeline = Pipeline(
  stages=(
    string_indexes
    + onehot_indexes
    + [assembler, label_indexes, rf]
  )
)

In [20]:
# Fit every pipeline stage (indexers, encoders, assembler, classifier) on `data`.
# NOTE(review): the model is fitted on the whole `data` frame; no train/test
# split is visible in this notebook -- confirm whether one is intended.

pipelineModel = pipeline.fit(data)

In [21]:
# Run the fitted pipeline over `data`, keep only the assembled feature vector
# and the indexed label, and preview the first rows.
new_df = pipelineModel.transform(data)
vhouse_df = new_df.select('features', 'label')
vhouse_df.show()