# Schemas

## Download and install Spark

In [0]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [0]:
# Optional (disabled): copy the dataset and the unpacked Spark distribution
# into Google Drive, then list the copied Spark directory.
#!cp reported-crimes.csv gdrive/My\ Drive/Training/
#!cp -R spark-2.3.1-bin-hadoop2.7/ gdrive/My\ Drive/Training/
#!ls gdrive/My\ Drive/Training/spark-2.3.1-bin-hadoop2.7
# List the contents of the current working directory.
!ls

In [0]:
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz
!tar xf spark-2.3.1-bin-hadoop2.7.tgz
!pip install -q findspark

## Setup environment

In [0]:
import os
import findspark

# Tell Spark where Java and the unpacked Spark distribution live.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.3.1-bin-hadoop2.7",
})

# Must run before any pyspark import: pyspark is not pip-installed here, so
# findspark is relied on to expose the distribution under SPARK_HOME.
findspark.init()

import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse an existing context/session if one is already running in this kernel.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

# Last expression of the cell: display the session summary.
spark

## Downloading and preprocessing Chicago's Reported Crime Data

In [0]:
!wget https://data.cityofchicago.org/api/views/ijzp-q8t2/rows.csv?accessType=DOWNLOAD
#!ls -l

In [0]:
!mv rows.csv\?accessType\=DOWNLOAD reported-crimes.csv
#!ls -l

In [0]:
from pyspark.sql.functions import col, lit, to_timestamp

# Read the CSV using its first row as column names, convert the textual
# `Date` column to a timestamp with the 'MM/dd/yyyy hh:mm:ss a' pattern,
# and keep only rows whose Date is <= the '2018-11-11' literal.
rc = (
    spark.read.csv('reported-crimes.csv', header=True)
    .withColumn('Date', to_timestamp(col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .filter(col('Date') <= lit('2018-11-11'))
)

# Preview the first 5 rows.
rc.show(5)

## Schemas

In [0]:
# Print the column names and data types of `rc`.
rc.printSchema()

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType, BooleanType

In [0]:
# List the column names of `rc`.
rc.columns

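Writing out a `StructField` by hand for every column, each with its type and `nullable` set to `True` (as in the abandoned attempt below), is tedious and error-prone. Instead, we list the (column name, type) pairs in a plain Python list and build the schema from that list.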

In [0]:
#StructType([
#            StructField('ID', StringType, true),
#            StructField('Case Number', StringType, true),
#            StructField('Date', TimestampType, true),
#            StructField('Block',
#            StructField('IUCR',
#            StructField('Primary Type',
#            StructField('Description',
#            StructField('Location Description',
#            StructField('Arrest',
#            StructField('Domestic',
#            StructField('Beat',
#            StructField('District',
#            StructField('Ward',
#            StructField('Community Area',
#            StructField('FBI Code',
#            StructField('X Coordinate',
#            StructField('Y Coordinate',
#           StructField('Year',
#            StructField('Updated On',
#            StructField('Latitude',
#            StructField('Longitude',
#            StructField('Location'
#            )
#])

In [0]:
# (column name, Spark type instance) pairs, in the order used to build the
# schema. Each type class is instantiated once per column below.
labels = [
    (column, dtype())
    for column, dtype in (
        ('ID', StringType),
        ('Case Number', StringType),
        ('Date', TimestampType),
        ('Block', StringType),
        ('IUCR', StringType),
        ('Primary Type', StringType),
        ('Description', StringType),
        ('Location Description', StringType),
        ('Arrest', BooleanType),
        ('Domestic', BooleanType),
        ('Beat', StringType),
        ('District', StringType),
        ('Ward', StringType),
        ('Community Area', StringType),
        ('FBI Code', StringType),
        ('X Coordinate', StringType),
        ('Y Coordinate', StringType),
        ('Year', IntegerType),
        ('Updated On', StringType),
        ('Latitude', DoubleType),
        ('Longitude', DoubleType),
        ('Location', StringType),
    )
]

In [0]:
# Turn every (name, type) pair into a nullable StructField and wrap the
# resulting fields in a StructType.
fields = [
    StructField(name, dtype, nullable=True)
    for name, dtype in labels
]
schema = StructType(fields)
schema

In [0]:
# Re-read the CSV with the hand-built schema instead of the default string columns.
# This replaces the earlier `rc` (the one with the parsed and filtered `Date`).
# NOTE(review): unlike the first read, this call passes neither `header=True`
# nor a timestamp format, so the header line is presumably read as a data row
# and the 'MM/dd/yyyy hh:mm:ss a' date strings may not parse as TimestampType.
# This looks intentional, to demonstrate a schema/data mismatch; confirm
# before reusing this `rc` for analysis.
rc = spark.read.csv('reported-crimes.csv', schema=schema)

In [0]:
# Print the schema of `rc` as re-read with the explicit schema.
rc.printSchema()

The schema above may not fit the data: wherever a declared type does not match the actual values in the file, Spark cannot read those values as intended. Hence, it is good to use **Spark's `inferSchema` option** to avoid such issues.

---



In [0]:
# Display the first 5 rows of `rc` to see how the data reads under the explicit schema.
rc.show(5)