In [3]:
###############To start working with Spark DataFrames, you first have to create a SparkSession object from your SparkContext.


#######You can think of the SparkContext as your connection to the cluster and the SparkSession as your interface with that connection.


import findspark
findspark.init()
from pyspark import SparkContext

sc = SparkContext.getOrCreate()
sc


In [4]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate() 
spark

### SCHEMAS

In [5]:
from pyspark.sql.functions import to_timestamp,col,lit
rc = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a')).filter(col('Date') <= lit('2018-11-11'))

####################      only filtering till 2018       #######################

print(rc.show(5))

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

In [5]:
rc.show(5,truncate= True)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

### SCHEMAS

Spark Schemas
A primary function of data cleaning is to verify all data is in the expected format. Spark provides a built-in ability to validate datasets with schemas. You may have used schemas before with databases or XML; Spark is similar. A schema defines and validates the number and types of columns for a given DataFrame. A schema can contain many different types of fields - integers, floats, dates, strings, and even arrays or mapping structures. A defined schema allows Spark to filter out data that doesn't conform during read, ensuring expected correctness.

In [6]:
rc.printSchema()    ##############3   shows the  coluymn and types ...ie current schema

root
 |-- ID: string (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: string (nullable = true)
 |-- Domestic: string (nullable = true)
 |-- Beat: string (nullable = true)
 |-- District: string (nullable = true)
 |-- Ward: string (nullable = true)
 |-- Community Area: string (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- X Coordinate: string (nullable = true)
 |-- Y Coordinate: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Location: string (nullable = true)



In [7]:
rc.dtypes   ##############3   shows the  coluymn and types

[('ID', 'string'),
 ('Case Number', 'string'),
 ('Date', 'timestamp'),
 ('Block', 'string'),
 ('IUCR', 'string'),
 ('Primary Type', 'string'),
 ('Description', 'string'),
 ('Location Description', 'string'),
 ('Arrest', 'string'),
 ('Domestic', 'string'),
 ('Beat', 'string'),
 ('District', 'string'),
 ('Ward', 'string'),
 ('Community Area', 'string'),
 ('FBI Code', 'string'),
 ('X Coordinate', 'string'),
 ('Y Coordinate', 'string'),
 ('Year', 'string'),
 ('Updated On', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string'),
 ('Location', 'string')]

In [None]:
#################


############# EXCIPLTY DEFINING SCHEMA IS RECOMMENDED

#########  import different types from pyspark.sql.types

##########   from pyspark.sql.types import *  is good 

In [8]:
from pyspark.sql.types import StructType,StructField,IntegerType,StringType,TimestampType,BooleanType,DoubleType

In [9]:
rc.columns

['ID',
 'Case Number',
 'Date',
 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']

In [None]:
# Define a new schema using the StructType method
# Define a StructField for each field,, HERE FASLE MEANS CANT HAVE NULL VALUES
people_schema = StructType([
  
  StructField('ID', StringType(), False),
  StructField('Case Number', StringTypeType(), False),
  StructField('Date', TimestampType(), False)
  StructField( 'Block',
 'IUCR',
 'Primary Type',
 'Description',
 'Location Description',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'Community Area',
 'FBI Code',
 'X Coordinate',
 'Y Coordinate',
 'Year',
 'Updated On',
 'Latitude',
 'Longitude',
 'Location']
              
##########3 this is pretty tiring ... so we sue tuples

In [24]:
###############################  THIS IS BETTER  #################################33333
labels=[
 ('ID',StringType()),
 ('Case Number',StringType()),
 ('Date',StringType()),
 ('Block',StringType()),
 ('IUCR',StringType()),
 ('Primary Type',StringType()),
 ('Description',StringType()),
 ('Location Description',StringType()),
 ('Arrest',StringType()),
 ('Domestic',BooleanType()),
 ('Beat',StringType()),
 ('District',StringType()),
 ('Ward',StringType()),
 ('Community Area',StringType()),
 ('FBI Code',StringType()),
 ('X Coordinate',StringType()),
 ('Y Coordinate',StringType()),
 ('Year',IntegerType()),
 ('Updated On',StringType()),
 ('Latitude',DoubleType()),
 ('Longitude',DoubleType()),
 ('Location',StringType())]

In [25]:
myschema=StructType([StructField (x[0],x[1],True) for x in labels])
myschema

StructType(List(StructField(ID,StringType,true),StructField(Case Number,StringType,true),StructField(Date,StringType,true),StructField(Block,StringType,true),StructField(IUCR,StringType,true),StructField(Primary Type,StringType,true),StructField(Description,StringType,true),StructField(Location Description,StringType,true),StructField(Arrest,StringType,true),StructField(Domestic,BooleanType,true),StructField(Beat,StringType,true),StructField(District,StringType,true),StructField(Ward,StringType,true),StructField(Community Area,StringType,true),StructField(FBI Code,StringType,true),StructField(X Coordinate,StringType,true),StructField(Y Coordinate,StringType,true),StructField(Year,IntegerType,true),StructField(Updated On,StringType,true),StructField(Latitude,DoubleType,true),StructField(Longitude,DoubleType,true),StructField(Location,StringType,true)))

In [23]:
%%html
<style>
div.output_area pre {
    white-space: pre;
}
</style>



In [26]:
from pyspark.sql.functions import to_timestamp,col,lit
rc = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True,schema=myschema)
####################     some values are null as they could not  be convereted into required form    #######################

rc.show(n=5)

+--------+-----------+--------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|                Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+--------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|09/05/2015 01:30:...|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|   

In [35]:
rc.dtypes   ##############3   shows the  coluymn and types

[('ID', 'string'),
 ('Case Number', 'string'),
 ('Date', 'string'),
 ('Block', 'string'),
 ('IUCR', 'string'),
 ('Primary Type', 'string'),
 ('Description', 'string'),
 ('Location Description', 'string'),
 ('Arrest', 'string'),
 ('Domestic', 'boolean'),
 ('Beat', 'string'),
 ('District', 'string'),
 ('Ward', 'string'),
 ('Community Area', 'string'),
 ('FBI Code', 'string'),
 ('X Coordinate', 'string'),
 ('Y Coordinate', 'string'),
 ('Year', 'int'),
 ('Updated On', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double'),
 ('Location', 'string')]

In [None]:
#### changing datatypes.............DATE HERE IS OF THE TYPE STRING

In [41]:
rc=rc.withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))


In [99]:
########  also with select

rc=rc.select('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a'))


In [42]:
rc=rc.withColumn("Ward",col("Ward").cast("int"))

In [43]:
rc.show(n=5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

In [44]:
rc.dtypes

[('ID', 'string'),
 ('Case Number', 'string'),
 ('Date', 'timestamp'),
 ('Block', 'string'),
 ('IUCR', 'string'),
 ('Primary Type', 'string'),
 ('Description', 'string'),
 ('Location Description', 'string'),
 ('Arrest', 'string'),
 ('Domestic', 'boolean'),
 ('Beat', 'string'),
 ('District', 'string'),
 ('Ward', 'int'),
 ('Community Area', 'string'),
 ('FBI Code', 'string'),
 ('X Coordinate', 'string'),
 ('Y Coordinate', 'string'),
 ('Year', 'int'),
 ('Updated On', 'string'),
 ('Latitude', 'double'),
 ('Longitude', 'double'),
 ('Location', 'string')]

## COLUMNS
### selecting columns

In [45]:
rc.select("ID").show()   ###### selecting single columns

+--------+
|      ID|
+--------+
|10224738|
|10224739|
|11646166|
|10224740|
|10224741|
|10224742|
|10224743|
|10224744|
|10224745|
|11645836|
|10224746|
|10224749|
|10224750|
|10224751|
|10224752|
|10224753|
|10224754|
|10224756|
|10224757|
|10224758|
+--------+
only showing top 20 rows



In [46]:
rc.select(col("Ward")).show()   ###### selecting single columns

+----+
|Ward|
+----+
|  12|
|  29|
|   8|
|  35|
|  28|
|  21|
|  32|
|  25|
|  27|
|  15|
|  13|
|  45|
|  34|
|   4|
|   1|
|  21|
|  28|
|  10|
|  21|
|  38|
+----+
only showing top 20 rows



In [47]:
rc.select("ID","Block","Description").show()  #####33 multiple coloumn

+--------+--------------------+--------------------+
|      ID|               Block|         Description|
+--------+--------------------+--------------------+
|10224738|     043XX S WOOD ST|DOMESTIC BATTERY ...|
|10224739| 008XX N CENTRAL AVE|      POCKET-PICKING|
|11646166|082XX S INGLESIDE...|           OVER $500|
|10224740|   035XX W BARRY AVE|POSS: HEROIN(BRN/...|
|10224741| 0000X N LARAMIE AVE|              SIMPLE|
|10224742| 082XX S LOOMIS BLVD|      FORCIBLE ENTRY|
|10224743|021XX W CHURCHILL ST|      UNLAWFUL ENTRY|
|10224744|   025XX W CERMAK RD|        RETAIL THEFT|
|10224745|031XX W WASHINGTO...|STRONGARM - NO WE...|
|11645836| 055XX S ROCKWELL ST|FINANCIAL IDENTIT...|
|10224746|  071XX S PULASKI RD|      $500 AND UNDER|
|10224749|052XX N MILWAUKEE...|              SIMPLE|
|10224750|    0000X W 103RD ST|    TELEPHONE THREAT|
|10224751|     013XX E 47TH ST|DOMESTIC BATTERY ...|
|10224752| 020XX W SCHILLER ST|           OVER $500|
|10224753|  080XX S JUSTINE ST|AGGRAVATED DOME

#####  ADDING NEW COLUMN TO DATAFRAME or  modifyng it



In [48]:
##############   create new create column ,changing datatype, creating categories AND FILTERING IT ---WITH WHEN...................

from pyspark.sql.functions import when
df = spark.createDataFrame([["amit", 30], ["rohit", 45], ["sameer", 50]], ["name", "age"])
df = df.withColumn("profile", when(df.age >= 40, "Senior").otherwise("Executive"))
df.show()

+------+---+---------+
|  name|age|  profile|
+------+---+---------+
|  amit| 30|Executive|
| rohit| 45|   Senior|
|sameer| 50|   Senior|
+------+---+---------+



In [76]:


spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James",34),("Ann",34),
    ("Michael",33),("Scott",53),
    ("Robert",37),("Chad",27)
  ]

columns = ["firstname","age",]       #############  can also give headers like this
df = spark.createDataFrame(data = simpleData, schema = columns)
df.show()


+---------+---+
|firstname|age|
+---------+---+
|    James| 34|
|      Ann| 34|
|  Michael| 33|
|    Scott| 53|
|   Robert| 37|
|     Chad| 27|
+---------+---+



In [None]:
########  ADDING NEW COLUMN TO DATAFRAME

In [50]:
chumma=rc.withColumn("double _year", 2*rc["Year"])

In [51]:
chumma.show(5)       ############ if we print rc, we wont see double_year, its temporary... but with this its nice

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|double _year|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|          

In [52]:
###** Add a column with name One, with entries all 1s **
from pyspark.sql.functions import lit


rc.withColumn("One", lit(1)).show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+---+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|One|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+---+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|

In [53]:
###** Add a column with name random, with entries all being random **

from pyspark.sql.functions import rand

rc.withColumn("random", rand()).show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+-------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|             random|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+-------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC B

### Renaming column

In [54]:
rc.withColumnRenamed("Year","YEAR").show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|YEAR|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

In [55]:
rc.show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

###  Remove the column  

In [56]:
chumma.show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|double _year|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|          

In [57]:
chumma.drop("Longitude").show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+--------------------+------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|            Location|double _year|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+--------------------+------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     009| 

In [58]:
########################################  TO PERMENANTLY REMOVE IT  ##############################

chumma=chumma.drop("Latitude","Description")

In [59]:
chumma.show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Longitude|            Location|double _year|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|           RESIDENCE| false|    true|0924|     009|  12|            61|     08B|     1165074|     1875917|2015|02/10/2018 03:50:...|-

### GROUPBY

In [60]:
chumma.groupBy("Primary Type").count().show(10)

+--------------------+-----+
|        Primary Type|count|
+--------------------+-----+
|OFFENSE INVOLVING...| 6129|
|CRIMINAL SEXUAL A...|   71|
|            STALKING|  468|
|PUBLIC PEACE VIOL...| 4559|
|           OBSCENITY|  174|
|NON-CRIMINAL (SUB...|    4|
|                null|    5|
|               ARSON| 1239|
|            GAMBLING|  545|
|   CRIMINAL TRESPASS|17937|
+--------------------+-----+
only showing top 10 rows



**What are the top 10 number of reported crimes by Primary type, in descending order of occurence?**

In [62]:
chumma.groupBy("Primary Type").count().orderBy("count",ascending=False).show(10)

+-------------------+------+
|       Primary Type| count|
+-------------------+------+
|              THEFT|164981|
|            BATTERY|133205|
|    CRIMINAL DAMAGE| 81135|
|            ASSAULT| 50024|
| DECEPTIVE PRACTICE| 49635|
|      OTHER OFFENSE| 47257|
|          NARCOTICS| 38486|
|           BURGLARY| 36826|
|            ROBBERY| 30704|
|MOTOR VEHICLE THEFT| 29772|
+-------------------+------+
only showing top 10 rows



In [None]:
############################################################################################################################

## ROWS

### FILTERING ROWS

In [63]:
chumma.filter(col("Year") > 2010).show()

+--------+-----------+-------------------+--------------------+----+------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|      ID|Case Number|               Date|               Block|IUCR|      Primary Type|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Longitude|            Location|double _year|
+--------+-----------+-------------------+--------------------+----+------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|           BATTERY|           RESIDENCE| false|    true|0924|     009|  12|            61|     08B|     1165074|     1875917|201

In [96]:
chumma.filter((col("Arrest") == "true") | (col("Arrest") == "True")).show(5)   ####   | for OR and & for AND

+--------+-----------+-------------------+------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|      ID|Case Number|               Date|             Block|IUCR|Primary Type|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Longitude|            Location|double _year|
+--------+-----------+-------------------+------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|10224740|   HY411595|2015-09-05 12:45:00| 035XX W BARRY AVE|2023|   NARCOTICS|            SIDEWALK|  true|   false|1412|     014|  35|            21|      18|     1152037|     1920384|2015|02/10/2018 03:50:...|-87.71664

##### Add the reported crimes for an additional day, Dec-2018, to our dataset.

In [126]:
from pyspark.sql.functions import to_timestamp,col,lit
df = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a')).filter(col('Date') == lit('2018-12'))

####################      only filtering till 2018       #######################

df.show()

## ADDING THIS TO PUR ORIGINAL DATAFRAME


+--------+-----------+-------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|               Date|              Block|IUCR| Primary Type|     Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+-------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|12017906|   JD195459|2018-12-01 00:00:00|040XX N MARMORA AVE|2820|OTHER OFFENSE|TELEPHONE THREAT|           APARTMENT| false|    true|1624|     016|  38|            15|     08A|        null|        null|2018|03/27/2020 03:

In [98]:
df = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True).filter(col('Arrest') == 'true')

####################      only filtering till 2018       #######################

df.show()

+--------+-----------+--------------------+--------------------+----+-----------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|                Date|               Block|IUCR|     Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+--------------------+--------------------+----+-----------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224740|   HY411595|09/05/2015 12:45:...|   035XX W BARRY AVE|2023|        NARCOTICS|POSS: HEROIN(BRN/...|            SIDEWALK|  tr

### UNIQUE ROWS

In [154]:
rc.select("Year").distinct().show()

+----+
|Year|
+----+
|2003|
|2007|
|2018|
|2015|
|2006|
|2013|
|null|
|2014|
|2019|
|2004|
|2020|
|2012|
|2009|
|2016|
|2001|
|2005|
|2010|
|2011|
|2008|
|2017|
+----+
only showing top 20 rows



### SORTING ROWS

In [97]:
rc.orderBy("IUCR").show()           ###### in pandas we use sort fucntion

+--------------------+-----------+----+--------------------+----+------------+-------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|                  ID|Case Number|Date|               Block|IUCR|Primary Type|        Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------------------+-----------+----+--------------------+----+------------+-------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|  "message" : "In...|       null|null|                null|null|        null|               null|                null|  null|    null|null|    null|null|         

### APPENDING/ union / CONCAT 

In [100]:
chumma1=chumma
chumma1.show(5)

+--------+-----------+----+--------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|      ID|Case Number|Date|               Block|IUCR|Primary Type|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Longitude|            Location|double _year|
+--------+-----------+----+--------------------+----+------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|10224738|   HY411648|null|     043XX S WOOD ST|0486|     BATTERY|           RESIDENCE| false|    true|0924|     009|  12|            61|     08B|     1165074|     1875917|2015|02/10/2018 03:50:...|-87.669999562|(41.815117282, -8...|        4030|
|10224739|  

In [103]:
chumma.union(chumma1)

chumma.show()

+--------+-----------+----+--------------------+----+------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|      ID|Case Number|Date|               Block|IUCR|      Primary Type|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Longitude|            Location|double _year|
+--------+-----------+----+--------------------+----+------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+-------------+--------------------+------------+
|10224738|   HY411648|null|     043XX S WOOD ST|0486|           BATTERY|           RESIDENCE| false|    true|0924|     009|  12|            61|     08B|     1165074|     1875917|2015|02/10/2018 03:50:...|-87.669999562|(41.815117282, -8...|  

In [84]:
spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
simpleData = [("James",34),("Ann",34),
    ("Michael",33),("Scott",53),
    ("Robert",37),("Chad",27)
  ]

columns = ["firstname","age",]
df = spark.createDataFrame(data = simpleData, schema = columns)
df.show()
df1 = spark.createDataFrame(data = [("antony",27)], schema = columns)
df1.show()
df=df.union(df1)
df.show()

+---------+---+
|firstname|age|
+---------+---+
|    James| 34|
|      Ann| 34|
|  Michael| 33|
|    Scott| 53|
|   Robert| 37|
|     Chad| 27|
+---------+---+

+---------+---+
|firstname|age|
+---------+---+
|   antony| 27|
+---------+---+

+---------+---+
|firstname|age|
+---------+---+
|    James| 34|
|      Ann| 34|
|  Michael| 33|
|    Scott| 53|
|   Robert| 37|
|     Chad| 27|
|   antony| 27|
+---------+---+



#### Add the reported crimes for an additional day, Dec-2018, to our dataset.


In [85]:
from pyspark.sql.functions import to_timestamp,col,lit
df1 = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True).withColumn('Date',to_timestamp(col('Date'),'MM/dd/yyyy hh:mm:ss a')).filter(col('Date') == lit('2018-12'))

####################      only filtering till 2018       #######################

df1.show()




+--------+-----------+-------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|               Date|              Block|IUCR| Primary Type|     Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+-------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|12017906|   JD195459|2018-12-01 00:00:00|040XX N MARMORA AVE|2820|OTHER OFFENSE|TELEPHONE THREAT|           APARTMENT| false|    true|1624|     016|  38|            15|     08A|        null|        null|2018|03/27/2020 03:

In [86]:
df = spark.read.csv('rows.csv@accessType=DOWNLOAD',header=True)
df.show()

+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|                Date|               Block|IUCR|      Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|09/05/2015 01:30:...|     043XX S WOOD ST|0486|           BATTERY|DOMESTIC BATTERY ...|           RESIDENCE|

In [73]:
## ADDING THIS TO PUR ORIGINAL DATAFRAME

df=df.union(df1)


In [87]:
df.tail(2)

[Row(ID='  "status" : 500', Case Number=None, Date=None, Block=None, IUCR=None, Primary Type=None, Description=None, Location Description=None, Arrest=None, Domestic=None, Beat=None, District=None, Ward=None, Community Area=None, FBI Code=None, X Coordinate=None, Y Coordinate=None, Year=None, Updated On=None, Latitude=None, Longitude=None, Location=None),
 Row(ID='}', Case Number=None, Date=None, Block=None, IUCR=None, Primary Type=None, Description=None, Location Description=None, Arrest=None, Domestic=None, Beat=None, District=None, Ward=None, Community Area=None, FBI Code=None, X Coordinate=None, Y Coordinate=None, Year=None, Updated On=None, Latitude=None, Longitude=None, Location=None)]

In [88]:
df.filter(col('ID') == lit('12017906')).show()

+--------+-----------+--------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|                Date|              Block|IUCR| Primary Type|     Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+--------------------+-------------------+----+-------------+----------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|12017906|   JD195459|12/01/2018 12:00:...|040XX N MARMORA AVE|2820|OTHER OFFENSE|TELEPHONE THREAT|           APARTMENT| false|    true|1624|     016|  38|            15|     08A|        null|        null|2018|03/27/2020

### CHALLENGE QNS

In [144]:
### WHAT PERCENT OF REPORTED FUNCTIONS RESULTED IN AN ARREST

TOTAL TRUE ARREST CASES / TOTAL NUMBER OF ROWS * 100

In [147]:
rc.cache()
rc.count()

718078

In [158]:
rc.filter(col("Arrest") == "true").count()

149136

In [159]:
rc.select("Arrest").count()

718078

In [150]:
(149136/718078) * 100

20.768774422834287

In [145]:
### TOP 3 LOCATIONS FOR REPORTED CRIMES

In [91]:
rc.show(5)

+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|      ID|Case Number|               Date|               Block|IUCR|Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|    Latitude|    Longitude|            Location|
+--------+-----------+-------------------+--------------------+----+------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+------------+-------------+--------------------+
|10224738|   HY411648|2015-09-05 13:30:00|     043XX S WOOD ST|0486|     BATTERY|DOMESTIC BATTERY ...|           RESIDENCE| false|    true|0924|     00

In [92]:
rc.select("Location Description").distinct().show()
rc.select("Community Area").distinct().show()

+--------------------+
|Location Description|
+--------------------+
|SCHOOL - PRIVATE ...|
|AIRPORT TERMINAL ...|
|VEHICLE - COMMERCIAL|
|POLICE FACILITY/V...|
|RESIDENCE - YARD ...|
|CHA PARKING LOT /...|
|            SIDEWALK|
|AIRPORT TERMINAL ...|
|OTHER RAILROAD PR...|
|CTA GARAGE / OTHE...|
|            CAR WASH|
|    AIRPORT/AIRCRAFT|
|MEDICAL/DENTAL OF...|
|    FEDERAL BUILDING|
|         CTA STATION|
|SCHOOL, PUBLIC, G...|
|SPORTS ARENA/STADIUM|
|                FARM|
|VEHICLE - OTHER R...|
|               HOUSE|
+--------------------+
only showing top 20 rows

+--------------+
|Community Area|
+--------------+
|            51|
|             7|
|            15|
|            54|
|            11|
|            29|
|            69|
|            42|
|            73|
|            64|
|             3|
|            30|
|            34|
|            59|
|             8|
|            28|
|            22|
|            52|
|            35|
|            16|
+--------------+
only showing t

In [93]:
rc.groupBy("Community Area").count().orderBy("count",ascending=False).show(3)

+--------------+-----+
|Community Area|count|
+--------------+-----+
|            25|43315|
|             8|29772|
|            32|25882|
+--------------+-----+
only showing top 3 rows



In [94]:
rc.groupBy("Location Description").count().orderBy("count",ascending=False).show(3)

+--------------------+------+
|Location Description| count|
+--------------------+------+
|              STREET|159735|
|           RESIDENCE|121804|
|           APARTMENT| 92478|
+--------------------+------+
only showing top 3 rows

