In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Build (or reuse, via getOrCreate) the SparkSession used throughout the
# notebook, running against the "local" master under the app name "Project".
# NOTE(review): "spark.some.config.option" / "some-value" looks like a
# placeholder setting — confirm whether it is actually needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Project")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [8]:
from pyspark.sql.types import *

# Load Crimes.csv with the first line treated as the header row and with
# column types inferred by Spark rather than declared up front.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('Crimes.csv')
)

In [9]:
from pyspark.sql.types import  (StructType, 
                                StructField, 
                                DateType, 
                                BooleanType,
                                DoubleType,
                                IntegerType,
                                StringType,
                               TimestampType)
# Explicit schema for the crimes data: one nullable field per column, added in
# the order listed below. Identifier-like columns are kept as strings, the two
# flags are booleans, coordinates are doubles and Year is an integer.
# NOTE(review): "Date" is declared as a string while "UpdatedOn" is a DateType
# — confirm this asymmetry is intentional and that UpdatedOn values in the
# file actually parse as dates.
crimes_schema = (
    StructType()
    .add("ID", StringType(), True)
    .add("CaseNumber", StringType(), True)
    .add("Date", StringType(), True)
    .add("Block", StringType(), True)
    .add("IUCR", StringType(), True)
    .add("PrimaryType", StringType(), True)
    .add("Description", StringType(), True)
    .add("LocationDescription", StringType(), True)
    .add("Arrest", BooleanType(), True)
    .add("Domestic", BooleanType(), True)
    .add("Beat", StringType(), True)
    .add("District", StringType(), True)
    .add("Ward", StringType(), True)
    .add("CommunityArea", StringType(), True)
    .add("FBICode", StringType(), True)
    .add("XCoordinate", DoubleType(), True)
    .add("YCoordinate", DoubleType(), True)
    .add("Year", IntegerType(), True)
    .add("UpdatedOn", DateType(), True)
    .add("Latitude", DoubleType(), True)
    .add("Longitude", DoubleType(), True)
    .add("Location", StringType(), True)
)

In [10]:
# Read Crimes.csv using the explicit crimes_schema (no type inference); the
# first line of the file is treated as the header row.
crimes = (
    spark.read
    .schema(crimes_schema)
    .option("header", True)
    .csv("Crimes.csv")
)

In [11]:
# Count the rows of the crimes dataframe, then report the total.
record_count = crimes.count()
print(" The crimes dataframe has {} records".format(record_count))

 The crimes dataframe has 6600889 records


In [12]:
# Column names of the crimes dataframe, in schema order.
crimes.schema.names

['ID',
 'CaseNumber',
 'Date',
 'Block',
 'IUCR',
 'PrimaryType',
 'Description',
 'LocationDescription',
 'Arrest',
 'Domestic',
 'Beat',
 'District',
 'Ward',
 'CommunityArea',
 'FBICode',
 'XCoordinate',
 'YCoordinate',
 'Year',
 'UpdatedOn',
 'Latitude',
 'Longitude',
 'Location']

In [13]:
# Preview the first 10 rows of the Block and Arrest columns without
# truncating long cell values.
crimes.select(["Block", "Arrest"]).show(n=10, truncate=False)

+----------------------+------+
|Block                 |Arrest|
+----------------------+------+
|047XX W OHIO ST       |false |
|066XX S MARSHFIELD AVE|true  |
|044XX S LAKE PARK AVE |false |
|051XX S MICHIGAN AVE  |false |
|047XX W ADAMS ST      |false |
|049XX S DREXEL BLVD   |false |
|070XX S MORGAN ST     |false |
|042XX S PRAIRIE AVE   |false |
|036XX S WOLCOTT AVE   |true  |
|097XX S PRAIRIE AVE   |false |
+----------------------+------+
only showing top 10 rows



In [16]:
# Number of distinct values in the PrimaryType column.
crimes.select("PrimaryType").dropDuplicates().count()

35

In [20]:
# Number of distinct values in the District column.
crimes.select("District").dropDuplicates().count()

25

In [21]:
# Percentage of records whose Arrest flag is true, out of all records.
# A single grouped count supplies both the numerator and the denominator, so
# the dataset is scanned once instead of running two separate count() jobs.
# Rows with a null Arrest form their own group and still count towards the
# total, so the denominator equals crimes.count(). As with the plain division,
# an empty dataframe raises ZeroDivisionError.
arrest_counts = {
    row["Arrest"]: row["count"]
    for row in crimes.groupBy("Arrest").count().collect()
}
arrest_counts.get(True, 0) / sum(arrest_counts.values()) * 100

27.918754579875525