In [44]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.sql.functions import format_number, when, col, array, udf, lit
import pandas as pd
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.sql.types import *
import pyspark.sql.functions as F
import matplotlib.pyplot as plt

In [45]:
# Create the notebook's SparkSession, or reuse one that is already running.
# The driver is given 4g of memory and a single core.
spark = (
    SparkSession.builder
    .appName("Predictive Crime Analysis - Indian Dataset")
    .config("spark.some.config.option", "some-value")
    .config("spark.driver.memory", "4g")
    .config("spark.driver.cores", "1")
    .getOrCreate()
)

In [46]:
# Explicit schema for the incident records: seven nullable text columns
# followed by two nullable double-precision coordinate columns.
# NOTE(review): the CSV reader in the visible cells uses inferSchema and never
# receives this schema, and the schema printed after loading also has a leading
# "District" column that is not listed here -- confirm before applying it.
crime_schema = StructType(
    [
        StructField(text_column, StringType(), True)
        for text_column in (
            "Event",
            "Circle",
            "Police Station",
            "Caller Source",
            "Event Type",
            "Event Sub-Type",
            "Date",
        )
    ]
    + [
        StructField(coordinate_column, DoubleType(), True)
        for coordinate_column in ("Latitude", "Longitude")
    ]
)

In [47]:
# Load the raw incident CSV. The first row supplies the column names and
# Spark infers each column's type from the data.
dataset = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "true")
    .csv("originaldataset2.csv")
)

In [48]:
# Print the column names, types and nullability of the loaded frame.
dataset.printSchema()

root
 |-- District: string (nullable = true)
 |-- Event: string (nullable = true)
 |-- Circle: string (nullable = true)
 |-- Police Station: string (nullable = true)
 |-- Caller Source: string (nullable = true)
 |-- Event Type: string (nullable = true)
 |-- Event Sub-Type: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)



In [49]:
# Preview: bring only the first five rows to the driver and render them as a pandas table.
dataset.limit(5).toPandas().head(n=5)

Unnamed: 0,District,Event,Circle,Police Station,Caller Source,Event Type,Event Sub-Type,Date,Latitude,Longitude
0,LUCKNOW,P01042100004,C1,PS1,PHONE,Information Against Police,Misbehavior By Prv,01/04/2021 00:00:00,26.834,81.008
1,LUCKNOW,P01042104316,C1,PS1,PHONE,Threat In Person,Attack,01/04/2021 12:09:00,26.828,81.014
2,LUCKNOW,P01042104847,C1,PS1,PHONE,Dispute,Dispute In Hospital,01/04/2021 12:51:00,26.84,81.009
3,LUCKNOW,P01042105074,C1,PS1,PHONE,Gambling,Play Cards,01/04/2021 13:10:00,26.828,81.002
4,LUCKNOW,P01042105152,C1,PS1,PHONE,Threat In Person,Attack,01/04/2021 13:18:00,26.834,81.033


In [50]:
# Remove records whose "Event Type" is one of the categories excluded from
# the analysis.
#
# The previous chain of != comparisons was missing an `&` before its last
# term, so Python tried to call the preceding Column object and raised
# "TypeError: 'Column' object is not callable". A single negated isin() list
# expresses the same exclusion without any operator chaining to get wrong.
#
# Rows with a null "Event Type" are still dropped, exactly as with !=,
# because the predicate evaluates to null for them and filter() keeps only
# rows where it is true.
dataset = dataset.filter(
    ~dataset["Event Type"].isin(
        [
            "Corona",
            "Unknown",
            "#NA",
            "Sos",
            "Unclaimed Information",
            "Police Help Required By 108",
        ]
    )
)

In [54]:
# Derive calendar features from the "dd/MM/yyyy HH:mm:ss" text held in Date:
#   Day     - the date half parsed into a real date
#   Month   - month number of Day
#   WeekDay - day-of-week number of Day
#   Hour    - leading field of the time half, cast to int
# Rows containing a null in any column are then discarded.
dataset = (
    dataset
    .withColumn("Day", F.to_date(F.split(F.col("Date"), " ")[0], "dd/MM/yyyy"))
    .withColumn("Month", F.month(F.col("Day")))
    .withColumn("WeekDay", F.dayofweek(F.col("Day")))
    .withColumn("Hour", F.split(F.split(F.col("Date"), " ")[1], ":")[0].cast("int"))
    .dropna()
)

# Show the first five enriched rows as a pandas table.
dataset.limit(5).toPandas().head()

Unnamed: 0,District,Event,Circle,Police Station,Caller Source,Event Type,Event Sub-Type,Date,Latitude,Longitude,Day,Month,WeekDay,Hour
0,LUCKNOW,P01042100004,C1,PS1,PHONE,Information Against Police,Misbehavior By Prv,01/04/2021 00:00:00,26.834,81.008,2021-04-01,4,5,0
1,LUCKNOW,P01042104316,C1,PS1,PHONE,Threat In Person,Attack,01/04/2021 12:09:00,26.828,81.014,2021-04-01,4,5,12
2,LUCKNOW,P01042104847,C1,PS1,PHONE,Dispute,Dispute In Hospital,01/04/2021 12:51:00,26.84,81.009,2021-04-01,4,5,12
3,LUCKNOW,P01042105074,C1,PS1,PHONE,Gambling,Play Cards,01/04/2021 13:10:00,26.828,81.002,2021-04-01,4,5,13
4,LUCKNOW,P01042105152,C1,PS1,PHONE,Threat In Person,Attack,01/04/2021 13:18:00,26.834,81.033,2021-04-01,4,5,13


AttributeError: 'GroupedData' object has no attribute 'show'

In [59]:
# Write the prepared frame as CSV, with a header row, under the "exportedDF" path.
# NOTE(review): no save mode is set, so Spark's default applies and the write
# fails if "exportedDF" already exists -- relevant when re-running the notebook.
(
    dataset.write
    .option("header", True)
    .csv("exportedDF")
)