In [14]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
import pandas as pd

In [15]:
# Start (or reuse) the Spark session for this notebook, then load the FIR CSV.
# header=True takes column names from the first row; inferSchema=True asks Spark
# to infer column types from the data instead of reading every column as a string.
spark = (
    SparkSession.builder
    .appName('CrimePrediction')
    .getOrCreate()
)
df = (
    spark.read
    .format("csv")
    .options(header=True, inferSchema=True)
    .load("../../datasets/FIR_Details_Data.csv")
)

In [16]:
# List the column names of the loaded frame. Several names contain spaces and one
# contains a tab ('Arrested Count\tNo.'), so they must be quoted exactly when selected.
df.columns

['District_Name',
 'UnitName',
 'FIRNo',
 'RI',
 'Year',
 'Month',
 'Offence_From_Date',
 'Offence_To_Date',
 'FIR_Reg_DateTime',
 'FIR_Date',
 'FIR Type',
 'FIR_Stage',
 'Complaint_Mode',
 'CrimeGroup_Name',
 'CrimeHead_Name',
 'Latitude',
 'Longitude',
 'ActSection',
 'IOName',
 'KGID',
 'IOAssigned_Date',
 'Internal_IO',
 'Place of Offence',
 'Distance from PS',
 'Beat_Name',
 'Village_Area_Name',
 'Male',
 'Female',
 'Boy',
 'Girl',
 'Age 0',
 'VICTIM COUNT',
 'Accused Count',
 'Arrested Male',
 'Arrested Female',
 'Arrested Count\tNo.',
 'Accused_ChargeSheeted Count',
 'Conviction Count',
 'FIR_ID',
 'Unit_ID',
 'Crime_No']

In [17]:
# Total number of rows in the loaded frame (this is a Spark action, so it runs a job).
df.count()

1694191

In [18]:
# Number of rows per CrimeGroup_Name. show() with default arguments prints only the
# first 20 groups and truncates long names, and the result is not sorted.
df.groupBy('CrimeGroup_Name').count().show()

+--------------------+------+
|     CrimeGroup_Name| count|
+--------------------+------+
|      BURGLARY - DAY|  8328|
|      FALSE EVIDENCE|   259|
|Karnataka State L...| 91769|
|      UNNATURAL SEX |    52|
|ESCAPE FROM LAWFU...|   340|
|             SUICIDE|  6145|
|      PUBLIC JUSTICE|    31|
|      COUNTERFEITING|   320|
|      MISSING PERSON|126237|
|Attempting to com...|  1524|
|         CYBER CRIME| 79756|
|            ELECTION|  2671|
|           FOREIGNER|   431|
|CRIMINAL INTIMIDA...| 18092|
|OFFENCES AGAINST ...| 10072|
|      ARMS ACT  1959|  1665|
|ATTEMPT TO CULPAB...|   287|
|        ADULTERATION|   158|
|WRONGFUL RESTRAIN...|  3501|
|CULPABLE HOMICIDE...|   673|
+--------------------+------+
only showing top 20 rows



In [19]:
# Derive an integer 'Date' column from the first '/'-separated field of FIR_Date.
# NOTE(review): whether that first field is the day or the month depends on the
# FIR_Date string format, which is not visible here — confirm against the data.
d_split = pyspark.sql.functions.split(df['FIR_Date'], '/')
df = df.withColumn(
    'Date',
    d_split.getItem(0).cast(IntegerType()),
)

## Replacing District Values

In [20]:
# Collapse city / district / special-unit variants of District_Name into one label each.
# No replacement value is itself a key in this mapping, so a single pass yields the
# same result as applying the replacements one after another.
district_aliases = {
    'Belagavi City': 'Belagavi',
    'Belagavi Dist': 'Belagavi',
    'Bengaluru City': 'Bengaluru',
    'Bengaluru Dist': 'Bengaluru',
    'ISD Bengaluru': 'Bengaluru',
    'CID': 'Bengaluru',
    'Kalaburagi City': 'Kalaburagi',
    'Hubballi Dharwad City': 'Dharwad',
    'K.G.F': 'Kolar',
    'Mangaluru City': 'Mangaluru',
}
df = df.na.replace(district_aliases, subset='District_Name')

In [21]:
# Spot-check the district labels after replacement. show() with default arguments
# prints only 20 rows and truncates long values, so this is not the complete list.
df.select('District_Name').distinct().show()

+--------------------+
|       District_Name|
+--------------------+
|             Ballari|
|            Bagalkot|
|            Belagavi|
|           Bengaluru|
|               Bidar|
|      Chamarajanagar|
|      Chickballapura|
|      Chikkamagaluru|
|Coastal Security ...|
|          Davanagere|
|    Dakshina Kannada|
|         Chitradurga|
|             Dharwad|
|              Haveri|
|               Gadag|
|              Hassan|
|  Karnataka Railways|
|              Kodagu|
|          Kalaburagi|
|               Kolar|
+--------------------+
only showing top 20 rows



In [22]:
# List the distinct values present in the 'FIR Type' column.
df.select('FIR Type').distinct().show()

+-----------+
|   FIR Type|
+-----------+
|    Heinous|
|Non Heinous|
|       NULL|
+-----------+



In [23]:
# Relabel the string value 'NULL' in 'FIR Type' as 'Not Applicable', then re-list
# the distinct values to confirm the change.
# NOTE(review): this matches the string 'NULL' only; genuine null entries, if any,
# would presumably need fillna instead — confirm.
fir_type_relabel = {'NULL': 'Not Applicable'}
df = df.na.replace(fir_type_relabel, subset=['FIR Type'])
df.select('FIR Type').distinct().show()

+--------------+
|      FIR Type|
+--------------+
|       Heinous|
|   Non Heinous|
|Not Applicable|
+--------------+



In [24]:
# Print each column's inferred type and nullability for the current frame.
df.printSchema()

root
 |-- District_Name: string (nullable = true)
 |-- UnitName: string (nullable = true)
 |-- FIRNo: string (nullable = true)
 |-- RI: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Offence_From_Date: timestamp (nullable = true)
 |-- Offence_To_Date: string (nullable = true)
 |-- FIR_Reg_DateTime: timestamp (nullable = true)
 |-- FIR_Date: string (nullable = true)
 |-- FIR Type: string (nullable = true)
 |-- FIR_Stage: string (nullable = true)
 |-- Complaint_Mode: string (nullable = true)
 |-- CrimeGroup_Name: string (nullable = true)
 |-- CrimeHead_Name: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- ActSection: string (nullable = true)
 |-- IOName: string (nullable = true)
 |-- KGID: string (nullable = true)
 |-- IOAssigned_Date: string (nullable = true)
 |-- Internal_IO: integer (nullable = true)
 |-- Place of Offence: string (nullable = true)
 |-- Distance fro

In [25]:
# Narrow the frame to the location, date-part and FIR-type columns.
selected_fields = ['District_Name', 'UnitName', 'Date', 'Month', 'Year', 'FIR Type']
model_columns = df.select(*selected_fields)

In [26]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

# Feature stages for the three categorical columns. Each column gets a
# StringIndexer (string -> numeric index column) followed by a OneHotEncoder
# (index column -> vector column). These are only stage definitions; nothing is
# fitted or transformed in this cell.

# District_Name -> District_Index -> District_Vec
District_NameIdx = StringIndexer(inputCol='District_Name',
                                 outputCol='District_Index')
DistrictEncode = OneHotEncoder(inputCol='District_Index',
                               outputCol='District_Vec')

# UnitName -> UnitName_Index -> UnitName_Vec
UnitNameIdx = StringIndexer(inputCol='UnitName',
                            outputCol='UnitName_Index')
UnitNameEncode = OneHotEncoder(inputCol='UnitName_Index',
                               outputCol='UnitName_Vec')

# FIR Type -> FirType_Index -> FirType_Vec
Fir_TypeIdx = StringIndexer(inputCol='FIR Type',
                            outputCol='FirType_Index')
# Fix: this encoder was previously assigned to `UnitNameEncode`, which silently
# discarded the UnitName encoder defined above. It now has its own name so both
# encoders remain available.
Fir_TypeEncode = OneHotEncoder(inputCol='FirType_Index',
                               outputCol='FirType_Vec')

1055