In [66]:
import findspark
findspark.init()

In [67]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [68]:
df_off = spark.read.csv('C:/Users/Досжан/Desktop/California Crime and Law Enforcement/ca_offenses_by_city.csv', inferSchema=True, header=True)
df_law = spark.read.csv('C:/Users/Досжан/Desktop/California Crime and Law Enforcement/ca_law_enforcement_by_city.csv', inferSchema=True, header=True)

In [69]:
df_law.show(5)

+-----------+----------+---------+
|       City|Population|Total law|
+-----------+----------+---------+
|enforcement|      NULL|     NULL|
| employees"|    Total |     NULL|
|  officers"|    Total |     NULL|
| civilians"|      NULL|     NULL|
|    Alameda|    78,613|      112|
+-----------+----------+---------+
only showing top 5 rows



In [70]:
df_off.show(5)

+------------+----------+-------------+------------------------------------+-------------------------+------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+
|        City|Population|Violent crime|Murder and nonnegligent manslaughter|Rape (revised definition)|Rape (legacy definition)|Robbery|Aggravated assault|Property crime|Burglary|Larceny-theft|Motor vehicle theft|Arson|
+------------+----------+-------------+------------------------------------+-------------------------+------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+
|    Adelanto|    33,005|          212|                                   2|                       14|                    NULL|     48|               148|           808|     434|          254|                120|   24|
|Agoura Hills|    20,970|           15|                                   0|                        1|                    NU

In [71]:
import pandas as pd
off_pd = pd.read_csv("C:/Users/Досжан/Desktop/California Crime and Law Enforcement/ca_offenses_by_city.csv")
law_pd = pd.read_csv("C:/Users/Досжан/Desktop/California Crime and Law Enforcement/ca_law_enforcement_by_city.csv")

In [72]:
off_pd.head()

Unnamed: 0,City,Population,Violent crime,Murder and nonnegligent manslaughter,Rape (revised definition),Rape (legacy definition),Robbery,Aggravated assault,Property crime,Burglary,Larceny-theft,Motor vehicle theft,Arson
0,Adelanto,33005,212,2,14,,48,148,808,434,254,120,24
1,Agoura Hills,20970,15,0,1,,6,8,310,82,217,11,0
2,Alameda,78613,148,2,7,,61,78,1819,228,1245,346,18
3,Albany,19723,34,1,6,,16,11,605,95,447,63,0
4,Alhambra3,86175,168,1,13,,74,80,1929,305,1413,211,6


In [73]:
law_pd.head()

Unnamed: 0,City,Population,Total law\renforcement\remployees,Total \rofficers,Total \rcivilians
0,Alameda,78613,112,83,29
1,Albany,19723,30,23,7
2,Alhambra,86175,128,85,43
3,Alturas,2566,6,5,1
4,Anaheim,349471,577,399,178


In [74]:
off_pd['Rape (legacy definition)'].notnull().sum()

0

In [75]:
df_off = df_off.drop("Index")
df_off = df_off.drop('Rape (legacy definition)')
off_pd.drop('Rape (legacy definition)', axis=1, inplace = True)

In [76]:
# Группировка данных по городу и суммирование криминальных преступлений
df_off.select(['City']).distinct().count()

460

In [77]:
df_off.show(5)

+------------+----------+-------------+------------------------------------+-------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+
|        City|Population|Violent crime|Murder and nonnegligent manslaughter|Rape (revised definition)|Robbery|Aggravated assault|Property crime|Burglary|Larceny-theft|Motor vehicle theft|Arson|
+------------+----------+-------------+------------------------------------+-------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+
|    Adelanto|    33,005|          212|                                   2|                       14|     48|               148|           808|     434|          254|                120|   24|
|Agoura Hills|    20,970|           15|                                   0|                        1|      6|                 8|           310|      82|          217|                 11|    0|
|     Alameda|    78,613|     

In [78]:
df_off.printSchema()

root
 |-- City: string (nullable = true)
 |-- Population: string (nullable = true)
 |-- Violent crime: string (nullable = true)
 |-- Murder and nonnegligent manslaughter: integer (nullable = true)
 |-- Rape (revised definition): string (nullable = true)
 |-- Robbery: string (nullable = true)
 |-- Aggravated assault: string (nullable = true)
 |-- Property crime: string (nullable = true)
 |-- Burglary: string (nullable = true)
 |-- Larceny-theft: string (nullable = true)
 |-- Motor vehicle theft: string (nullable = true)
 |-- Arson: string (nullable = true)



In [79]:
# Perform an inner join on the 'city' column
merged_df = df_law.join(df_off, on=['city'], how='inner')


In [80]:
from pyspark.sql.functions import col

In [81]:
# Calculate the rate of violent crime per law enforcement employee
merged_df = merged_df.withColumn("violent_crime_rate", col('Violent crime') / col('Total law'))
merged_df.show()

+-----------+----------+---------+----------+-------------+------------------------------------+-------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+-------------------+
|       City|Population|Total law|Population|Violent crime|Murder and nonnegligent manslaughter|Rape (revised definition)|Robbery|Aggravated assault|Property crime|Burglary|Larceny-theft|Motor vehicle theft|Arson| violent_crime_rate|
+-----------+----------+---------+----------+-------------+------------------------------------+-------------------------+-------+------------------+--------------+--------+-------------+-------------------+-----+-------------------+
|    Alameda|    78,613|      112|    78,613|          148|                                   2|                        7|     61|                78|         1,819|     228|        1,245|                346|   18| 1.3214285714285714|
|     Albany|    19,723|       30|    19,723|           34|     

In [82]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Define a custom schema for the law_enforcement_df
law_enforcement_schema = StructType([
    StructField("law_enforcement_city", StringType(), True),
    StructField("law_enforcement_employees", IntegerType(), True),
    StructField("population", IntegerType(), True)
])

# Define a custom schema for the offenses_df
offenses_schema = StructType([
    StructField("offenses_city", StringType(), True),
    StructField("violent_crime", IntegerType(), True)
])

# Load the datasets with custom schemas
law_enforcement_df = spark.read.csv("ca_law_enforcement_by_city.csv", header=True, schema=law_enforcement_schema)
offenses_df = spark.read.csv("ca_offenses_by_city.csv", header=True, schema=offenses_schema)


In [83]:
# Perform an inner join on the aliased city columns
merged_df = law_enforcement_df.join(offenses_df, (law_enforcement_df["law_enforcement_city"] == offenses_df["offenses_city"]), "inner")

# Filter the data for cities with a population greater than 100,000
filtered_df = merged_df.filter(col("population") > 100000)

In [84]:
# Calculate the rate of violent crime per law enforcement employee
filtered_df = filtered_df.withColumn("violent_crime_rate", col("violent_crime") / col("law_enforcement_employees"))

In [85]:
# Sort the data by the violent crime rate in descending order
sorted_df = filtered_df.orderBy(col("violent_crime_rate").desc())

In [86]:
# Write the results to a CSV file
sorted_df.toPandas().to_csv("output.csv", index=False)
sorted_df.show()

+--------------------+-------------------------+----------+-------------+-------------+------------------+
|law_enforcement_city|law_enforcement_employees|population|offenses_city|violent_crime|violent_crime_rate|
+--------------------+-------------------------+----------+-------------+-------------+------------------+
+--------------------+-------------------------+----------+-------------+-------------+------------------+



In [87]:
result_df = spark.read.csv("output.csv", header=True, inferSchema=True)
result_df.show()

+--------------------+-------------------------+----------+-------------+-------------+------------------+
|law_enforcement_city|law_enforcement_employees|population|offenses_city|violent_crime|violent_crime_rate|
+--------------------+-------------------------+----------+-------------+-------------+------------------+
+--------------------+-------------------------+----------+-------------+-------------+------------------+



In [88]:
# df = df.withColumn("Population", df.Population.cast("Integer"))
# # df = df.withColumn("Violent crime", df["Violent crime"].cast("Integer"))
# # df = df.withColumn("Murder and nonnegligent manslaughter", df["Murder and nonnegligent manslaughter"].cast("Integer"))
# # df = df.withColumn("Rape (revised definition)", df["Rape (revised definition)"].cast("Integer"))
# # df = df.withColumn("Robbery", df["Robbery"].cast("Integer"))
# # df = df.withColumn("Aggravated assault", df["Aggravated assault"].cast("Integer"))
# # df = df.withColumn("Property", df["Property"].cast("Integer"))
# # df = df.withColumn("Burglary", df["Burglary"].cast("Integer"))
# # df = df.withColumn("Larceny-theft", df["Larceny-theft"].cast("Integer"))
# # df = df.withColumn("Motor vehicle theft", df["Motor vehicle theft"].cast("Integer"))
# # df = df.withColumn("Arson", df["Arson"].cast("Integer"))
# # col_lst = df.columns.copy()
# # col_lst.remove('City')
# # for i in col_lst:
# #     df = df.withColumn(f"{i}", df[f"{i}"].cast("Integer"))

# # from pyspark.sql.functions import col, sum

# # df.select(sum(col("Population").isNull().cast("integer"))).show()

# df.printSchema()
# df.show(5)