In [2]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [3]:
from pyspark.sql import types
from pyspark.sql import functions as F

In [4]:
import os
import pandas as pd
import pendulum as pdl

# inputs
- year

In [5]:
# Cities covered by the project, as display names.
cities = [
    'Chicago',
    'San Francisco',
    'Los Angeles',
    'Austin',
]
# Same cities in identifier form: lower-case, spaces replaced by underscores.
f_cities = [name.lower().replace(' ', '_') for name in cities]

In [6]:
# Bucket location taken from the environment; os.getenv gives None when the variable is unset.
# NOTE(review): used below as the prefix of read/write paths, so presumably it
# already includes the gs:// scheme — confirm.
gcs_bkt = os.getenv('GCP_GCS_BUCKET')

In [7]:
# Locations of the connector jar and the service-account key file, from the environment.
jar_path = os.getenv('JAR_FILE_LOC')
creds_path = '/.google/credentials/' + os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

# Local Spark configuration with the connector jar and service-account auth.
# The last setting is the extra "ignore" flag meant for cities that have no
# data for the requested year.
# NOTE(review): Spark documents ignoreMissingFiles as covering files that
# disappear after listing — confirm it also covers a glob that matches nothing.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('project_year_read')
    .set("spark.jars", jar_path)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", creds_path)
    .set("spark.sql.files.ignoreMissingFiles", "true")
)

### Only if a SparkContext is already running, stop it first:
`sc.stop()`

In [8]:
# Start the Spark context with the settings collected in `conf`.
sc = SparkContext(conf=conf)

22/10/26 07:33:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [9]:
# Hadoop configuration of the running context; the GCS filesystem classes and
# the service-account credentials are registered on it below.
hconf = sc._jsc.hadoopConfiguration()

gcs_hadoop_settings = (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", creds_path),
    ("fs.gs.auth.service.account.enable", "true"),
)
for hadoop_key, hadoop_value in gcs_hadoop_settings:
    hconf.set(hadoop_key, hadoop_value)

In [10]:
# Session built from the configuration of the context started above.
spark = (
    SparkSession.builder
    .config(conf=sc.getConf())
    .getOrCreate()
)

***

### TODO: replace the single-city read below with this loop:
```
dict_df = {}
for city in cities:
    df_pq = spark.read \
        .option("header", "true") \
        .option(, ) \
        .parquet(f'{gcs_bkt}/pq/{city}/{year}/*')
    dict_df.update({city: df_pq})
```

In [18]:
# Run inputs. Later cells read `city` (date parsing, city label) and `year`
# (output folder), so both have to be assigned before anything is loaded.
# `city` is one of `f_cities`; only f_cities[0] has a date parser so far.
city = f_cities[0]
year = 2001

# Parquet files carry their own schema, so the CSV-style "header" option is not needed.
df_pq = spark.read.parquet(f'{gcs_bkt}/pq/{city}/{year}/*')

[Stage 9:>                                                          (0 + 1) / 1]                                                                                

### check count here: pq df
Command: `df_pq.count()`

Output: `485853`

In [19]:
def parse_dt(dt_str):
    """
    Parse a date string of format 'MM/DD/YYYY HH:mm:ss A' into a datetime.

    Only the first entry of `f_cities` is handled; the notebook-level `city`
    decides whether that format is applied.

    Args:
        dt_str: date string, or None.

    Returns:
        The datetime produced by pendulum, or None when `dt_str` is None or
        when `city` is not the first entry of `f_cities`.
    """
    # a null cell reaches the UDF as None; pass it through instead of failing the task
    if dt_str is None:
        return None
    if city == f_cities[0]:
        return pdl.from_format(dt_str, 'MM/DD/YYYY HH:mm:ss A')
    # no format defined for the remaining cities yet
    return None

parse_dt_udf = F.udf(parse_dt, returnType=types.TimestampType())

In [68]:
# 1. drop exact duplicate rows
# 2. keep only rows that have a value in 'date'
# 3. add 'timestamp', parsed from 'date' by the UDF
df_time = (
    df_pq
    .distinct()
    .where(F.col('date').isNotNull())
    .withColumn('timestamp', parse_dt_udf(F.col('date')))
)

### check count here: non-null df
Command:
`df_time.count()`

Output:
`38114`

### check parsed dates

Command: `df_time.head(10)`

Output: must contain datetimes

### check date here: csv df, datetime col
Command:
```
df_time \
    .filter(F.month('timestamp') == 1) \
    .groupBy(F.dayofmonth('timestamp')) \
    .count() \
    .orderBy('count', ascending=False).show()
```
Output:
```
+---------------------+-----+
|dayofmonth(Timestamp)|count|
+---------------------+-----+
|                    1| 1825|
|                   12| 1353|
|                   15| 1312|
|                   13| 1311|
|                   26| 1296|
|                    6| 1290|
|                   17| 1288|
|                   18| 1278|
|                    5| 1267|
|                   20| 1256|
|                   16| 1251|
|                   23| 1250|
|                   10| 1237|
|                   11| 1228|
|                   27| 1215|
|                   19| 1214|
|                   30| 1211|
|                   31| 1189|
|                    9| 1184|
|                   25| 1183|
+---------------------+-----+
only showing top 20 rows
```

In [88]:
# columns kept for the analysis, in output order
df_time_cols = [
    'case_number',
    'timestamp',
    'city',
    'street',
    'primary_type',
    'description',
    'location_description',
    'arrest',
    'domestic',
    'beat',
    'latitude',
    'longitude',
]

In [89]:
def parse_st(block):
    """
    Extract the street name from a block string of format 'NNNXX D STREET ST ..'.

    The first two whitespace-separated tokens are dropped and the remaining
    tokens are joined back together with single spaces.

    Args:
        block: block description, or None.

    Returns:
        The street name, or None when `block` is None or has no tokens after
        the first two.
    """
    # a null cell reaches the UDF as None
    if block is None:
        return None
    street_parts = block.split()[2:]
    # nothing left after the first two tokens: no street to report
    if not street_parts:
        return None
    return ' '.join(street_parts)

parse_st_udf = F.udf(parse_st, returnType=types.StringType())

# add 'street' (parsed from 'block') and a 'city' label (title case, spaces
# instead of underscores), then keep only the analysis columns
df_select = (
    df_time
    .withColumn('street', parse_st_udf('block'))
    .withColumn('city', F.lit(city.title().replace('_', ' ')))
    .select(*df_time_cols)
)

In [90]:
# Write the selected columns to the per-year folder, replacing what is there.
# df_select is the frame produced by the column-selection step above.
# NOTE(review): overwrite mode replaces the whole year folder — once several
# cities are processed, union the per-city frames before this write.
df_select.write.parquet(f'{gcs_bkt}/year/{year}/', mode='overwrite')

                                                                                

[Row(case_number='G600427', timestamp=datetime.datetime(2001, 10, 6, 16, 0), city='Chicago', street='RIDGEWAY AV', primary_type='THEFT', description='$500 AND UNDER', location_description='STREET', arrest=False, domestic=False, beat='1723', latitude=41.958370208740234, longitude=-87.72177124023438),
 Row(case_number='G598862', timestamp=datetime.datetime(2001, 10, 5, 20, 30), city='Chicago', street='79 ST', primary_type='ASSAULT', description='SIMPLE', location_description='RESIDENCE', arrest=False, domestic=False, beat='0834', latitude=41.74931716918945, longitude=-87.73786163330078),
 Row(case_number='G600201', timestamp=datetime.datetime(2001, 10, 6, 14, 5), city='Chicago', street='ST LAWRENCE AV', primary_type='BATTERY', description='SIMPLE', location_description='RESIDENCE', arrest=False, domestic=True, beat='0323', latitude=41.76348114013672, longitude=-87.6102294921875),
 Row(case_number='G601505', timestamp=datetime.datetime(2001, 10, 7, 6, 8, 22), city='Chicago', street='COLUM

In [None]:
# Shut down the Spark context started earlier in the notebook.
sc.stop()

### check date Jan: pq df
Command:
```
df_pq \
    .filter(F.month('Timestamp') == 1).count()
```
Output: `1825`

### check date Jan1: pq df
Command:
```
df_pq \
    .filter(F.dayofmonth('Timestamp') == 1).count()
```
Output: `1825`

### check date here: pq df
Command:
```
df_pq \
    .filter(F.month('Timestamp') == 1) \
    .groupBy(F.dayofmonth('Timestamp')) \
    .count() \
    .orderBy('count', ascending=False).show()
```