In [1]:
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [2]:
from pyspark.sql import types
from pyspark.sql import functions as F

In [3]:
import os
import pandas as pd
import pendulum as pdl

In [4]:
from city_vars import dict_cities

# inputs
- city
- fname

In [5]:
city = 'Austin'
fname = ''
dict_city = dict_cities[city]

In [7]:
# for city-specific data
cities = ['Chicago', 'San Francisco', 'Los Angeles', 'Austin']
f_cities = [c.replace(' ', '_').lower() for c in cities]

In [6]:
gcs_bkt = os.getenv('GCP_GCS_BUCKET')

In [7]:
jar_path = os.getenv('JAR_FILE_LOC')
creds_path = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')

conf = SparkConf() \
    .setMaster('spark://city-crimes-spark-1:7077') \
    .setAppName('proj_file_read') \
    .set("spark.jars", jar_path)

### Only if an existing one already runs:
`sc.stop()`

In [10]:
sc.stop()

In [10]:
sc = SparkContext(conf=conf)

22/11/11 14:41:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [11]:
sc.addFile("austin2015.csv")
sc.addPyFile("city_vars.py")

In [12]:
hconf = sc._jsc.hadoopConfiguration()

hconf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hconf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hconf.set("fs.gs.auth.service.account.json.keyfile", creds_path)
hconf.set("fs.gs.auth.service.account.enable", "true")

In [13]:
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

***

### first, open dataset page and check data dictionary on columns

### 1-time sample download for pandas reading to infer schema for everything else:

command:
`!wget https://data.cityofchicago.org/api/views/hx8q-mf9v/rows.csv?accessType=DOWNLOAD`

note: file is `Crimes_-_2012.csv`

output:
```
--2022-10-22 08:53:42--  https://data.cityofchicago.org/api/views/hx8q-mf9v/rows.csv?accessType=DOWNLOAD
Resolving data.cityofchicago.org (data.cityofchicago.org)... 52.206.140.205, 52.206.68.26, 52.206.140.199
Connecting to data.cityofchicago.org (data.cityofchicago.org)|52.206.140.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘rows.csv?accessType=DOWNLOAD’

rows.csv?accessType     [          <=>       ]  75.99M  2.54MB/s    in 28s     

2022-10-22 08:54:11 (2.68 MB/s) - ‘rows.csv?accessType=DOWNLOAD’ saved [79677853]
```

In [16]:
!wget https://data.lacity.org/api/views/2nrs-mtv8/rows.csv?accessType=DOWNLOAD -O laPresent.csv

--2022-11-11 14:44:38--  https://data.lacity.org/api/views/2nrs-mtv8/rows.csv?accessType=DOWNLOAD
Resolving data.lacity.org (data.lacity.org)... 52.206.68.26, 52.206.140.205, 52.206.140.199
Connecting to data.lacity.org (data.lacity.org)|52.206.68.26|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/csv]
Saving to: ‘laPresent.csv’

laPresent.csv           [              <=>   ] 146.26M  5.60MB/s    in 26s     

2022-11-11 14:45:05 (5.70 MB/s) - ‘laPresent.csv’ saved [153366675]



### check count here: raw csv file
command:
`!wc -l rows.csv?accessType=DOWNLOAD`

chicago1: `485854 rows.csv?accessType=DOWNLOAD`
chicago12: `336247 rows.csv?accessType=DOWNLOAD`
chicago15: `264706 chicago.csv`
chicago20: `211936 chicago2020.csv`

austin2015: `38574 austin2015.csv`
austin: `35098 rows.csv?accessType=DOWNLOAD.1`

la old: `2119798 rows.csv?accessType=DOWNLOAD.3`
la present: `599751 laPresent.csv`

san francisco: `2129526 rows.csv?accessType=DOWNLOAD.2`

In [18]:
!wc -l austin2015.csv

38574 austin2015.csv


In [19]:
df_pd = pd.read_csv('laPresent.csv', nrows=1000)
df_pd.columns

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON'],
      dtype='object')

### see sample of data
Command: `df_pd`

## Chicago 2011-2022

In [21]:
pd.set_option('display.max_columns', None)
df_pd

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
0,10224738,HY411648,09/05/2015 01:30:00 PM,043XX S WOOD ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,True,924,9,12,61,08B,1165074.0,1875917.0,2015,02/10/2018 03:50:01 PM,41.815117,-87.670000,"(41.815117282, -87.669999562)"
1,10224739,HY411615,09/04/2015 11:30:00 AM,008XX N CENTRAL AVE,0870,THEFT,POCKET-PICKING,CTA BUS,False,False,1511,15,29,25,06,1138875.0,1904869.0,2015,02/10/2018 03:50:01 PM,41.895080,-87.765400,"(41.895080471, -87.765400451)"
2,10224740,HY411595,09/05/2015 12:45:00 PM,035XX W BARRY AVE,2023,NARCOTICS,POSS: HEROIN(BRN/TAN),SIDEWALK,True,False,1412,14,35,21,18,1152037.0,1920384.0,2015,02/10/2018 03:50:01 PM,41.937406,-87.716650,"(41.937405765, -87.716649687)"
3,10224741,HY411610,09/05/2015 01:00:00 PM,0000X N LARAMIE AVE,0560,ASSAULT,SIMPLE,APARTMENT,False,True,1522,15,28,25,08A,1141706.0,1900086.0,2015,02/10/2018 03:50:01 PM,41.881903,-87.755121,"(41.881903443, -87.755121152)"
4,10224742,HY411435,09/05/2015 10:55:00 AM,082XX S LOOMIS BLVD,0610,BURGLARY,FORCIBLE ENTRY,RESIDENCE,False,False,614,6,21,71,05,1168430.0,1850165.0,2015,02/10/2018 03:50:01 PM,41.744379,-87.658431,"(41.744378879, -87.658430635)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,10225977,HY413363,09/06/2015 11:20:00 PM,080XX S MARQUETTE AVE,0486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE PORCH/HALLWAY,False,True,422,4,7,46,08B,1195571.0,1852233.0,2015,02/10/2018 03:50:01 PM,41.749426,-87.558916,"(41.749426458, -87.558916107)"
996,10225978,HY413353,09/06/2015 10:22:00 PM,061XX W MONTROSE AVE,0560,ASSAULT,SIMPLE,RESIDENCE PORCH/HALLWAY,True,False,1622,16,38,15,08A,1134809.0,1928614.0,2015,02/10/2018 03:50:01 PM,41.960312,-87.779771,"(41.960312361, -87.779771036)"
997,10225979,HY413331,09/06/2015 10:36:00 PM,012XX S STATE ST,0486,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,True,131,1,2,33,08B,1176530.0,1894821.0,2015,02/10/2018 03:50:01 PM,41.866741,-87.627407,"(41.866740943, -87.627407483)"
998,10225980,HY434905,09/06/2015 08:49:00 PM,005XX S CENTRAL AVE,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,STREET,True,False,1522,15,29,25,18,1139141.0,1897027.0,2015,02/10/2018 03:50:01 PM,41.873556,-87.764614,"(41.873556181, -87.764614362)"


## LA Present

In [20]:
pd.set_option('display.max_columns', None)
df_pd

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,Mocodes,Vict Age,Vict Sex,Vict Descent,Premis Cd,Premis Desc,Weapon Used Cd,Weapon Desc,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,10304468,01/08/2020 12:00:00 AM,01/08/2020 12:00:00 AM,2230,3,Southwest,377,2,624,BATTERY - SIMPLE ASSAULT,0444 0913,36,F,B,501,SINGLE FAMILY DWELLING,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,624,,,,1100 W 39TH PL,,34.0141,-118.2978
1,190101086,01/02/2020 12:00:00 AM,01/01/2020 12:00:00 AM,330,1,Central,163,2,624,BATTERY - SIMPLE ASSAULT,0416 1822 1414,25,M,H,102,SIDEWALK,500.0,UNKNOWN WEAPON/OTHER WEAPON,IC,Invest Cont,624,,,,700 S HILL ST,,34.0459,-118.2545
2,200110444,04/14/2020 12:00:00 AM,02/13/2020 12:00:00 AM,1200,1,Central,155,2,845,SEX OFFENDER REGISTRANT OUT OF COMPLIANCE,1501,0,X,X,726,POLICE FACILITY,,,AA,Adult Arrest,845,,,,200 E 6TH ST,,34.0448,-118.2474
3,191501505,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,1730,15,N Hollywood,1543,2,745,VANDALISM - MISDEAMEANOR ($399 OR UNDER),0329 1402,76,F,W,502,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,745,998.0,,,5400 CORTEEN PL,,34.1685,-118.4019
4,191921269,01/01/2020 12:00:00 AM,01/01/2020 12:00:00 AM,415,19,Mission,1998,2,740,"VANDALISM - FELONY ($400 & OVER, ALL CHURCH VA...",0329,31,X,X,409,BEAUTY SUPPLY STORE,,,IC,Invest Cont,740,,,,14400 TITUS ST,,34.2198,-118.4468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,200105931,01/28/2020 12:00:00 AM,01/28/2020 12:00:00 AM,10,1,Central,152,1,230,"ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT",0305 0431 1822 0334,32,M,W,101,STREET,512.0,MACE/PEPPER SPRAY,IC,Invest Cont,230,,,,7TH,FLOWER,34.0491,-118.2593
996,200105935,01/29/2020 12:00:00 AM,01/28/2020 12:00:00 AM,600,1,Central,124,1,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),,0,,,108,PARKING LOT,,,IC,Invest Cont,420,,,,200 N MAIN ST,,34.0532,-118.2425
997,200105938,01/28/2020 12:00:00 AM,01/27/2020 12:00:00 AM,1055,1,Central,111,2,956,"LETTERS, LEWD - TELEPHONE CALLS, LEWD",1906 0337,64,F,O,502,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,956,,,,600 ALPINE ST,,34.0632,-118.2408
998,200105943,01/29/2020 12:00:00 AM,01/29/2020 12:00:00 AM,730,1,Central,157,1,236,INTIMATE PARTNER - AGGRAVATED ASSAULT,1414 0907 1402 2004 1218 0416 0401 2000 0913,25,F,H,101,STREET,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AA,Adult Arrest,236,998.0,,,600 S SAN PEDRO ST,,34.0423,-118.2452


## Austin 2015

In [15]:
pd.set_option('display.max_columns', None)
df_pd

Unnamed: 0,GO Primary Key,Council District,GO Highest Offense Desc,Highest NIBRS/UCR Offense Description,GO Report Date,GO Location,Clearance Status,Clearance Date,GO District,GO Location Zip,GO Census Tract,GO X Coordinate,GO Y Coordinate
0,201510782,4.0,AGG ROBBERY/DEADLY WEAPON,Robbery,1-Jan-15,9001 N IH 35 SVRD NB ...,N,28-Jan-15,E,78753.0,18.13,3130483.0,10102366.0
1,201511231,4.0,ROBBERY BY ASSAULT,Robbery,1-Jan-15,919 E KOENIG LN SVRD EB ...,N,13-Jan-15,I,78751.0,21.05,3124730.0,10090296.0
2,201511736,1.0,BURGLARY OF RESIDENCE,Burglary,1-Jan-15,12151 N IH 35 SVRD NB ...,N,13-Jan-15,E,78753.0,18.35,3135985.0,10117220.0
3,201511433,4.0,BURGLARY OF RESIDENCE,Burglary,1-Jan-15,1044 NORWOOD PARK BLVD ...,N,5-Jan-15,I,78753.0,18.13,3129896.0,10096032.0
4,201511936,2.0,BURGLARY OF RESIDENCE,Burglary,1-Jan-15,2413 BITTER CREEK DR ...,N,7-Jan-15,F,78744.0,24.27,3110455.0,10039340.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,20155001322,7.0,THEFT,Theft,10-Jan-15,13801 N MOPAC EXPY NB ...,N,13-Jan-15,A,78727.0,18.46,3127003.0,10133822.0
996,2015101186,3.0,THEFT,Theft,10-Jan-15,500 W STASSNEY LN ...,N,21-Jan-15,D,78745.0,24.02,3103087.0,10048581.0
997,20155001334,10.0,THEFT,Theft,10-Jan-15,12424 RESEARCH BLVD SB ...,N,14-Jan-15,A,78759.0,25.00,3108169.0,10128787.0
998,2015100646,4.0,THEFT,Theft,10-Jan-15,1030 NORWOOD PARK BLVD ...,C,19-Jan-15,I,78753.0,18.13,3129997.0,10096983.0


In [23]:
cols = ['ID', 'Case Number', 'Date', 'Block', 'IUCR', 'Primary Type',
    'Description', 'Arrest', 'Domestic', 'Beat',
    'District', 'Ward', 'Community Area',
    'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year',
    'Updated On', 'Latitude', 'Longitude']
df_pd = pd.read_csv('rows.csv?accessType=DOWNLOAD', nrows=1000, usecols=cols)

### initial commands for all (1 cell each)
Commands:
```
df_pd = pd.read_csv('rows.csv?accessType=DOWNLOAD.1', nrows=1000)
df_pd.columns
```
get output, then:
```
spark.createDataFrame(df_pd).schema
```
if with error, then:
```
cols = <PASTE COLUMN LIST HERE, REMOVE PROBLEMATIC COL>

df_pd = pd.read_csv('<EDIT FILENAME>', nrows=1000, usecols=cols)
```

### for Chicago because of `TypeError: Can not merge type (pandas string to spark double) for 'Location Description', 'location' fields`
```
cols = ['Case Number', 'Date', 'Block', 'IUCR', 'Primary Type', 'Description', 'Arrest', 'Domestic', 'Beat', 'Ward', 'FBI Code', 'X Coordinate', 'Y Coordinate', 'Year', 'Latitude', 'Longitude']
```

### for Los Angeles because of `TypeError: Can not merge type (pandas string/double to spark double/string) for 'Cross Street', 'Weapon Desc', 'Mocodes', 'Vict Sex', 'Vict Descent'`
Commands:
```
cols = ['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA ', 'AREA NAME', 'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Vict Age', 'Premis Cd', 'Premis Desc', 'Weapon Used Cd', 'Status', 'Status Desc', 'Crm Cd 1', 'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'LAT', 'LON']
```

### for Austin because of `TypeError: Can not merge type (pandas string to spark double) for 'Clearance Status', 'Clearance Date', 'GO Location' fields`
Commands:
```
cols = ['GO Primary Key', 'Council District', 'GO Highest Offense Desc', 'Highest NIBRS/UCR Offense Description', 'GO Report Date', 'GO District', 'GO Location Zip', 'GO Census Tract', 'GO X Coordinate', 'GO Y Coordinate']
```

In [15]:
schema_template = types.StructType([               # modified pattern from pandas schema
    types.StructField('ID', types.IntegerType(), True),
    types.StructField('Case Number', types.StringType(), True),
    types.StructField('Date', types.StringType(), True),
    types.StructField('Block', types.StringType(), True),
    types.StructField('IUCR', types.StringType(), True),
    types.StructField('Primary Type', types.StringType(), True),
    types.StructField('Description', types.StringType(), True),
    types.StructField('Location Description', types.StringType(), True),
    types.StructField('Arrest', types.BooleanType(), True),
    types.StructField('Domestic', types.BooleanType(), True),
    types.StructField('Beat', types.StringType(), True),
    types.StructField('District', types.StringType(), True),
    types.StructField('Ward', types.IntegerType(), True),
    types.StructField('Community Area', types.IntegerType(), True),
    types.StructField('FBI Code', types.StringType(), True),
    types.StructField('X Coordinate', types.FloatType(), True),
    types.StructField('Y Coordinate', types.FloatType(), True),
    types.StructField('Year', types.IntegerType(), True),
    types.StructField('Updated On', types.StringType(), True),
    types.StructField('Latitude', types.FloatType(), True),
    types.StructField('Longitude', types.FloatType(), True),
    types.StructField('Location', types.StringType(), True)
])

### modify schema output above and removed columns, based on sample output before, then add template below

### Replace below with me:
```
df_csv = spark.read \
    .option("header", "true") \
    .schema(schema_template) \
    .csv(f'{gcs_bkt}/raw/{city}/{fname}')
```

In [32]:
df_csv = spark.read \
    .option("header", "true") \
    .schema(schema_template) \
    .csv('chicago2020.csv')

In [33]:
print(conf.toDebugString())

spark.master=spark://city-crimes-spark-1:7077
spark.app.name=proj_file_read
spark.jars=/opt/hadoop-lib/gcs-connector-hadoop3-latest.jar


In [34]:
sc.getConf().getAll()

[('spark.driver.extraJavaOptions',
  '-XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED'),
 ('spark.app.submitTime', '1668162780443'),
 ('spark.jars', '/opt/hadoop-lib/gcs-connector-hadoop3-latest.jar'),
 ('spark.app.name', 'proj_file_read'),
 ('spark.executor.id', 'driver'),
 ('spark.master', 'spark://city-crimes-spark-1:

In [37]:
p_func = lambda s: pdl.from_format(s, dict_city['date_format'])
parse_dt_udf = F.udf(p_func, returnType=types.TimestampType())

df_csv \
    .withColumn('Timestamp', parse_dt_udf(F.col(dict_city['date_string_col']))) \
    .select('Timestamp').show()

22/11/11 10:52:59 WARN TaskSetManager: Lost task 0.0 in stage 6.0 (TID 12) (172.18.0.5 executor 0): java.io.FileNotFoundException: 
File file:/home/spark-user/files/chicago2020.csv does not exist

It is possible the underlying files have been updated. You can explicitly invalidate
the cache in Spark by running 'REFRESH TABLE tableName' command in SQL or by
recreating the Dataset/DataFrame involved.
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.readCurrentFileNotFoundError(QueryExecutionErrors.scala:661)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.org$apache$spark$sql$execution$datasources$FileScanRDD$$anon$$readCurrentFile(FileScanRDD.scala:212)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.nextIterator(FileScanRDD.scala:270)
	at org.apache.spark.sql.execution.datasources.FileScanRDD$$anon$1.hasNext(FileScanRDD.scala:116)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$10.

[Stage 6:>                                                          (0 + 1) / 1]

+-------------------+
|          Timestamp|
+-------------------+
|2020-03-17 21:30:00|
|2020-03-18 02:03:00|
|2020-03-18 08:50:00|
|2020-03-18 13:00:00|
|2020-03-18 17:35:00|
|2020-03-16 00:05:00|
|2020-03-18 23:15:00|
|2020-03-18 18:00:00|
|2020-03-18 14:04:00|
|2020-03-18 21:27:00|
|2020-03-18 07:30:00|
|2020-03-14 14:45:00|
|2020-03-18 07:18:00|
|2020-03-18 11:30:00|
|2020-03-16 15:20:00|
|2020-03-18 13:14:00|
|2020-03-17 21:00:00|
|2020-03-13 15:00:00|
|2020-03-18 03:30:00|
|2020-02-20 12:10:00|
+-------------------+
only showing top 20 rows



                                                                                

In [43]:
df_csv.head(10)

[Row(ID=10224738, Case Number='HY411648', Date='09/05/2015 01:30:00 PM', Block='043XX S WOOD ST', IUCR='0486', Primary Type='BATTERY', Description='DOMESTIC BATTERY SIMPLE', Location Description='RESIDENCE', Arrest=False, Domestic=True, Beat='0924', District='009', Ward=12, Community Area=61, FBI Code='08B', X Coordinate=1165074.0, Y Coordinate=1875917.0, Year=2015, Updated On='02/10/2018 03:50:01 PM', Latitude=41.81511688232422, Longitude=-87.66999816894531, Location='(41.815117282, -87.669999562)'),
 Row(ID=10224739, Case Number='HY411615', Date='09/04/2015 11:30:00 AM', Block='008XX N CENTRAL AVE', IUCR='0870', Primary Type='THEFT', Description='POCKET-PICKING', Location Description='CTA BUS', Arrest=False, Domestic=False, Beat='1511', District='015', Ward=29, Community Area=25, FBI Code='06', X Coordinate=1138875.0, Y Coordinate=1904869.0, Year=2015, Updated On='02/10/2018 03:50:01 PM', Latitude=41.89508056640625, Longitude=-87.7654037475586, Location='(41.895080471, -87.765400451)

In [59]:
df_csv.count()

                                                                                

2119797

### check count here: original df
Command:
`df_csv.count()`

Output:
`485853`

### inspect data
Command:
```
df_csv.head(10)
```

In [65]:
def parse_dt(dt_str):
    """
    parse datetime object from given date string of specific format
    """
    return pdl.from_format(dt_str, dict_city['date_format'])

parse_dt_udf = F.udf(parse_dt, returnType=types.TimestampType())

In [66]:
# parse datetime out of provided date column
df_time = df_csv.withColumn('Timestamp', parse_dt_udf(F.col(dict_city['date_string_col'])))

if dict_city['with_year_col']:
    years_rows = df_time \
        .select('Year')
else:
    years_rows = df_time \
        .select(F.year('Timestamp').alias('Year'))

In [67]:
df_time.head(10)

                                                                                

[Row(DR_NO=1307355, Date Rptd='02/20/2010 12:00:00 AM', DATE OCC='02/20/2010 12:00:00 AM', TIME OCC=1350, AREA =13, AREA NAME='Newton', Rpt Dist No=1385, Part 1-2=2, Crm Cd=900, Crm Cd Desc='VIOLATION OF COURT ORDER', Mocodes='0913 1814 2000', Vict Age=48, Vict Sex='M', Vict Descent='H', Premis Cd=501, Premis Desc='SINGLE FAMILY DWELLING', Weapon Used Cd=None, Weapon Desc=None, Status='AA', Status Desc='Adult Arrest', Crm Cd 1=900, Crm Cd 2=None, Crm Cd 3=None, Crm Cd 4=None, LOCATION='300 E  GAGE                         AV', Cross Street=None, LAT=33.98249816894531, LON=-118.26950073242188, Timestamp=datetime.datetime(2010, 2, 20, 0, 0)),
 Row(DR_NO=11401303, Date Rptd='09/13/2010 12:00:00 AM', DATE OCC='09/12/2010 12:00:00 AM', TIME OCC=45, AREA =14, AREA NAME='Pacific', Rpt Dist No=1485, Part 1-2=2, Crm Cd=740, Crm Cd Desc='VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)', Mocodes='0329', Vict Age=0, Vict Sex='M', Vict Descent='W', Premis Cd=101, Premis Desc='STREET', Weapon

In [68]:
years_rows = years_rows \
    .dropna() \
    .dropDuplicates(['Year']) \
    .collect()

years = [row.Year for row in years_rows]
years.sort()

                                                                                

In [69]:
years

[2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]

### check parsed years
Command:
`print(years)`

Output:
`[2001]`

In [70]:
df_test = df_time \
    .repartition(24) \
    .filter(F.month('Timestamp') == 1)

df_test.count()

                                                                                

183674

### check count Jan: csv df, strdate col
Command:
```
df_test = df_time \
    .filter(F.month('Timestamp') == 1)

df_test.count()
```
chicago: `38114` san_francisco: `189723` los_angeles: `183674` austin: `3098`

### check date Jan1: csv df, strdate col
Command:
```
df_test \
    .filter(F.dayofmonth('Timestamp') == 1) \
    .count()
```
chicago: `1825` san_francisco: ` ` los_angeles: ` ` austin: `97`

In [76]:
o_cols = df_time.columns
cols = [col.lower().replace(' ', '_') for col in o_cols]

for year in years:
    df = df_time.filter(F.year('Timestamp') == year)
    for month in range(1, 13):
        df_month = df.filter(F.month('Timestamp') == month)
        for i in range(len(o_cols)):
            df_month = df_month.withColumnRenamed(o_cols[i], cols[i])
        if dict_city['partitions'] > 1:
            df_month = df_month.repartition(dict_city['partitions'])
        df_month \
            .drop('Timestamp', dt_str_col[city]) \
            .write.parquet(f'{gcs_bkt}/pq/{city}/{year}/{month}', mode='overwrite')

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 2) / 4]
Traceback (most recent call last):
  File "/opt/spark-3.3.0-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark-3.3.0-bin-hadoop3/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [80]:
sc.stop()