In [21]:
import pandas as pd
import json
import mysql.connector
import csv

In [4]:
csv_file = "data/Crimes_2001_to_Present.csv"
df = pd.read_csv(csv_file)

In [5]:
df.shape

(7846809, 22)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7846809 entries, 0 to 7846808
Data columns (total 23 columns):
 #   Column                Dtype         
---  ------                -----         
 0   ID                    int64         
 1   Case Number           object        
 2   Date                  object        
 3   Time                  object        
 4   Block                 object        
 5   IUCR                  object        
 6   Primary Type          object        
 7   Description           object        
 8   Location Description  object        
 9   Arrest                bool          
 10  Domestic              bool          
 11  Beat                  int64         
 12  District              float64       
 13  Ward                  float64       
 14  Community Area        float64       
 15  FBI Code              object        
 16  X Coordinate          float64       
 17  Y Coordinate          float64       
 18  Year                  int64         
 19  

In [14]:
df.isnull().sum()

ID                          0
Case Number                 4
Date                        0
Time                        0
Block                       0
IUCR                        0
Primary Type                0
Description                 0
Location Description        0
Arrest                      0
Domestic                    0
Beat                        0
District                   47
Ward                        0
Community Area              0
FBI Code                    0
X Coordinate                0
Y Coordinate                0
Year                        0
Updated On                  0
Latitude                88111
Longitude               88111
Location                88111
dtype: int64

## Reapplying transformations

In [6]:
# turning date column into datetime to extract the time and create the time column
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
df['Time'] = df['Date'].apply(lambda x: x.time())
move_time = df.pop('Time')
df.insert(3,'Time', move_time)

In [7]:
# changing updated_on format
df['Updated On'] = pd.to_datetime(df['Updated On'], format='%m/%d/%Y %I:%M:%S %p')

In [8]:
# taking just the date part, converting into dtype object
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p').dt.date

In [15]:
# replacing nulls
df = df.fillna({'Case Number': 'null'})
df = df.fillna({'District': 0, 'LOngitude': 0,'Latitude': 0})


In [16]:
cols = ['District', 'Arrest', 'Domestic', 'Ward', 'Community Area', 'X Coordinate', 'Y Coordinate']
df[cols] = df[cols].astype(int)

In [None]:
df.head()

In [17]:
df.columns = ["id","case_number","date", "time", "block","iucr","primary_type","description","location_desc","arrest","domestic","beat","district","ward", "community_area", "fbi_code", "x_coord",
       "y_coord","year","updated_on","latitude","longitude", "location"]
df.columns

Index(['id', 'case_number', 'date', 'time', 'block', 'iucr', 'primary_type',
       'description', 'location_desc', 'arrest', 'domestic', 'beat',
       'district', 'ward', 'community_area', 'fbi_code', 'x_coord', 'y_coord',
       'year', 'updated_on', 'latitude', 'longitude', 'location'],
      dtype='object')

In [18]:
not_null_mask = ~df['latitude'].isnull() & ~df['longitude'].isnull()

# Apply the transformation only for non-null values
df.loc[not_null_mask, 'location'] = df[not_null_mask].apply(lambda row: f"POINT({row['latitude']} {row['longitude']})", axis=1)

print(df)

               id case_number        date      time                   block  \
0        11646166    JC213529  2018-09-01  00:01:00   082XX S INGLESIDE AVE   
1        11645836    JC212333  2016-05-01  00:25:00     055XX S ROCKWELL ST   
2        11449702    JB373031  2018-07-31  13:30:00  009XX E HYDE PARK BLVD   
3        11643334    JC209972  2018-12-19  16:30:00  056XX W WELLINGTON AVE   
4        11645527    JC212744  2015-02-02  10:00:00      069XX W ARCHER AVE   
...           ...         ...         ...       ...                     ...   
7846804  13128007    JG325985  2023-06-21  20:00:00  031XX N CALIFORNIA AVE   
7846805  13129172    JG327619  2023-06-20  04:00:00   028XX N MAPLEWOOD AVE   
7846806  13128066    JG325838  2023-06-06  15:42:00    018XX N LOCKWOOD AVE   
7846807  13128324    JG326502  2023-05-13  12:00:00       020XX W CERMAK RD   
7846808  13128375    JG326564  2023-06-24  13:29:00    069XX N HAMILTON AVE   

         iucr                primary_type  \
0     

In [19]:
df.info()
df.to_csv('data/data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7846809 entries, 0 to 7846808
Data columns (total 23 columns):
 #   Column          Dtype         
---  ------          -----         
 0   id              int64         
 1   case_number     object        
 2   date            object        
 3   time            object        
 4   block           object        
 5   iucr            object        
 6   primary_type    object        
 7   description     object        
 8   location_desc   object        
 9   arrest          int32         
 10  domestic        int32         
 11  beat            int64         
 12  district        int32         
 13  ward            int32         
 14  community_area  int32         
 15  fbi_code        object        
 16  x_coord         int32         
 17  y_coord         int32         
 18  year            int64         
 19  updated_on      datetime64[ns]
 20  latitude        float64       
 21  longitude       float64       
 22  location        ob

In [3]:
with open('config_db.json') as config_json:
    config = json.load(config_json)

conx = mysql.connector.connect(**config) 

In [20]:
mycursor = conx.cursor()

mycursor.execute("""CREATE TABLE IF NOT EXISTS crimes (
                 id int PRIMARY KEY,
                 case_number VARCHAR(10),
                 date date, 
                 time time, 
                 block VARCHAR(150), 
                 iucr VARCHAR(10), 
                 primary_type VARCHAR(150), 
                 description VARCHAR(150), 
                 location_desc VARCHAR(150), 
                 arrest boolean, 
                 domestic boolean, 
                 beat int, 
                 district int, 
                 ward int, 
                 community_area int, 
                 fbi_code VARCHAR(5), 
                 x_coord VARCHAR(15), 
                 y_coord VARCHAR(15), 
                 year int, 
                 updated_on datetime, 
                 latitude float, 
                 longitude float, 
                 location point)""")

query = "DESCRIBE crimes"
mycursor.execute(query)

description_table = mycursor.fetchall()

mycursor.close()

# we have seen the creation of the table 
desc_table=pd.DataFrame(description_table, columns=['Field', 'Type', 'Null', 'Key', 'Default', 'Extra'])
print(desc_table)

mycursor.close()

            Field             Type Null  Key Default Extra
0              id           b'int'   NO  PRI    None      
1            date          b'date'  YES         None      
2            time          b'time'  YES         None      
3           block  b'varchar(150)'  YES         None      
4            iucr   b'varchar(10)'  YES         None      
5    primary_type  b'varchar(150)'  YES         None      
6     description  b'varchar(150)'  YES         None      
7   location_desc  b'varchar(150)'  YES         None      
8          arrest    b'tinyint(1)'  YES         None      
9        district           b'int'  YES         None      
10           year           b'int'  YES         None      
11     updated_on      b'datetime'  YES         None      
12       location         b'point'  YES         None      


False

In [23]:
df.head()

Unnamed: 0,id,case_number,date,time,block,iucr,primary_type,description,location_desc,arrest,...,ward,community_area,fbi_code,x_coord,y_coord,year,updated_on,latitude,longitude,location
0,11646166,JC213529,2018-09-01,00:01:00,082XX S INGLESIDE AVE,810,THEFT,OVER $500,RESIDENCE,0,...,8,44,6,0,0,2018,2019-04-06 16:04:43,0.0,,
1,11645836,JC212333,2016-05-01,00:25:00,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,0,...,15,63,11,0,0,2016,2019-04-06 16:04:43,0.0,,
2,11449702,JB373031,2018-07-31,13:30:00,009XX E HYDE PARK BLVD,2024,NARCOTICS,POSS: HEROIN(WHITE),STREET,1,...,5,41,18,0,0,2018,2019-04-09 16:24:58,0.0,,
3,11643334,JC209972,2018-12-19,16:30:00,056XX W WELLINGTON AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,0,...,31,19,14,0,0,2018,2019-04-04 16:16:11,0.0,,
4,11645527,JC212744,2015-02-02,10:00:00,069XX W ARCHER AVE,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,OTHER,0,...,23,56,11,0,0,2015,2019-04-06 16:04:43,0.0,,


In [None]:
mycursor = conx.cursor()

query2 = """INSERT INTO crimes (id,case_number,date, time, block, iucr, primary_type, description, location_desc, arrest, domestic, beat, district, ward, community_area, fbi_code, x_coord, y_coord, year, updated_on, latitude, longitude, location) 
VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,ST_GeomFromText(%s))"""


with open('data/data.csv', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader) # ignore column names
        for row in reader:
            datos= (row[0], row[1], row[2], row[3],
                                  row[4], row[5], row[6], row[7],
                                  row[8], row[9], row[10], row[11],
                                  row[12], row[13], row[14], row[15], row[16], row[17],
                                  row[18], row[19], row[20], row[21], row[22])
            mycursor.execute(query2, datos)
    
conx.commit()

mycursor.close()