# PREPARE DATA

Load and concatenate dataframes from individual months

## Prepare

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

In [2]:
# Display settings: never truncate columns when rendering wide frames.
# The row limit is left at the pandas default (uncomment to lift it as well).
pd.set_option("display.max_columns", None)
#pd.set_option('display.max_rows', None)

In [3]:
# ---- Run configuration ----

# Prints to check everything is ok
DEBUG_MODE = True

# NOTE(review): not referenced in the cells shown here -- presumably meant to
# toggle the NaN-removal step; confirm before relying on it.
DROPNANS = True

# Incident type; also the name of the folder under ../data that holds the
# monthly spreadsheets.
INCIDENT = 'roubo_celular'

# TODO: inconsistencies found in temporal data, so the full set of temporal
# columns is disabled for now:
#BASE_COLS = ['DATAOCORRENCIA', 'HORAOCORRENCIA', 'PERIDOOCORRENCIA',]
# QUICKFIX: use first day of month as incident date
BASE_COLS = ['PERIDOOCORRENCIA']

# Columns kept for the neighborhood dataframe
COLS_HOOD = [*BASE_COLS, 'BAIRRO']

# Columns kept for the localizations dataframe
COLS_LOC = [*BASE_COLS, 'LATITUDE', 'LONGITUDE']

In [4]:
# File stems to load: the twelve months of 2023 followed by January 2024.
periods = [f"2023_{month:02d}" for month in range(1, 13)]
periods.append("2024_01")
# For a quick single-month run, replace the list above with:
#periods = [f'2023_{i:02d}' for i in range(1, 2)] # + ['2024_01']
periods

['2023_01',
 '2023_02',
 '2023_03',
 '2023_04',
 '2023_05',
 '2023_06',
 '2023_07',
 '2023_08',
 '2023_09',
 '2023_10',
 '2023_11',
 '2023_12',
 '2024_01']

In [5]:
# Just for reference: load one month in full so its columns can be inspected.
reference_file = f"../data/{INCIDENT}/2024_01.xlsx"
dfx = pd.read_excel(reference_file)

In [6]:
# List every column available in the reference spreadsheet.
dfx.columns

Index(['ANO_BO', 'NUM_BO', 'NUMERO_BOLETIM', 'BO_INICIADO', 'BO_EMITIDO',
       'DATAOCORRENCIA', 'HORAOCORRENCIA', 'PERIDOOCORRENCIA',
       'DATACOMUNICACAO', 'DATAELABORACAO', 'BO_AUTORIA', 'FLAGRANTE',
       'NUMERO_BOLETIM_PRINCIPAL', 'LOGRADOURO', 'NUMERO', 'BAIRRO', 'CIDADE',
       'UF', 'LATITUDE', 'LONGITUDE', 'DESCRICAOLOCAL', 'EXAME', 'SOLUCAO',
       'DELEGACIA_NOME', 'DELEGACIA_CIRCUNSCRICAO', 'ESPECIE', 'RUBRICA',
       'DESDOBRAMENTO', 'STATUS', 'TIPOPESSOA', 'VITIMAFATAL', 'NATURALIDADE',
       'NACIONALIDADE', 'SEXO', 'DATANASCIMENTO', 'IDADE', 'ESTADOCIVIL',
       'PROFISSAO', 'GRAUINSTRUCAO', 'CORCUTIS', 'NATUREZAVINCULADA',
       'TIPOVINCULO', 'RELACIONAMENTO', 'PARENTESCO', 'PLACA_VEICULO',
       'UF_VEICULO', 'CIDADE_VEICULO', 'DESCR_COR_VEICULO',
       'DESCR_MARCA_VEICULO', 'ANO_FABRICACAO', 'ANO_MODELO',
       'DESCR_TIPO_VEICULO', 'QUANT_CELULAR', 'MARCA_CELULAR'],
      dtype='object')

In [7]:
# Build the neighborhood (df_hood) and location (df_loc) tables from the
# monthly spreadsheets, keeping only rows whose CIDADE is "S.PAULO".

# Date-derived columns added to every monthly frame (loop-invariant).
COLS_DT = ["period", "year", "month", "date"]

hood_parts = []
loc_parts = []
for period in periods:
    df = pd.read_excel(f"../data/{INCIDENT}/{period}.xlsx")

    # filter capital; .copy() gives an independent frame so the column
    # assignments below do not trigger SettingWithCopyWarning
    df = df[df["CIDADE"] == "S.PAULO"].copy()

    # standardize neighborhood
    df["BAIRRO"] = df["BAIRRO"].str.upper()

    ## TODO: inconsistencies found in temporal data
    #df['DATAOCORRENCIA'] = pd.to_datetime(df['DATAOCORRENCIA'], format='%d/%m/%Y')
    #if DEBUG_MODE:
    #    print(
    #        f"{period}\t{df['DATAOCORRENCIA'].min()}\t{df['DATAOCORRENCIA'].max()}"
    #    )
    # QUICKFIX: use first day of month as incident date
    year, month = [int(s) for s in period.split(sep='_')]
    df["period"] = period
    df["year"] = year
    df["month"] = month
    df["date"] = pd.Timestamp(year=year, month=month, day=1)

    # Collect the per-month slices; they are concatenated once after the loop
    # instead of re-copying the accumulated frame on every iteration.
    hood_parts.append(df[COLS_HOOD + COLS_DT])
    loc_parts.append(df[COLS_LOC + COLS_DT])

    print(f"Period {period} done")

# Row labels are kept as read from each file (no ignore_index), matching the
# previous incremental concat. With an empty `periods` list the frames stay
# None, as before.
df_hood = pd.concat(hood_parts) if hood_parts else None
df_loc = pd.concat(loc_parts) if loc_parts else None

Period 2023_01 done
Period 2023_02 done
Period 2023_03 done
Period 2023_04 done
Period 2023_05 done
Period 2023_06 done
Period 2023_07 done
Period 2023_08 done
Period 2023_09 done
Period 2023_10 done
Period 2023_11 done
Period 2023_12 done
Period 2024_01 done


Remove NaNs,

In [8]:
# Number of rows per neighborhood name, most frequent first.
df_hood["BAIRRO"].value_counts()

BAIRRO
REPUBLICA               2970
CAPAO REDONDO           2967
PINHEIROS               2445
ITAIM BIBI              2369
CAMPO LIMPO             2160
                        ... 
JARDIM COTIANA             1
VILA ARAPUA                1
JARDIM MORADA DO SOL       1
PARQUE SAO LUCAS           1
CIDADE DE DEUS             1
Name: count, Length: 1772, dtype: int64

In [9]:
# Shapes before the cleanup
print(df_hood.shape, df_loc.shape)

# Neighborhood table: drop rows without a neighborhood name.
df_hood = df_hood.dropna(subset=["BAIRRO"])

# Location table: drop rows missing either coordinate.
df_loc = df_loc.dropna(subset=["LONGITUDE", "LATITUDE"])

# Shapes after the cleanup
print(df_hood.shape, df_loc.shape)

(136917, 6) (136917, 7)
(136158, 6) (121930, 7)


In [None]:
# Rows per month in the neighborhood table (non-null `period` values per `date`).
df_hood.groupby("date")["period"].count()

In [None]:
# Rows per month in the location table (non-null `period` values per `date`).
df_loc.groupby("date")["period"].count()

## Problem with `df_hood`: too much granularity

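Administrative hierarchy, from coarsest to finest: Municipio -> Prefeituras Regionais -> Distritos -> Bairros. `df_hood` is keyed by "BAIRRO", the finest level.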

In [None]:
# Rows per neighborhood name; shows how many distinct labels BAIRRO holds.
df_hood["BAIRRO"].value_counts()

Solutions:
- Map "BAIRRO" column to new "DISTRITO" column (hard).
- Use `df_loc` to intersect (lat, lon) with available district areas.

# Continue with `df_loc`

Fix types in location columns,

In [11]:
# Column dtypes of the location table.
df_loc.dtypes

PERIDOOCORRENCIA            object
LATITUDE                    object
LONGITUDE                  float64
period                      object
year                         int64
month                        int64
date                datetime64[us]
dtype: object

In [12]:
# Tally the Python type of each LATITUDE value to see which types are mixed in the column.
df_loc["LATITUDE"].map(type).value_counts()

LATITUDE
<class 'float'>    102319
<class 'int'>       19610
<class 'str'>           1
Name: count, dtype: int64

Safe way: keep only the rows whose `LATITUDE` value is a float,

In [None]:
print(df_loc.shape)
# Keep only the rows whose LATITUDE value is a Python float; values of any
# other type (e.g. int or str) are discarded.
df_loc = df_loc[df_loc["LATITUDE"].map(type) == float]
# Filtering alone leaves the column as object dtype; cast it so LATITUDE is
# numeric (float64).
df_loc = df_loc.astype({"LATITUDE": "float64"})
print(df_loc.shape)

Add geometry column,

In [None]:
# Number of distinct neighborhood names in df_hood.
df_hood["BAIRRO"].nunique()

In [None]:
# Build the GeoDataFrame here: `gdf_loc` is not defined by any earlier cell
# shown in this notebook, so the bare plot call fails on a fresh kernel.
# Points are (x=LONGITUDE, y=LATITUDE).
# NOTE(review): no CRS is set; the coordinates are presumably WGS84 degrees
# (EPSG:4326) -- confirm before doing any spatial join.
gdf_loc = gpd.GeoDataFrame(
    df_loc.copy(),
    geometry=gpd.points_from_xy(df_loc["LONGITUDE"], df_loc["LATITUDE"]),
)
gdf_loc.plot()

In [None]:
# Show both coordinate columns side by side (the second column was a
# duplicated "LATITUDE").
df_loc[["LATITUDE", "LONGITUDE"]]

In [None]:
# MARTELADA (Portuguese: "hammer blow", i.e. a brute-force hack) -- disabled.
# The commented filters keep only coordinates strictly between -1e14 and -3e13.

## x
#df_loc = df_loc[(df_loc["LONGITUDE"] > -1e14) & (df_loc["LONGITUDE"] < -3e13)]
#
## y
#df_loc = df_loc[(df_loc["LATITUDE"] > -1e14) & (df_loc["LATITUDE"] < -3e13)]
##df_loc = df_loc[df_loc["LATITUDE"] > -1e14]

In [None]:
# Display the location table in its current state.
df_loc