# 311 Cases

### References:
https://data.sfgov.org/City-Infrastructure/311-Cases/vw6y-z8j6


In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from PIL import Image

from sklearn.model_selection import ShuffleSplit
% matplotlib inline

In [None]:
# Load the raw 311 cases export (CSV in the working directory) into `df`.
df = pd.read_csv('311_Cases.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Parse the 'Opened' column as datetimes, then derive the calendar year and
# month of each case from the parsed values.
opened_ts = pd.to_datetime(df['Opened'])
df['Opened'] = opened_ts
df['year'] = opened_ts.dt.year
df['month'] = opened_ts.dt.month

In [None]:
# Normalise column labels: lower-case, with spaces replaced by underscores.
columns = [label.lower().replace(' ', '_') for label in df.columns]
df.columns = columns

In [None]:
# Limit to cases that have a media URL and were opened in 2017 or 2018.
has_media = df['media_url'].notna()
in_window = df['year'].isin([2017, 2018])
df = df.loc[has_media & in_window]

In [None]:
# Count of cases per (year, month) in the filtered data.
by_month = df.groupby(['year', 'month'])
groups_m = by_month['caseid'].count()
groups_m

In [None]:
# Draw a reproducible random sample of 120,000 row positions.
ss = ShuffleSplit(n_splits=1, train_size=120000, random_state=0)

# n_splits=1, so the split generator yields exactly one (train, test) pair.
train_index, test_index = next(ss.split(df))

In [None]:
# Materialise the sampled rows (selected by position) as an independent deep
# copy, so later edits to df_subset do not affect df.
df_subset = df.iloc[train_index].copy(deep=True)

In [None]:
# Replace the inherited row labels with a fresh 0..n-1 index (old labels are
# discarded), then preview the result.
df_subset = df_subset.reset_index(drop=True)
df_subset.head()

In [None]:
# Persist the sampled rows (index included as the first CSV column) so they
# can be reloaded later without re-running the sampling.
df_subset.to_csv('selected_subset.csv')

In [None]:
# Count of cases per (year, month) in the sampled subset.
subset_by_month = df_subset.groupby(['year', 'month'])
groups_small = subset_by_month['caseid'].count()
print(groups_small)

In [2]:
# Size of thumbnail image
size = (128, 128)

def get_image(df, timeout=30):
    """Download an image and return it as a small grayscale array.

    The image is cropped to its central 80% (10% trimmed from each edge),
    converted to 8-bit grayscale (PIL mode 'L') and resized to the
    module-level ``size``.

    Parameters
    ----------
    df : str
        URL of the image to fetch. Despite the name, this is a URL and not
        a DataFrame or a row; the name is kept so existing calls still work.
    timeout : float, optional
        Seconds passed to ``requests.get`` as its timeout, so that a stalled
        server cannot hang a long batch download indefinitely.

    Returns
    -------
    numpy.ndarray or float
        The thumbnail as an array on success; ``np.nan`` if the download or
        the image decoding fails for any reason.
    """
    try:
        response = requests.get(df, stream=True, timeout=timeout)
    except Exception:
        # Deliberately best-effort: a bad URL or network error yields NaN.
        # `Exception` (not a bare except) lets KeyboardInterrupt through so
        # a long batch run can still be stopped by the user.
        return np.nan

    try:
        # Let urllib3 transparently undo gzip/deflate transfer encoding
        # before PIL reads from the raw stream.
        response.raw.decode_content = True
        with Image.open(response.raw) as image:
            width, height = image.size
            # Crop down to 80% of size
            cropped = image.crop(box=(width*.10, height*.10, width*.90, height*.90))
            # Convert grayscale, then shrink to the thumbnail size
            thumbnail = cropped.convert('L').resize(size)
            return np.array(thumbnail)
    except Exception:
        return np.nan
    finally:
        # Release the streamed connection whether or not decoding succeeded.
        response.close()

In [None]:
# Add an 'image' column filled with NaN as a placeholder.
df_subset['image'] = np.nan

In [None]:
# Fetch one sample thumbnail. get_image expects a URL, so pass the row's
# 'media_url' value rather than the whole row (a whole row always failed
# inside get_image and produced NaN).
image1 = get_image(df_subset['media_url'].iloc[4])

In [None]:
# Display the sample thumbnail using a grayscale colormap.
plt.imshow(image1, cmap='gray')

## Data Selection
- Due to computational constraints, only a random sample of 120,000 cases with images from 2017 and 2018 is used.

In [None]:
# Preview the sampled subset.
df_subset.head()

In [None]:
# Take the first 20 rows as an independent copy, so that assigning the image
# column neither writes through to df_subset nor triggers pandas'
# SettingWithCopyWarning.
df_mini = df_subset.iloc[0:20, :].copy()

# get_image expects a URL, so map it over the 'media_url' column instead of
# applying it to whole rows (which always failed and produced NaN).
df_mini['image'] = df_mini['media_url'].map(get_image)

df_mini.head()

In [None]:
# Confirm image size: shape of the array stored for the first row
# (get_image resizes thumbnails to `size`).
df_mini.iloc[0]['image'].shape

In [None]:
# Confirm an image: display the first row's thumbnail in grayscale.
plt.imshow(df_mini.iloc[0]['image'], cmap='gray')

### Check code before trying large scale

In [None]:
# Dry run of the chunked download-and-pickle pipeline on the 20-row sample.
chunksize = 10
folder = 'bw_pickles/'

# Make sure the output folder exists; otherwise every chunk would be
# downloaded and then fail to save.
if not os.path.isdir(folder):
    os.makedirs(folder)

for i in range(0, df_mini.shape[0], chunksize):
    # iloc slicing clamps at the end of the frame, so the last chunk may
    # simply be shorter.
    chunk = df_mini.iloc[i:i + chunksize].copy(deep=True)
    chunk['image'] = chunk['media_url'].map(get_image)
    filename = folder + 'bw_mini_' + str(i) + '.gzde'
    try:
        chunk.to_pickle(filename, compression='gzip')
    except Exception as err:
        # Best-effort: report which chunk failed and why, then carry on.
        print('chunk ' + str(i) + ' failed: ' + str(err))

In [None]:
# Read back one gzip-compressed chunk pickle (the chunk starting at row 10)
# to check that it round-trips.
df_test = pd.read_pickle('bw_pickles/bw_mini_10.gzde', compression='gzip')

In [None]:
# Preview the reloaded chunk.
df_test.head()

## Large scale 

In [3]:
# If restarting the kernel, read in the CSV containing just the selected
# rows instead of re-running the sampling cells. Not necessary if starting
# from the beginning. index_col=0 uses the first CSV column as the index.
df_subset = pd.read_csv('selected_subset.csv', index_col=0)

In [4]:
# Preview the reloaded subset.
df_subset.head()

Unnamed: 0,caseid,opened,closed,updated,status,status_notes,responsible_agency,category,request_type,request_details,...,supervisor_district,neighborhood,police_district,latitude,longitude,point,source,media_url,year,month
0,7228536,2017-06-06 07:31:38,06/10/2017 12:26:23 PM,06/10/2017 12:26:23 PM,Closed,SES Graffiti Crew - Paint - Box - Sub Panel -...,DPW Ops Queue,Graffiti,Graffiti on Signal_box,Signal_box - Not_Offensive,...,6.0,South of Market,SOUTHERN,37.774735,-122.40062,"(37.7747345, -122.40061951)",Mobile/Open311,http://mobile311.sfgov.org/reports/7228536/photos,2017,6
1,9848318,2018-11-25 13:15:00,11/25/2018 02:04:15 PM,11/25/2018 02:04:15 PM,Closed,Officer responded to request under CAD # 18329...,Parking Enforcement Dispatch Queue,Parking Enforcement,Other_Illegal_Parking,Silver - Civic - 5xcs051,...,6.0,Tenderloin,TENDERLOIN,37.783645,-122.410763,"(37.78364452, -122.41076323)",Mobile/Open311,http://mobile311.sfgov.org/reports/9848318/photos,2018,11
2,9849482,2018-11-25 19:32:40,02/27/2019 09:30:07 PM,02/27/2019 09:30:07 PM,Closed,Per process - cases are closed 10 days after a...,Clear Channel - Transit Queue,Street and Sidewalk Cleaning,Transit_shelter_platform,Transit_shelter_platform,...,6.0,Rincon Hill,SOUTHERN,37.785531,-122.396656,"(37.7855311, -122.3966557)",Mobile/Open311,http://mobile311.sfgov.org/reports/9849482/photos,2018,11
3,9681555,2018-10-16 17:25:00,10/20/2018 04:24:00 PM,10/20/2018 04:24:00 PM,Closed,,DPW Ops Queue,Graffiti,Graffiti on Sidewalk_in_front_of_property,Sidewalk_in_front_of_property - Not_Offensive,...,8.0,Duboce Triangle,PARK,37.766496,-122.431225,"(37.76649554, -122.43122539)",Mobile/Open311,http://mobile311.sfgov.org/reports/9681555/photos,2018,10
4,9253531,2018-07-11 14:21:09,07/12/2018 06:33:00 PM,07/12/2018 06:33:00 PM,Closed,,DPW Ops Queue,Litter Receptacles,Other_garbage_can_repair,,...,4.0,Outer Sunset,TARAVAL,37.763896,-122.509486,"(37.76389604, -122.50948554)",Mobile/Open311,http://mobile311.sfgov.org/reports/9253531/photos,2018,7


In [5]:
# Download thumbnails for the whole subset in chunks, pickling each chunk to
# its own gzip-compressed file.
chunksize = 1000
folder = 'bw_pickles/'
total_size = 120000

# Make sure the output folder exists; otherwise every chunk would be
# downloaded and then fail to save.
if not os.path.isdir(folder):
    os.makedirs(folder)

# Never index past the rows that are actually available.
n_rows = min(total_size, df_subset.shape[0])

for i in range(0, n_rows, chunksize):
    filename = folder + 'bw_' + str(i) + '.gzde'
    # Resume support: chunks already saved by an earlier (interrupted) run
    # are skipped, replacing the previous hand-edited start offset.
    if os.path.exists(filename):
        continue
    chunk = df_subset.iloc[i:min((i+chunksize), n_rows)].copy(deep=True)
    chunk['image'] = chunk['media_url'].map(get_image)
    try:
        chunk.to_pickle(filename, compression='gzip')
    except Exception as err:
        # Best-effort: report which chunk failed and why, then carry on.
        print('chunk ' + str(i) + ' failed: ' + str(err))

In [None]:
# Number of sampled row positions.
# NOTE(review): train_index is only defined by the ShuffleSplit cell; after a
# kernel restart that reloads df_subset from CSV this cell would presumably
# fail — confirm whether it is still needed.
len(train_index)