**Packages**

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import tifffile
from sklearn.model_selection import train_test_split

**Data Pre-Processing**

In [2]:
# Tile-level table: one row per map tile.
TILES_CSV = r'./Data/Tiles_binned_zipcode.csv'
tiles = pd.read_csv(TILES_CSV)
tiles.head(n=5)

Unnamed: 0,Tile_ID,Long2,Lat2,Long1,Lat1,Mid_lat,Mid_long,Stop_Signs,Paving_historical,Paving_future,...,94129,94130,94131,94132,94133,94134,94141,94143,94158,94188
0,36,-122.514446,37.779636,-122.513306,37.778732,37.779184,-122.513876,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,37,-122.514446,37.778732,-122.513306,37.777829,37.77828,-122.513876,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,151,-122.513306,37.779636,-122.512166,37.778732,37.779184,-122.512736,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,152,-122.513306,37.778732,-122.512166,37.777829,37.77828,-122.512736,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,153,-122.513306,37.777829,-122.512166,37.776925,37.777377,-122.512736,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Show every column name so the feature/target selection in the split can be checked against it.
tiles.columns

Index(['Tile_ID', 'Long2', 'Lat2', 'Long1', 'Lat1', 'Mid_lat', 'Mid_long',
       'Stop_Signs', 'Paving_historical', 'Paving_future', 'Bus_stop',
       'Collisions_Future', 'Collisions_Historical', 'bin', 'RTTYP_I',
       'RTTYP_M', 'RTTYP_O', 'RTTYP_S', 'RTTYP_U', 'Collisions_Future_binary',
       'Collisions_Historical_binary', 'bins_numeric', 'zip_code', '94101',
       '94102', '94104', '94105', '94107', '94108', '94109', '94110', '94111',
       '94112', '94114', '94115', '94116', '94117', '94118', '94121', '94122',
       '94123', '94124', '94127', '94129', '94130', '94131', '94132', '94133',
       '94134', '94141', '94143', '94158', '94188'],
      dtype='object')

Split into training and test sets (80% / 20%, shuffled with a fixed `random_state` so the split is reproducible)

In [4]:
# The label is the 'bin' column; every other column of `tiles` is kept as a feature,
# in its original order.
# NOTE(review): the feature frame still contains 'bins_numeric', 'Collisions_Future'
# and 'Collisions_Future_binary', which look related to the 'bin' target -- confirm
# they are excluded before any model is trained on these features.
target_column = 'bin'
x_train, x_test, y_train, y_test = train_test_split(
    tiles.drop(columns=[target_column]),
    tiles[target_column],
    test_size=0.20,
    random_state=104,
    shuffle=True,
)

In [5]:
# (rows, columns) of the training features.
x_train.shape

(8376, 52)

In [6]:
# (rows, columns) of the test features.
x_test.shape

(2095, 52)

In [7]:
# Preview the first rows of the shuffled training split.
x_train.head(n=5)

Unnamed: 0,Tile_ID,Long2,Lat2,Long1,Lat1,Mid_lat,Mid_long,Stop_Signs,Paving_historical,Paving_future,...,94129,94130,94131,94132,94133,94134,94141,94143,94158,94188
8419,10993,-122.406135,37.75072,-122.404995,37.749817,37.750269,-122.405565,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
10129,13338,-122.383333,37.710058,-122.382193,37.709155,37.709606,-122.382763,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7641,10163,-122.414116,37.773311,-122.412976,37.772407,37.772859,-122.413546,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5215,7523,-122.440338,37.768793,-122.439198,37.767889,37.768341,-122.439768,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
7784,10314,-122.412976,37.740781,-122.411836,37.739877,37.740329,-122.412406,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Preview the first rows of the shuffled test split.
x_test.head(n=5)

Unnamed: 0,Tile_ID,Long2,Lat2,Long1,Lat1,Mid_lat,Mid_long,Stop_Signs,Paving_historical,Paving_future,...,94129,94130,94131,94132,94133,94134,94141,94143,94158,94188
920,2176,-122.493924,37.716383,-122.492784,37.71548,37.715932,-122.493354,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
6011,8372,-122.432358,37.729034,-122.431218,37.72813,37.728582,-122.431788,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6218,8602,-122.430077,37.729034,-122.428937,37.72813,37.728582,-122.429507,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
10331,14611,-122.369651,37.806744,-122.368511,37.80584,37.806292,-122.369081,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0
9352,12070,-122.395874,37.712769,-122.394734,37.711865,37.712317,-122.395304,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,0


Image Pre-Processing of Train Set

In [9]:
IMAGE_PATH = './Satellite Imagery/Satellite Images Tiled/'

def preprocess_data_part1(IMAGE_PATH, tile_ids=None, height=148, width=188):
    """ Load and crop the satellite image of every requested tile.

    Each tile is read from ``IMAGE_PATH + '<Tile_ID>.tif'`` and cropped to
    ``height`` x ``width`` so that all images can be stacked into one array.

    Params:
    -------
    IMAGE_PATH (str): path to directory with images (including trailing separator).
    tile_ids (iterable, optional): tile IDs to load, in output order.
        Defaults to the training split, ``x_train['Tile_ID']``.
    height (int): number of pixel rows kept from each image (default 148).
    width (int): number of pixel columns kept from each image (default 188).

    Returns:
    --------
    images_mini  (np.ndarray): Images of shape (N, height, width, C), where N is
        the number of tile IDs and C is the channel count of the TIFF files.

    Raises:
    -------
    ValueError: from ``np.stack`` if ``tile_ids`` is empty or if the cropped
        images do not all share the same shape (e.g. an image smaller than the crop).
    """
    if tile_ids is None:
        # Backward-compatible default: the training split defined earlier in the notebook.
        tile_ids = x_train['Tile_ID']

    # Cropping makes all images the same size (they sometimes differ by 1 pixel).
    data_mini = [
        tifffile.imread(IMAGE_PATH + str(tile_id) + '.tif')[:height, :width, :]
        for tile_id in tile_ids
    ]

    # Stack the per-tile images along a new leading axis.
    images_mini = np.stack(data_mini)

    return images_mini

In [10]:
# Build the image stack for the training tiles and confirm its dimensions.
images_mini_x_train = preprocess_data_part1(IMAGE_PATH)
images_mini_x_train.shape

(8376, 148, 188, 4)

Image Pre-Processing of Test Set

In [11]:
IMAGE_PATH = './Satellite Imagery/Satellite Images Tiled/'

def preprocess_data_part2(IMAGE_PATH, tile_ids=None, height=148, width=188):
    """ Load and crop the satellite image of every requested tile.

    Each tile is read from ``IMAGE_PATH + '<Tile_ID>.tif'`` and cropped to
    ``height`` x ``width`` so that all images can be stacked into one array.

    Params:
    -------
    IMAGE_PATH (str): path to directory with images (including trailing separator).
    tile_ids (iterable, optional): tile IDs to load, in output order.
        Defaults to the test split, ``x_test['Tile_ID']``.
    height (int): number of pixel rows kept from each image (default 148).
    width (int): number of pixel columns kept from each image (default 188).

    Returns:
    --------
    images_mini  (np.ndarray): Images of shape (N, height, width, C), where N is
        the number of tile IDs and C is the channel count of the TIFF files.

    Raises:
    -------
    ValueError: from ``np.stack`` if ``tile_ids`` is empty or if the cropped
        images do not all share the same shape (e.g. an image smaller than the crop).
    """
    if tile_ids is None:
        # Backward-compatible default: the test split defined earlier in the notebook.
        tile_ids = x_test['Tile_ID']

    # Cropping makes all images the same size (they sometimes differ by 1 pixel).
    data_mini = [
        tifffile.imread(IMAGE_PATH + str(tile_id) + '.tif')[:height, :width, :]
        for tile_id in tile_ids
    ]

    # Stack the per-tile images along a new leading axis.
    images_mini = np.stack(data_mini)

    return images_mini

In [12]:
# Build the image stack for the test tiles and confirm its dimensions.
images_mini_x_test = preprocess_data_part2(IMAGE_PATH)
images_mini_x_test.shape

(2095, 148, 188, 4)

Street Data Pre-Processing of Train Set

In [13]:
# Tabular (non-image) features of the training split as a float32 matrix.
# x_train already holds exactly the feature columns chosen at split time, so the
# whole frame is converted instead of repeating the column list a third time.
# Note: float32 rounds the coordinate columns slightly (visible in later previews).
street = np.asarray(x_train).astype('float32')

# Insert two singleton axes so each row becomes a (1, 1, n_features) block:
# result shape is (n_rows, 1, 1, n_features). Vectorised replacement for
# wrapping every row in nested lists and stacking them.
street_mini_x_train = street[:, np.newaxis, np.newaxis, :]
street_mini_x_train.shape

(8376, 1, 1, 52)

Street Data Pre-Processing of Test Set

In [14]:
# Tabular (non-image) features of the test split as a float32 matrix.
# x_test already holds exactly the feature columns chosen at split time, so the
# whole frame is converted instead of repeating the column list again.
# Note: float32 rounds the coordinate columns slightly (visible in later previews).
street = np.asarray(x_test).astype('float32')

# Insert two singleton axes so each row becomes a (1, 1, n_features) block:
# result shape is (n_rows, 1, 1, n_features). Vectorised replacement for
# wrapping every row in nested lists and stacking them.
street_mini_x_test = street[:, np.newaxis, np.newaxis, :]
street_mini_x_test.shape

(2095, 1, 1, 52)

In [15]:
# Both arrays must describe the same tiles in the same order before they are joined.
assert len(street_mini_x_train) == len(images_mini_x_train), \
    "street features and images have different numbers of training rows"

# Flatten each sample to one row and place the tabular features in front of the
# pixel values. Sizes are taken from the arrays themselves rather than hard-coded,
# so the cell keeps working if the split size, feature count or crop size changes.
# np.hstack returns a single-dtype array, so the result can be much larger in
# memory than the original image stack.
x_train_array = np.hstack(
    (street_mini_x_train.reshape(len(street_mini_x_train), -1),
     images_mini_x_train.reshape(len(images_mini_x_train), -1))
)

x_train_array.shape

(8376, 111348)

In [16]:
# Both arrays must describe the same tiles in the same order before they are joined.
assert len(street_mini_x_test) == len(images_mini_x_test), \
    "street features and images have different numbers of test rows"

# Flatten each sample to one row and place the tabular features in front of the
# pixel values. Sizes are taken from the arrays themselves rather than hard-coded,
# so the cell keeps working if the split size, feature count or crop size changes.
x_test_array = np.hstack(
    (street_mini_x_test.reshape(len(street_mini_x_test), -1),
     images_mini_x_test.reshape(len(images_mini_x_test), -1))
)
x_test_array.shape

(2095, 111348)

In [17]:
# Wrap the combined training matrix in a DataFrame (columns are positional integers:
# tabular features first, then flattened pixel values).
x_train_df = pd.DataFrame(data=x_train_array)
x_train_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111338,111339,111340,111341,111342,111343,111344,111345,111346,111347
0,10993.0,-122.406136,37.750721,-122.404991,37.749817,37.750271,-122.405563,0.0,0.0,0.0,...,84.0,129.0,158.0,154.0,104.0,175.0,149.0,145.0,106.0,177.0
1,13338.0,-122.383331,37.710056,-122.382195,37.709156,37.709606,-122.382767,0.0,0.0,0.0,...,114.0,118.0,147.0,141.0,113.0,132.0,155.0,147.0,119.0,130.0
2,10163.0,-122.414116,37.773312,-122.412979,37.772408,37.772858,-122.413544,0.0,0.0,0.0,...,161.0,112.0,166.0,169.0,159.0,107.0,173.0,172.0,163.0,120.0
3,7523.0,-122.440338,37.768791,-122.439201,37.767887,37.768341,-122.439766,1.0,0.0,0.0,...,52.0,98.0,64.0,81.0,55.0,101.0,54.0,66.0,49.0,78.0
4,10314.0,-122.412979,37.74078,-122.411835,37.739876,37.74033,-122.412407,2.0,0.0,0.0,...,60.0,26.0,53.0,64.0,57.0,32.0,60.0,68.0,61.0,36.0


In [18]:
# Wrap the combined test matrix in a DataFrame (columns are positional integers:
# tabular features first, then flattened pixel values).
x_test_df = pd.DataFrame(data=x_test_array)
x_test_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111338,111339,111340,111341,111342,111343,111344,111345,111346,111347
0,2176.0,-122.493927,37.716385,-122.492783,37.715481,37.715931,-122.493355,0.0,0.0,0.0,...,50.0,142.0,81.0,86.0,49.0,139.0,86.0,90.0,53.0,142.0
1,8372.0,-122.432358,37.729034,-122.431221,37.72813,37.72858,-122.431786,1.0,0.0,0.0,...,135.0,151.0,186.0,171.0,136.0,158.0,176.0,160.0,125.0,132.0
2,8602.0,-122.430077,37.729034,-122.42894,37.72813,37.72858,-122.429504,0.0,0.0,0.0,...,49.0,255.0,225.0,34.0,49.0,255.0,225.0,34.0,49.0,255.0
3,14611.0,-122.369652,37.806744,-122.368515,37.80584,37.806293,-122.36908,0.0,0.0,0.0,...,95.0,14.0,80.0,111.0,93.0,17.0,83.0,113.0,95.0,16.0
4,12070.0,-122.395874,37.712769,-122.394737,37.711864,37.712318,-122.395302,0.0,0.0,0.0,...,59.0,87.0,56.0,66.0,57.0,67.0,53.0,61.0,57.0,45.0


In [None]:
# Write the combined training matrix to the project's Data directory.
# A project-relative path replaces the machine-specific absolute path so the
# notebook runs on any checkout.
# NOTE(review): assumes the notebook is run from the project root, as the relative
# './Data/Tiles_binned_zipcode.csv' read at the top already does -- confirm.
# The frame has one column per feature and per pixel value, so this CSV is very large.
file_path = Path('./Data') / 'x_train.csv'
file_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
x_train_df.to_csv(file_path, index=False)

In [None]:
# Write the combined test matrix to the project's Data directory.
# A project-relative path replaces the machine-specific absolute path so the
# notebook runs on any checkout.
# NOTE(review): assumes the notebook is run from the project root, as the relative
# './Data/Tiles_binned_zipcode.csv' read at the top already does -- confirm.
file_path = Path('./Data') / 'x_test.csv'
file_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder if it is missing
x_test_df.to_csv(file_path, index=False)