### Load Libraries

In [1]:
import pandas as pd
import numpy as np

In [5]:
all_columns = pd.read_csv('static/data/MERGE_CON_SOLD.csv').columns.values
all_columns

  interactivity=interactivity, compiler=compiler, result=result)


array(['Unnamed: 0', 'MLSNUM', 'STATUS', 'LISTPRICE', 'SOLDPRICE',
       'LISTDATE', 'SOLDDATE', 'EXPIREDDATE', 'DOM', 'DTO', 'ADDRESS',
       'CITY', 'STATE', 'ZIP', 'AREA', 'BEDS', 'BATHS', 'SQFT', 'AGE',
       'LOTSIZE', 'AGENTNAME', 'OFFICENAME', 'OFFICEPHONE',
       'SHOWINGINSTRUCTIONS', 'REMARKS', 'STYLE', 'LEVEL', 'GARAGE',
       'HEATING', 'COOLING', 'ELEMENTARYSCHOOL', 'JUNIORHIGHSCHOOL',
       'HIGHSCHOOL', 'OTHERFEATURES', 'PROPTYPE', 'STREETNAME',
       'HOUSENUM1', 'HOUSENUM2', 'PHOTOURL'], dtype=object)

In [14]:
## only take the columns we want people to filter by
use_cols = ['MLSNUM', 'ZIP', 'BEDS', 'BATHS', 'SQFT', 'LISTPRICE']
houses = pd.read_csv('static/data/MERGE_CON_SOLD.csv', usecols=use_cols)

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
houses.shape

(41690, 6)

In [18]:
## format the zip column properly
houses['ZIP'] = houses['ZIP'].apply(lambda x: '{:0>5s}'.format(str(x)))

In [19]:
houses.head()

Unnamed: 0,MLSNUM,LISTPRICE,ZIP,BEDS,BATHS,SQFT
0,71498924,169900,1860,2,1.0,1283
1,71500500,279900,1524,2,2.0,1368
2,71595485,449900,1879,2,2.5,1850
3,71599223,389500,1970,2,2.0,1502
4,71661544,237000,1970,2,1.0,1469


In [20]:
## export the used information to a csv file
houses.to_csv('static/data/condos.csv', index=None)

### Keras Image Processing

In [39]:
import os
import re
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications.vgg16 import preprocess_input

In [3]:
model = VGG16(weights='imagenet', include_top=False)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, None, None, 3)     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0         
__________

In [71]:
## create a regex pattern
p = re.compile('(\d{8})_img_(\d+).jpg')
## create an array to hold image features which will be turned into a dataframe (faster than starting with a DF)
data_arr = []

## actual image jpgs are not kept within project directory as they would take up too much space
## due to time constraints, only convert some pictures for now
for root, dirs, files in os.walk('/Users/adamshapiro/Desktop/con_images/imgs_con_14000'):
    for name in files:
        match = p.match(name)
        if match:
            img_dict = {
                'MLSNUM': match.group(1),
                'IMGNUM': match.group(2)
            }
            path = os.path.join(root,name)
            img = image.load_img(path, target_size=(224,224))
            img_data = image.img_to_array(img)
            img_data = np.expand_dims(img_data, axis=0)
            img_data = preprocess_input(img_data)
            
            features = model.predict(img_data)
            img_dict['FEATURES'] = features
            
            data_arr.append(img_dict)
            
data_df = pd.DataFrame(data_arr)

OSError: cannot identify image file '/Users/adamshapiro/Desktop/con_images/imgs_con_14000/71969940_img_2.jpg'

In [124]:
data_df = pd.DataFrame(data_arr)
data_df.shape

(97664, 3)

In [125]:
data_df.head()

Unnamed: 0,FEATURES,IMGNUM,MLSNUM
0,[[[[ 0. 0. 0. 0. ...,0,71482776
1,[[[[ 0. 0. 0. 0. ...,1,71482776
2,[[[[ 0. 0. 0. 0. ...,10,71482776
3,[[[[ 0. 0. 0. 0. ...,11,71482776
4,[[[[ 0. 0. 0. 0. ...,12,71482776


In [129]:
data_df['FEATURES'].iloc[0]

array([[[[ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         ...,
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          17.30591  ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          21.08236  ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
          25.857962 ,  0.       ]],

        [[ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.       ],
         ...,
         [ 0.       ,  0.       ,  0.       , ...,  0.       ,
           0.       ,  0.

In [132]:
## to save on space in the csv, only track non zero values in the features, full lists can be reconstructed later
def nonzeroes(arr, filtered):
    for idx, item in enumerate(arr):
        if type(item) is np.ndarray:
            filtered[idx] = nonzeroes(item, dict())
        else:
            if item != 0:
                filtered[idx] = item
    return filtered
    
nonzeroes(data_df['FEATURES'].iloc[0], dict())

{0: {0: {0: {4: 8.330889,
    15: 22.027956,
    26: 24.411951,
    27: 0.47096598,
    30: 2.6305861,
    45: 0.29443574,
    46: 7.4673963,
    50: 0.6815006,
    63: 22.25257,
    71: 9.140534,
    77: 3.0053635,
    89: 15.436633,
    99: 18.897943,
    101: 25.059084,
    146: 10.061992,
    150: 9.187004,
    155: 7.4037185,
    176: 12.594032,
    192: 8.626989,
    201: 19.41593,
    206: 1.394683,
    215: 13.453914,
    232: 2.4221835,
    250: 10.567966,
    253: 19.505589,
    264: 17.012402,
    290: 8.229961,
    294: 4.0897837,
    296: 2.5402634,
    312: 9.348562,
    321: 5.5878735,
    324: 18.661482,
    330: 3.2703414,
    341: 10.3687,
    352: 5.0544195,
    353: 6.830399,
    366: 2.9857101,
    377: 7.946589,
    386: 4.6760316,
    417: 5.008682,
    419: 3.194251,
    428: 33.490807,
    439: 4.7945724,
    440: 3.5270786,
    448: 4.0286927,
    450: 1.0928878,
    451: 10.421179,
    456: 6.114117,
    465: 9.300542,
    467: 10.162156,
    468: 18.884123,


In [135]:
## use limited data for now due to time constraints
df_limited = data_df.head(1000)

In [137]:
## convert the arrays to dicts
df_limited['FEATURES'] = df_limited['FEATURES'].apply(lambda x: nonzeroes(x, dict()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [142]:
df_limited['FEATURES'].iloc[0]

{0: {0: {0: {4: 8.330889,
    15: 22.027956,
    26: 24.411951,
    27: 0.47096598,
    30: 2.6305861,
    45: 0.29443574,
    46: 7.4673963,
    50: 0.6815006,
    63: 22.25257,
    71: 9.140534,
    77: 3.0053635,
    89: 15.436633,
    99: 18.897943,
    101: 25.059084,
    146: 10.061992,
    150: 9.187004,
    155: 7.4037185,
    176: 12.594032,
    192: 8.626989,
    201: 19.41593,
    206: 1.394683,
    215: 13.453914,
    232: 2.4221835,
    250: 10.567966,
    253: 19.505589,
    264: 17.012402,
    290: 8.229961,
    294: 4.0897837,
    296: 2.5402634,
    312: 9.348562,
    321: 5.5878735,
    324: 18.661482,
    330: 3.2703414,
    341: 10.3687,
    352: 5.0544195,
    353: 6.830399,
    366: 2.9857101,
    377: 7.946589,
    386: 4.6760316,
    417: 5.008682,
    419: 3.194251,
    428: 33.490807,
    439: 4.7945724,
    440: 3.5270786,
    448: 4.0286927,
    450: 1.0928878,
    451: 10.421179,
    456: 6.114117,
    465: 9.300542,
    467: 10.162156,
    468: 18.884123,


In [143]:
## use | instead of , to avoid confusion parsing the dicts
df_limited.to_csv('static/data/condo_images.csv', sep='|', index=False)