# Refactor Data

Our goal here is to refactor the data to sensible forms for data processing and exploration.  When we're finished, we'll output a new CSV file.

In [2318]:
import pandas as pd

In [2319]:
# Base name (without extension) of the raw CSV to load
filename = '2019-06-20 16:42:57.005476'
df = pd.read_csv('{}.csv'.format(filename))

# Preview Raw Data

In [2320]:
pd.set_option('display.max_columns', 25)

In [2321]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,address,bathrooms,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built
0,0,"11630 Ashcroft, Houston, TX 77035",.2 Full & 1 Half Bath(s),4 Bed(s),".2,747 Building Sqft.","Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2 Garage(s) / Attached/Detached,background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);,"$ 475,000",For Sale,/11630-ashcroft/sale_69756841,"8,344 Lot Sqft.",MLS# 69756841,No Private Pool,/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",/stephanie-finch/agent_SFinch,Stephanie Finch,.2 Stories,,Built in 1960


In [2322]:
df.columns

Index(['Unnamed: 0', 'address', 'bathrooms', 'bedrooms', 'building_sqft',
       'desc', 'garages', 'img_url', 'list_price', 'list_status', 'list_url',
       'lot_sqft', 'mls_num', 'pool', 'real_comp_link', 'real_comp_name',
       'real_link', 'real_name', 'stories', 'unknown_data', 'year_built'],
      dtype='object')

In [2323]:
# Drop the redundant 'Unnamed: 0' index column
df = df.drop(columns='Unnamed: 0')

# Bathrooms Column

#### Split into full bathrooms and half bathrooms columns.

In [2324]:
df.bathrooms.unique()

array(['.2 Full & 1 Half Bath(s)', '.3 Full & 1 Half Bath(s)',
       '.2 Full  Bath(s)', '.1 Full  Bath(s)', '.3 Full  Bath(s)',
       '.1 Full & 1 Half Bath(s)', '.5 Full  Bath(s)',
       '.2 Full & 2 Half Bath(s)', nan, '.4 Full  Bath(s)',
       '.3 Full & 2 Half Bath(s)', '.4 Full & 2 Half Bath(s)',
       '.5 Full & 1 Half Bath(s)', '.4 Full & 1 Half Bath(s)',
       '.2 Full & 5 Half Bath(s)', '.6 Full  Bath(s)',
       '.2 Full & 3 Half Bath(s)', '.2 Full & 4 Half Bath(s)',
       '.5 Full & 2 Half Bath(s)', '.1 Full & 2 Half Bath(s)'],
      dtype=object)

In [2325]:
# Cast column to string so we can use string operations on every row
# (missing values become the literal string 'nan').
df.bathrooms = df.bathrooms.astype(str)
# Remove the leading '.' and the literal 'Bath(s)' label.  str.strip('Bath(s)')
# treats its argument as a set of characters rather than a suffix, so the label
# is removed with replace() instead.
df.bathrooms = df.bathrooms.map(lambda x: x.strip('.').replace('Bath(s)', ''))
# Remove the remaining words and the '&', leaving only the counts (e.g. '2   1').
df.bathrooms = df.bathrooms.map(lambda x: x.replace('Half', "").replace('Full', "").replace("&", ""))
df.bathrooms = df.bathrooms.map(lambda x: x.strip())

#### Fancy little function here that splits the single column into 2 columns with a max of '1' split per row.

In [2326]:
# Split each value once, on the first space: piece 0 is the full-bath count and
# piece 1 (when present) is the half-bath count.  Unpacking the .str accessor
# directly and passing n positionally are not supported by current pandas, so
# the pieces are selected explicitly; rows with no second piece get NaN.
bath_parts = df.bathrooms.str.split(' ', n=1)
df['full_baths'] = bath_parts.str[0]
df['half_baths'] = bath_parts.str[1]

In [2327]:
# The combined bathrooms column is no longer needed once it has been split
df = df.drop(columns='bathrooms')

In [2328]:
df.full_baths.unique()

array(['2', '3', '1', '5', 'nan', '4', '6'], dtype=object)

In [2329]:
df.half_baths.unique()

array(['  1', nan, '  2', '  5', '  3', '  4'], dtype=object)

# Bedrooms Column

In [2330]:
df.bedrooms.unique()

array(['4 Bed(s)', '3 Bed(s)', '4-5  Bed(s)', '2 Bed(s)', '3-4  Bed(s)',
       '6 Bed(s)', '2-3  Bed(s)', '5 Bed(s)', '1 Bed(s)', nan, '7 Bed(s)',
       '3-5  Bed(s)', '5-6  Bed(s)', '8 Bed(s)', '4-6  Bed(s)',
       '1-2  Bed(s)', '6-8  Bed(s)', '9 Bed(s)', '1-4  Bed(s)'],
      dtype=object)

In [2331]:
# Strip the 'Bed(s)' label and surrounding whitespace, leaving e.g. '4' or '4-5'
df.bedrooms = (
    df.bedrooms
    .astype(str)
    .map(lambda beds: beds.replace('Bed(s)', ''))
    .str.strip()
)

In [2332]:
# Helper function that takes averages of bedrooms data
# that take the format '1-4'
def avgBeds(bedrooms):
    """Collapse a bedroom range such as '1-4' into the average of its ends.

    Parameters
    ----------
    bedrooms : str
        A cleaned bedroom value: either a single count ('3') or a
        'low-high' range ('4-5').

    Returns
    -------
    float or str
        The mean of the first and last bounds when `bedrooms` contains '-',
        otherwise `bedrooms` unchanged.
    """
    if '-' in bedrooms:
        # Split on the dash rather than indexing single characters so that
        # multi-digit bounds (e.g. '10-12') are parsed correctly.
        bounds = bedrooms.split('-')
        return (float(bounds[0]) + float(bounds[-1])) / 2.0
    return bedrooms

In [2333]:
# Replace every 'low-high' bedroom range (e.g. '1-4') with its average
df.bedrooms = df.bedrooms.map(avgBeds)

In [2334]:
df.bedrooms[:10]

0    4  
1    4  
2    4  
3    4  
4    3  
5    3  
6    4  
7    3  
8    4.5
9    2  
Name: bedrooms, dtype: object

# Building Sqft Column

In [2335]:
df.building_sqft[:5]

0    .2,747 Building Sqft.
1    .3,500 Building Sqft.
2    .3,346 Building Sqft.
3    .2,308 Building Sqft.
4    .2,259 Building Sqft.
Name: building_sqft, dtype: object

In [2336]:
# Drop the leading '.', the 'Building Sqft.' label and thousands separators,
# then convert what remains to a float.
df.building_sqft = (
    df.building_sqft
    .astype(str)
    .map(lambda raw: raw.lstrip('.').replace('Building Sqft.', '').replace(',', ''))
    .astype(float)
)

In [2337]:
df.building_sqft[:5]

0    2747.0
1    3500.0
2    3346.0
3    2308.0
4    2259.0
Name: building_sqft, dtype: float64

# Garages Column

In [2338]:
df.garages[0:5]

0    2 Garage(s) / Attached/Detached
1    3 Garage(s) / Attached,Tandem  
2    2 Garage(s) / Attached         
3    1 Garage(s) / Detached         
4    2 Garage(s) / Attached         
Name: garages, dtype: object

Let's split at '/' and then worry about uniqueness later

In [2339]:
# Split once on the first '/': piece 0 is the garage count text and piece 1
# (when present) is the garage type.  Unpacking the .str accessor directly and
# passing n positionally are not supported by current pandas, so the pieces are
# selected explicitly; rows with no '/' get NaN for garage_type.
garage_parts = df.garages.str.split('/', n=1)
df['garages'] = garage_parts.str[0]
df['garage_type'] = garage_parts.str[1]

In [2340]:
df.garages.unique()

array(['2 Garage(s) ', '3 Garage(s) ', '1 Garage(s) ', nan,
       '22 Garage(s) ', '32 Garage(s) ', '4 Garage(s) ', '60 Garage(s) ',
       '40 Garage(s) ', '12 Garage(s) ', '18 Garage(s) ', '5 Garage(s) ',
       '6 Garage(s) ', '8234 Garage(s) ', '50 Garage(s) ',
       '66 Garage(s) ', '16 Garage(s) ', '48 Garage(s) ', '43 Garage(s) ',
       '21 Garage(s) ', '36 Garage(s) '], dtype=object)

#### Looks like we've got some crazy garage numbers and 'nan'.  I checked a few; they're obviously inaccurate.  How many do we have?

Let's fill the nan's before we cast to string.

In [2341]:
# Change missing garage values to 0.  Mostly to play nice with seaborn when we graph.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `df.garages` is chained assignment and may modify a temporary object
# instead of df itself.
df['garages'] = df['garages'].fillna(0)

In [2342]:
# Remove the 'Garage(s)' label and surrounding whitespace, leaving the count as a string
df.garages = (
    df.garages
    .astype(str)
    .map(lambda count: count.replace('Garage(s)', '').strip())
)

In [2343]:
df.garages.value_counts()

2       7909
0       1392
1       1195
3       510 
4       34  
5       4   
6       4   
60      3   
12      2   
8234    2   
21      2   
50      2   
18      2   
22      2   
43      1   
32      1   
16      1   
66      1   
40      1   
48      1   
36      1   
Name: garages, dtype: int64

Not that many crazy garages out of 11,000 homes... let's drop any rows whose garage count has two or more digits.

In [2344]:
# Keep only the rows whose garage count is a single character long.
# Important: .copy() makes df2 an independent DataFrame instead of a slice of
# df, so the column assignments in later cells operate on real data rather
# than on a view (which would raise SettingWithCopyWarning).
df2 = df[df.garages.map(lambda x: len(x) <= 1)].copy()

In [2345]:
# Swap back to original df variable so we don't get confused.
df = df2
df.garages.value_counts()

2    7909
0    1392
1    1195
3    510 
4    34  
6    4   
5    4   
Name: garages, dtype: int64

In [2346]:
# Sanity check: preview the first rows after the garage cleanup
df.head()

Unnamed: 0,address,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,full_baths,half_baths,garage_type
0,"11630 Ashcroft, Houston, TX 77035",4,2747.0,"Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2,background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);,"$ 475,000",For Sale,/11630-ashcroft/sale_69756841,"8,344 Lot Sqft.",MLS# 69756841,No Private Pool,/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",/stephanie-finch/agent_SFinch,Stephanie Finch,.2 Stories,,Built in 1960,2,1,Attached/Detached
1,"18111 Ponte Vecchio, Houston, TX 77044",4,3500.0,"Single-Family Property, Traditional Style in Bridges On Lake Houston Subdivision in Atascocita South (Market Area)",3,background-image:url(https://photos.harstatic.com/172364270/lr/img-1.jpeg?ts=2019-04-11T19:58:44.963);,"$ 475,000",For Sale,/18111-ponte-vecchio/sale_45037765,,MLS# 45037765,No Private Pool,/keller-williams-realty-the-woodlands/broker_KWWD01,Keller Williams Realty The Woodlands,/garland-bennett/agent_GBENNETT,Garland Bennett,.1 Stories,,Built in 2018,3,1,"Attached,Tandem"
2,"5910 Caddo Terrace Lane, Houston, TX 77041",4,3346.0,"Single-Family Property, Traditional Style in Lakes On Eldridge North Sec Subdivision in Eldridge North (Market Area)",2,background-image:url(https://photos.harstatic.com/172320148/lr/img-1.jpeg?ts=2019-04-09T23:55:57.253);,"$ 475,000",For Sale,/5910-caddo-terrace-lane/sale_52704869,"9,410 Lot Sqft.",MLS# 52704869,Has Private Pool,/keller-williams-realty-metropolitan/broker_KWHM01,Keller Williams Realty Metropolitan,/lorna-ramsay/agent_cornhill,Lorna Ramsay,.2 Stories,,Built in 2003,3,1,Attached
3,"14722 Cindywood Drive, Houston, TX 77079",4,2308.0,"Single-Family Property, Traditional Style in Westchester Sec 02 Subdivision in Memorial West (Market Area)",1,background-image:url(https://photos.harstatic.com/171900008/lr/img-1.jpeg?ts=2019-03-29T11:36:32.677);,"$ 475,000",For Sale,/14722-cindywood-drive/sale_29834219,"8,064 Lot Sqft.",MLS# 29834219,No Private Pool,/realty-associates-------------/broker_PBME01,Realty Associates,/richard-gannon/agent_RGANNON,Richard Gannon,.1 Stories,,Built in 1969,2,1,Detached
4,"3424 Timbergrove Heights, Houston, TX 77008",3,2259.0,"Single-Family Property, Contemporary/Modern,French,Traditional Style in Timbergrove Heights Subdivision in Timbergrove/Lazybrook (Market Area)",2,background-image:url(https://photos.harstatic.com/171615142/lr/img-1.jpeg?ts=2019-03-28T11:49:12.817);,"$ 475,000",Under Contract - Pending,/3424-timbergrove-heights/sale_40531170,,MLS# 40531170,No Private Pool,/intown-homes/broker_INTW01,Intown Homes,/emily-wang/agent_emwang,Emily Wang,.3 Stories,,Built in 2019,3,1,Attached


#### The garage multi-label data might require some cleanup down the road...

In [2347]:
df2.garage_type.unique()

array([' Attached/Detached', ' Attached,Tandem', ' Attached', ' Detached',
       ' Attached,Oversized', nan, ' Attached,Detached',
       ' Attached,Oversized,Tandem', ' Attached/Detached,Oversized',
       ' Detached,Oversized', ' Attached,Attached/Detached', ' Oversized',
       ' Tandem', ' Attached/Detached,Detached,Oversized',
       ' Attached,Detached,Oversized', ' Attached/Detached,Detached',
       ' Attached/Detached,Tandem', ' Detached,Tandem',
       ' Attached,Attached/Detached,Oversized',
       ' Detached,Oversized,Tandem',
       ' Attached,Attached/Detached,Detached'], dtype=object)

# IMG URL Column

In [2348]:
df.img_url[0]

'background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);'

In [2349]:
def cleanURL(img_url):
    """Return `img_url` cut off at its first '?'; unchanged when it has none."""
    if '?' not in img_url:
        return img_url
    # Everything from the first '?' onwards (the query string and whatever
    # follows it) is discarded.
    return img_url.split('?', 1)[0]

In [2350]:
# Reduce the CSS 'background-image:url(...)' value to the bare image URL.
df.img_url = df.img_url.astype(str)
# str.lstrip() removes any leading characters drawn from its argument (a set
# of characters), not a literal prefix, so it could also eat the beginning of
# the URL itself.  Slice the exact prefix off instead; values that do not start
# with it (such as the 'nan' placeholder produced by astype(str)) are left
# unchanged.
css_prefix = 'background-image:url('
df.img_url = df.img_url.map(
    lambda x: x[len(css_prefix):] if x.startswith(css_prefix) else x
)
# Drop the query string and everything after it
df.img_url = df.img_url.apply(cleanURL)

In [2351]:
# Turn off truncating of URLs
# NOTE(review): -1 is the legacy "no limit" value for this option; newer pandas
# releases appear to expect None instead -- confirm against the installed version.
pd.set_option('display.max_colwidth', -1)

In [2352]:
df.img_url[0:5]

0    https://photos.harstatic.com/172553809/lr/img-1.jpeg
1    https://photos.harstatic.com/172364270/lr/img-1.jpeg
2    https://photos.harstatic.com/172320148/lr/img-1.jpeg
3    https://photos.harstatic.com/171900008/lr/img-1.jpeg
4    https://photos.harstatic.com/171615142/lr/img-1.jpeg
Name: img_url, dtype: object

# List Price Column

In [2353]:
df.list_price[0:5]

0     $ 475,000   
1     $ 475,000   
2     $ 475,000   
3     $ 475,000   
4     $ 475,000   
Name: list_price, dtype: object

In [2354]:
# Remove the currency symbol and thousands separators, then convert to int
df.list_price = (
    df.list_price
    .map(lambda price: price.replace('$', '').replace(',', ''))
    .astype(int)
)

In [2355]:
df.list_price[0:5]

0    475000
1    475000
2    475000
3    475000
4    475000
Name: list_price, dtype: int64

# List Status Column

In [2356]:
df.list_status.unique()

array(['For Sale', 'Under Contract - Pending',
       'Under Contract - Option Pending',
       'Under Contract - Pending Continue to Show'], dtype=object)

# List URL Column

In [2357]:
df.list_url[0:5]

0    /11630-ashcroft/sale_69756841          
1    /18111-ponte-vecchio/sale_45037765     
2    /5910-caddo-terrace-lane/sale_52704869 
3    /14722-cindywood-drive/sale_29834219   
4    /3424-timbergrove-heights/sale_40531170
Name: list_url, dtype: object

In [2358]:
# Turn the site-relative listing path into an absolute URL
df['list_url'] = df['list_url'].map(lambda path: 'http://www.har.com' + path)

In [2359]:
df.list_url[0:5]

0    http://www.har.com/11630-ashcroft/sale_69756841          
1    http://www.har.com/18111-ponte-vecchio/sale_45037765     
2    http://www.har.com/5910-caddo-terrace-lane/sale_52704869 
3    http://www.har.com/14722-cindywood-drive/sale_29834219   
4    http://www.har.com/3424-timbergrove-heights/sale_40531170
Name: list_url, dtype: object

# Lot Sqft Column

In [2360]:
df.lot_sqft[0:5]

0    8,344 Lot Sqft.
1    NaN            
2    9,410 Lot Sqft.
3    8,064 Lot Sqft.
4    NaN            
Name: lot_sqft, dtype: object

In [2361]:
# Drop the 'Lot Sqft.' label and thousands separators, then convert to float
# (missing values pass through as NaN).
df.lot_sqft = (
    df.lot_sqft
    .astype(str)
    .map(lambda raw: raw.replace('Lot Sqft.', '').replace(',', ''))
    .astype(float)
)

In [2362]:
df.lot_sqft[0:5]

0    8344.0
1   NaN    
2    9410.0
3    8064.0
4   NaN    
Name: lot_sqft, dtype: float64

# MLS Number Column

In [2363]:
df.mls_num[0:5]

0    MLS# 69756841
1    MLS# 45037765
2    MLS# 52704869
3    MLS# 29834219
4    MLS# 40531170
Name: mls_num, dtype: object

In [2364]:
# Keep only the number by removing the 'MLS# ' label
df['mls_num'] = df['mls_num'].map(lambda mls: mls.replace('MLS# ', ''))

In [2365]:
df.mls_num[0:5]

0    69756841
1    45037765
2    52704869
3    29834219
4    40531170
Name: mls_num, dtype: object

# Pool Column

In [2366]:
df.pool.unique()

array(['No Private Pool ', ' Has Private Pool'], dtype=object)

In [2367]:
# Remove surrounding whitespace from both categories: the raw values are
# ' Has Private Pool' (leading space) and 'No Private Pool ' (trailing space),
# so stripping only the left side would leave the second one untidy.
df.pool = df.pool.str.strip()

In [2368]:
df.pool[0:5]

0    No Private Pool 
1    No Private Pool 
2    Has Private Pool
3    No Private Pool 
4    No Private Pool 
Name: pool, dtype: object

# Realtor Company Link Column

In [2369]:
df.real_comp_link[0:5]

0    /compass-re-texas-llc/broker_CMTX01                
1    /keller-williams-realty-the-woodlands/broker_KWWD01
2    /keller-williams-realty-metropolitan/broker_KWHM01 
3    /realty-associates-------------/broker_PBME01      
4    /intown-homes/broker_INTW01                        
Name: real_comp_link, dtype: object

In [2370]:
# Turn the site-relative broker path into an absolute URL
df['real_comp_link'] = df['real_comp_link'].map(lambda path: 'http://www.har.com' + path)

In [2371]:
df.real_comp_link[0:5]

0    http://www.har.com/compass-re-texas-llc/broker_CMTX01                
1    http://www.har.com/keller-williams-realty-the-woodlands/broker_KWWD01
2    http://www.har.com/keller-williams-realty-metropolitan/broker_KWHM01 
3    http://www.har.com/realty-associates-------------/broker_PBME01      
4    http://www.har.com/intown-homes/broker_INTW01                        
Name: real_comp_link, dtype: object

# Realtor Company Name Column

In [2372]:
df.real_comp_name[0:5]

0    Compass RE Texas, LLC               
1    Keller Williams Realty The Woodlands
2    Keller Williams Realty Metropolitan 
3    Realty Associates                   
4    Intown Homes                        
Name: real_comp_name, dtype: object

# Realtor Link

In [2373]:
df.real_link[0:5]

0    /stephanie-finch/agent_SFinch  
1    /garland-bennett/agent_GBENNETT
2    /lorna-ramsay/agent_cornhill   
3    /richard-gannon/agent_RGANNON  
4    /emily-wang/agent_emwang       
Name: real_link, dtype: object

In [2374]:
# Turn the site-relative agent path into an absolute URL
df['real_link'] = df['real_link'].map(lambda path: 'http://www.har.com' + path)

In [2375]:
df.real_link[0:5]

0    http://www.har.com/stephanie-finch/agent_SFinch  
1    http://www.har.com/garland-bennett/agent_GBENNETT
2    http://www.har.com/lorna-ramsay/agent_cornhill   
3    http://www.har.com/richard-gannon/agent_RGANNON  
4    http://www.har.com/emily-wang/agent_emwang       
Name: real_link, dtype: object

# Realtor Name

In [2376]:
df.real_name[0:5]

0    Stephanie Finch
1    Garland Bennett
2    Lorna Ramsay   
3    Richard Gannon 
4    Emily Wang     
Name: real_name, dtype: object

# Stories Column

In [2377]:
df.stories.unique()

array(['.2 Stories', '.1 Stories', '.3 Stories', '.4 Stories',
       '.1.5 Stories', nan, '.2.5 Stories'], dtype=object)

In [2378]:
# Cast column to string so we can use string operations on every row
df.stories = df.stories.astype(str)
# Drop the leading '.' and the literal 'Stories' label.  str.rstrip('Stories')
# would treat its argument as a set of characters rather than a suffix, so the
# label is removed with replace(), as is done for the sqft columns.
df.stories = df.stories.map(lambda x: x.lstrip('.').replace('Stories', ''))
# Cast column back to float (the 'nan' placeholder becomes NaN again)
df.stories = df.stories.astype(float)

In [2379]:
df.stories[:5]

0    2.0
1    1.0
2    2.0
3    1.0
4    3.0
Name: stories, dtype: float64

In [2380]:
df.stories.unique()

array([2. , 1. , 3. , 4. , 1.5, nan, 2.5])

# Unknown Column

Might decide to return and use acres...

In [2381]:
df.unknown_data.unique()

array([nan, '1.92 Acre(s)', '0.43 Acre(s)', '2.71 Acre(s)',
       '1.64 Acre(s)', '0.55 Acre(s)', '0.96 Acre(s)', '0.61 Acre(s)',
       '2 Acre(s)', '5.89 Acre(s)', '1.01 Acre(s)', '1.06 Acre(s)',
       '4.81 Acre(s)', '0.69 Acre(s)', '1.37 Acre(s)', '0.18 Acre(s)',
       '2.13 Acre(s)', '0.16 Acre(s)', '1.53 Acre(s)', '0.56 Acre(s)',
       '0.07 Acre(s)', '0.13 Acre(s)', '0.30 Acre(s)', '1.30 Acre(s)',
       '1 Acre(s)', '1.26 Acre(s)', '1.10 Acre(s)', '2.83 Acre(s)',
       '1.02 Acre(s)', '1.60 Acre(s)', '1.66 Acre(s)', '1.13 Acre(s)',
       '1.08 Acre(s)', '2.24 Acre(s)', '1.22 Acre(s)', '1.42 Acre(s)',
       '4.01 Acre(s)', '1.88 Acre(s)', '5 Acre(s)', '2.47 Acre(s)',
       '2.98 Acre(s)', '0.20 Acre(s)', '1.21 Acre(s)', '1.09 Acre(s)',
       '0.04 Acre(s)', '0.11 Acre(s)', '1.04 Acre(s)'], dtype=object)

# Year Built Column

In [2382]:
df.year_built[0:5]

0    Built in 1960
1    Built in 2018
2    Built in 2003
3    Built in 1969
4    Built in 2019
Name: year_built, dtype: object

In [2383]:
# Remove the 'Built in ' label; the year is kept as a string
df.year_built = (
    df.year_built
    .astype(str)
    .map(lambda built: built.replace('Built in ', ''))
)

In [2384]:
df.year_built[0:5]

0    1960
1    2018
2    2003
3    1969
4    2019
Name: year_built, dtype: object

# Sanity Check!

In [2385]:
df.head()

Unnamed: 0,address,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,full_baths,half_baths,garage_type
0,"11630 Ashcroft, Houston, TX 77035",4,2747.0,"Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2,https://photos.harstatic.com/172553809/lr/img-1.jpeg,475000,For Sale,http://www.har.com/11630-ashcroft/sale_69756841,8344.0,69756841,No Private Pool,http://www.har.com/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",http://www.har.com/stephanie-finch/agent_SFinch,Stephanie Finch,2.0,,1960,2,1,Attached/Detached
1,"18111 Ponte Vecchio, Houston, TX 77044",4,3500.0,"Single-Family Property, Traditional Style in Bridges On Lake Houston Subdivision in Atascocita South (Market Area)",3,https://photos.harstatic.com/172364270/lr/img-1.jpeg,475000,For Sale,http://www.har.com/18111-ponte-vecchio/sale_45037765,,45037765,No Private Pool,http://www.har.com/keller-williams-realty-the-woodlands/broker_KWWD01,Keller Williams Realty The Woodlands,http://www.har.com/garland-bennett/agent_GBENNETT,Garland Bennett,1.0,,2018,3,1,"Attached,Tandem"
2,"5910 Caddo Terrace Lane, Houston, TX 77041",4,3346.0,"Single-Family Property, Traditional Style in Lakes On Eldridge North Sec Subdivision in Eldridge North (Market Area)",2,https://photos.harstatic.com/172320148/lr/img-1.jpeg,475000,For Sale,http://www.har.com/5910-caddo-terrace-lane/sale_52704869,9410.0,52704869,Has Private Pool,http://www.har.com/keller-williams-realty-metropolitan/broker_KWHM01,Keller Williams Realty Metropolitan,http://www.har.com/lorna-ramsay/agent_cornhill,Lorna Ramsay,2.0,,2003,3,1,Attached
3,"14722 Cindywood Drive, Houston, TX 77079",4,2308.0,"Single-Family Property, Traditional Style in Westchester Sec 02 Subdivision in Memorial West (Market Area)",1,https://photos.harstatic.com/171900008/lr/img-1.jpeg,475000,For Sale,http://www.har.com/14722-cindywood-drive/sale_29834219,8064.0,29834219,No Private Pool,http://www.har.com/realty-associates-------------/broker_PBME01,Realty Associates,http://www.har.com/richard-gannon/agent_RGANNON,Richard Gannon,1.0,,1969,2,1,Detached
4,"3424 Timbergrove Heights, Houston, TX 77008",3,2259.0,"Single-Family Property, Contemporary/Modern,French,Traditional Style in Timbergrove Heights Subdivision in Timbergrove/Lazybrook (Market Area)",2,https://photos.harstatic.com/171615142/lr/img-1.jpeg,475000,Under Contract - Pending,http://www.har.com/3424-timbergrove-heights/sale_40531170,,40531170,No Private Pool,http://www.har.com/intown-homes/broker_INTW01,Intown Homes,http://www.har.com/emily-wang/agent_emwang,Emily Wang,3.0,,2019,3,1,Attached


# Save New CSV

In [2386]:
# Write the cleaned frame next to the raw file, prefixed with 'CLEAN_'
df.to_csv('CLEAN_{}.csv'.format(filename), index=False)

# Done!

In [2387]:
# locate rows in a column with a specific value
#df.loc[df['bedrooms'] == '1-4  Bed(s)']

In [2390]:
df.building_sqft.max()

14300.0

In [2391]:
df.loc[df.building_sqft == df.building_sqft.max()]

Unnamed: 0,address,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,full_baths,half_baths,garage_type
9461,"3637 Chimira Lane, Houston, TX 77051",3,14300.0,"Single-Family Property, Traditional Style in Scottcrest Subdivision in Medical Center South (Market Area)",0,https://photos.harstatic.com/170282976/lr/img-1.jpeg,159000,For Sale,http://www.har.com/3637-chimira-lane/sale_51453881,6600.0,51453881,No Private Pool,http://www.har.com/summit-realty/broker_INTS01,Summit Realty,http://www.har.com/cameron-namazi/agent_cnamazi,Cameron Namazi,1.0,,1950,1,,
