# Refactor Data

Our goal here is to refactor the data to sensible forms for data processing and exploration.  When we're finished, we'll output a new CSV file.

In [4110]:
import pandas as pd

In [4111]:
output_dir = 'output/'
filename = '2019-06-20 16:42:57.005476'
df = pd.read_csv(output_dir+filename+'.csv')

# Preview Raw Data

In [4112]:
pd.set_option('display.max_columns', 25)

In [4113]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,address,bathrooms,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built
0,0,"11630 Ashcroft, Houston, TX 77035",.2 Full & 1 Half Bath(s),4 Bed(s),".2,747 Building Sqft.","Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2 Garage(s) / Attached/Detached,background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);,"$ 475,000",For Sale,/11630-ashcroft/sale_69756841,"8,344 Lot Sqft.",MLS# 69756841,No Private Pool,/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",/stephanie-finch/agent_SFinch,Stephanie Finch,.2 Stories,,Built in 1960


In [4114]:
df.columns

Index(['Unnamed: 0', 'address', 'bathrooms', 'bedrooms', 'building_sqft',
       'desc', 'garages', 'img_url', 'list_price', 'list_status', 'list_url',
       'lot_sqft', 'mls_num', 'pool', 'real_comp_link', 'real_comp_name',
       'real_link', 'real_name', 'stories', 'unknown_data', 'year_built'],
      dtype='object')

In [4115]:
# Drop Index Column
df.drop(columns='Unnamed: 0', inplace=True)

# Zip Code

Let's split out the zipcode from the address because it might come in handly later...

In [4116]:
df['zipcode'] = df.address.map(lambda x: x[-5:])
df.zipcode.unique()

array(['77035', '77044', '77041', '77079', '77008', '77042', '77007',
       '77009', '77071', '77586', '77084', '77064', '77080', '77062',
       '77070', '77066', '77043', '77018', '77065', '77095', '77082',
       '77077', '77063', '77096', '77054', '77055', '77345', '77032',
       '77073', '77026', '77033', '77067', '77037', '77074', '77088',
       '77016', '77045', '77090', '77069', '77049', '77051', '77023',
       '77092', '77094', '77004', '77006', '77025', '77040', '77099',
       '77021', '77014', '77028', '77072', '77022', '77053', '77031',
       '77089', '77047', '77086', '77087', '77081', '77373', '77083',
       '77076', '77078', '77091', '77060', '77020', '77034', '77396',
       '77059', '77003', '77038', '77061', '77015', '77339', '77068',
       '77075', '77058', '77057', '77048', '77085', '77489', '77093',
       '77036', '77011', '77005', '77346', '77598', '77039', '77477',
       '77017', '77029', '77019', '77336', '77012', '77024', '77013',
       '77056', '770

Strange zipcode "-1273", let's take a look...

In [4117]:
df[df.zipcode == '-1273']

Unnamed: 0,address,bathrooms,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,zipcode
5104,"2218 Lazybrook Drive, Houston, TX 77008-1273",.2 Full Bath(s),3 Bed(s),".1,411 Building Sqft.","Single-Family Property, Ranch,Traditional Style in Lazybrook Subdivision in Timbergrove/Lazybrook (Market Area)",2 Garage(s) / Attached,background-image:url(https://photos.harstatic.com/174712810/lr/img-1.jpeg?ts=2019-06-13T13:33:30.973);,"$ 449,500",For Sale,/2218-lazybrook-drive/sale_80751978,"7,200 Lot Sqft.",MLS# 80751978,No Private Pool,/coldwell-banker-united-realtors---metropolitan/broker_COLD03,"Coldwell Banker United, Realtors - Metropolitan",/steve-louis/agent_LOUIS,Steve Louis,.1 Stories,,Built in 1957,-1273


In [4118]:
df.at[5104, 'zipcode'] = 77008

In [4119]:
# Sanity check. Is it gone?
df[df.zipcode == '-1273']

Unnamed: 0,address,bathrooms,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,zipcode


In [4120]:
# Check zipcode balance
df.zipcode.value_counts()

77084    503
77044    440
77095    393
77077    385
77008    363
77018    322
77009    311
77007    306
77096    296
77080    277
77070    241
77069    210
77083    209
77092    207
77043    206
77004    183
77035    180
77082    178
77041    178
77088    157
77066    156
77055    148
77073    142
77089    142
77025    142
77059    141
77091    132
77062    130
77064    127
77079    126
        ... 
77012    24 
77060    23 
77013    21 
77396    20 
77085    19 
77346    19 
77032    17 
77057    16 
77006    15 
77037    14 
77598    13 
77336    13 
77024    9  
77056    8  
77365    7  
77477    6  
77005    5  
77002    4  
77098    4  
77433    2  
77373    2  
77050    2  
77008    1  
77504    1  
77030    1  
77586    1  
77350    1  
77338    1  
77027    1  
77483    1  
Name: zipcode, Length: 111, dtype: int64

In [4121]:
# Grab unique zips and how many times they appear
keep_zips = df.zipcode.value_counts()
# Grab only zips that have >= 20 instances (more than 20 properties in this zip)
keep_zips = keep_zips[keep_zips.values >= 20].index
# Finalize dataframe to include only properties with zipcodes in keep_zips list
df = df[df.zipcode.isin(keep_zips)]

In [4122]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['zipcode'])
df.zipcode = df.zipcode.astype(str)

In [4123]:
# Sanity Check
df.zipcode.value_counts().values.min()

20

# Bathrooms Column

#### Split into full bathrooms and half bathrooms columns.

In [4124]:
df.bathrooms.unique()

array(['.2 Full & 1 Half Bath(s)', '.3 Full & 1 Half Bath(s)',
       '.2 Full  Bath(s)', '.1 Full  Bath(s)', '.3 Full  Bath(s)',
       '.1 Full & 1 Half Bath(s)', '.5 Full  Bath(s)',
       '.2 Full & 2 Half Bath(s)', nan, '.4 Full  Bath(s)',
       '.3 Full & 2 Half Bath(s)', '.4 Full & 2 Half Bath(s)',
       '.5 Full & 1 Half Bath(s)', '.4 Full & 1 Half Bath(s)',
       '.2 Full & 5 Half Bath(s)', '.6 Full  Bath(s)',
       '.2 Full & 3 Half Bath(s)', '.2 Full & 4 Half Bath(s)',
       '.5 Full & 2 Half Bath(s)', '.1 Full & 2 Half Bath(s)'],
      dtype=object)

In [4125]:
# Cast column to string to we can use strip operations
df.bathrooms = df.bathrooms.astype(str)
df.bathrooms = df.bathrooms.map(lambda x: x.strip('.').strip('Bath(s)'))
df.bathrooms = df.bathrooms.map(lambda x: x.replace('Half',"").replace('Full',"").replace("&",""))
df.bathrooms = df.bathrooms.map(lambda x: x.strip())

#### Fancy little function here that splits the single column into 2 columns with a max of '1' split per row.

In [4126]:
df['full_baths'], df['half_baths'] = df.bathrooms.str.split(' ', 1).str
df.half_baths = df.half_baths.str.lstrip()

In [4127]:
# Remove original bathroom column
df.drop(columns='bathrooms', inplace=True)

In [4128]:
df.full_baths.unique()

array(['2', '3', '1', '5', 'nan', '4', '6'], dtype=object)

In [4129]:
df.half_baths.unique()

array(['1', nan, '2', '5', '3', '4'], dtype=object)

In [4130]:
# Pandas won't find NaN if not float...
df.full_baths = df.full_baths.astype(float)
# Remove NaN's and Set Final Form 
df.dropna(subset=['full_baths'], inplace=True)
# Replace NaN with 0 (don't want to remove if property doesn't have a half bath)
df.half_baths.fillna(0, inplace=True)
df.full_baths = df.full_baths.astype(int)
df.half_baths = df.half_baths.astype(int)

In [4131]:
df.shape

(10835, 22)

In [4132]:
df.full_baths.unique()

array([2, 3, 1, 5, 4, 6])

In [4133]:
df.half_baths.unique()

array([1, 0, 2, 5, 3, 4])

# Bedrooms Column

In [4134]:
df.bedrooms.unique()

array(['4 Bed(s)', '3 Bed(s)', '4-5  Bed(s)', '2 Bed(s)', '3-4  Bed(s)',
       '6 Bed(s)', '2-3  Bed(s)', '5 Bed(s)', '1 Bed(s)', nan, '7 Bed(s)',
       '3-5  Bed(s)', '5-6  Bed(s)', '8 Bed(s)', '4-6  Bed(s)',
       '1-2  Bed(s)', '6-8  Bed(s)', '9 Bed(s)', '1-4  Bed(s)'],
      dtype=object)

In [4135]:
# Remove string 'Bed(s)'
df.bedrooms = df.bedrooms.astype(str)
df.bedrooms = df.bedrooms.map(lambda x: x.replace('Bed(s)', ''))
df.bedrooms = df.bedrooms.str.strip()

In [4136]:
# Helper function that takes averages of bedrooms data
# that take the format '1-4'
def avgBeds(bedrooms):
    if '-' in bedrooms:
        sum = float(bedrooms[0])+float(bedrooms[-1])
        bedrooms = sum/2.0
    return bedrooms

In [4137]:
# Take average of bedrooms with form '1-4'
df.bedrooms = df.bedrooms.apply(avgBeds);

In [4138]:
df.bedrooms[:10]

0    4  
1    4  
2    4  
3    4  
4    3  
5    3  
6    4  
7    3  
8    4.5
9    2  
Name: bedrooms, dtype: object

In [4139]:
# How many homes don't have bedrooms? Must be a float to pick up NaN's
df.bedrooms = df.bedrooms.astype(float)
df.bedrooms.isna().sum()

22

What home doesn't have a bedroom?  Let's get rid of these.

In [4140]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['bedrooms'])
df.bedrooms.astype(float);

# Building Sqft Column

In [4141]:
df.building_sqft[:5]

0    .2,747 Building Sqft.
1    .3,500 Building Sqft.
2    .3,346 Building Sqft.
3    .2,308 Building Sqft.
4    .2,259 Building Sqft.
Name: building_sqft, dtype: object

In [4142]:
df.building_sqft = df.building_sqft.astype(str)
df.building_sqft = df.building_sqft.map(lambda x: x.lstrip('.'))
df.building_sqft = df.building_sqft.map(lambda x: x.replace('Building Sqft.', ''))
df.building_sqft = df.building_sqft.map(lambda x: x.replace(',', ''))

In [4143]:
df.building_sqft[:5]

0    2747 
1    3500 
2    3346 
3    2308 
4    2259 
Name: building_sqft, dtype: object

In [4144]:
# Must be float to pick up NaN.  How many homes don't have sqft listed?
df.building_sqft = df.building_sqft.astype(float)
df.building_sqft.isna().sum()

41

Sqft is an important metric.  We'll remove all properties that do not have Sqft as a datapoint.

In [4145]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['building_sqft'])
df.building_sqft.astype(int);

# Garages Column

In [4146]:
df.garages[0:5]

0    2 Garage(s) / Attached/Detached
1    3 Garage(s) / Attached,Tandem  
2    2 Garage(s) / Attached         
3    1 Garage(s) / Detached         
4    2 Garage(s) / Attached         
Name: garages, dtype: object

Let's split at '/' and then worry about uniqueness later

In [4147]:
df['garages'], df['garage_type'] = df.garages.str.split('/', 1).str

In [4148]:
df.garages.unique()

array(['2 Garage(s) ', '3 Garage(s) ', '1 Garage(s) ', nan,
       '22 Garage(s) ', '32 Garage(s) ', '4 Garage(s) ', '60 Garage(s) ',
       '40 Garage(s) ', '12 Garage(s) ', '18 Garage(s) ', '5 Garage(s) ',
       '6 Garage(s) ', '8234 Garage(s) ', '50 Garage(s) ',
       '66 Garage(s) ', '16 Garage(s) ', '48 Garage(s) ', '43 Garage(s) ',
       '21 Garage(s) ', '36 Garage(s) '], dtype=object)

#### Looks like we've got some crazy garage numbers and 'nan'.  I checked a few, they're obviously innacurate.  How many do we have?

Let's fill the nan's before we cast to string.

In [4149]:
# Change 'nan' to 0.  Mostly to play nice with seaborn when we graph.
df.garages.fillna(0, inplace=True)

In [4150]:
df.garages = df.garages.astype(str)
df.garages = df.garages.map(lambda x: x.replace('Garage(s)', ''))
df.garages = df.garages.map(lambda x: x.rstrip().lstrip())

In [4151]:
df.garages.value_counts()

2       7747
0       1309
1       1156
3       496 
4       34  
5       4   
6       4   
60      3   
8234    2   
50      2   
21      2   
18      2   
22      2   
12      2   
32      1   
16      1   
40      1   
66      1   
43      1   
36      1   
48      1   
Name: garages, dtype: int64

Not that many crazy garages out of 11,000 homes... let's drop any multi digit garages.

In [4152]:
# Important, must create a copy here.
df2 = df[df.garages.map(lambda x: len(x)<=1)]

In [4153]:
# Swap back to original df variable so we don't get confused.
df = df2
df.garages.value_counts()

2    7747
0    1309
1    1156
3    496 
4    34  
5    4   
6    4   
Name: garages, dtype: int64

In [4154]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['garages'])
df.garages.astype(int);

In [4155]:
# Sanity Check
df.head()

Unnamed: 0,address,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,zipcode,full_baths,half_baths,garage_type
0,"11630 Ashcroft, Houston, TX 77035",4.0,2747.0,"Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2,background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);,"$ 475,000",For Sale,/11630-ashcroft/sale_69756841,"8,344 Lot Sqft.",MLS# 69756841,No Private Pool,/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",/stephanie-finch/agent_SFinch,Stephanie Finch,.2 Stories,,Built in 1960,77035,2,1,Attached/Detached
1,"18111 Ponte Vecchio, Houston, TX 77044",4.0,3500.0,"Single-Family Property, Traditional Style in Bridges On Lake Houston Subdivision in Atascocita South (Market Area)",3,background-image:url(https://photos.harstatic.com/172364270/lr/img-1.jpeg?ts=2019-04-11T19:58:44.963);,"$ 475,000",For Sale,/18111-ponte-vecchio/sale_45037765,,MLS# 45037765,No Private Pool,/keller-williams-realty-the-woodlands/broker_KWWD01,Keller Williams Realty The Woodlands,/garland-bennett/agent_GBENNETT,Garland Bennett,.1 Stories,,Built in 2018,77044,3,1,"Attached,Tandem"
2,"5910 Caddo Terrace Lane, Houston, TX 77041",4.0,3346.0,"Single-Family Property, Traditional Style in Lakes On Eldridge North Sec Subdivision in Eldridge North (Market Area)",2,background-image:url(https://photos.harstatic.com/172320148/lr/img-1.jpeg?ts=2019-04-09T23:55:57.253);,"$ 475,000",For Sale,/5910-caddo-terrace-lane/sale_52704869,"9,410 Lot Sqft.",MLS# 52704869,Has Private Pool,/keller-williams-realty-metropolitan/broker_KWHM01,Keller Williams Realty Metropolitan,/lorna-ramsay/agent_cornhill,Lorna Ramsay,.2 Stories,,Built in 2003,77041,3,1,Attached
3,"14722 Cindywood Drive, Houston, TX 77079",4.0,2308.0,"Single-Family Property, Traditional Style in Westchester Sec 02 Subdivision in Memorial West (Market Area)",1,background-image:url(https://photos.harstatic.com/171900008/lr/img-1.jpeg?ts=2019-03-29T11:36:32.677);,"$ 475,000",For Sale,/14722-cindywood-drive/sale_29834219,"8,064 Lot Sqft.",MLS# 29834219,No Private Pool,/realty-associates-------------/broker_PBME01,Realty Associates,/richard-gannon/agent_RGANNON,Richard Gannon,.1 Stories,,Built in 1969,77079,2,1,Detached
4,"3424 Timbergrove Heights, Houston, TX 77008",3.0,2259.0,"Single-Family Property, Contemporary/Modern,French,Traditional Style in Timbergrove Heights Subdivision in Timbergrove/Lazybrook (Market Area)",2,background-image:url(https://photos.harstatic.com/171615142/lr/img-1.jpeg?ts=2019-03-28T11:49:12.817);,"$ 475,000",Under Contract - Pending,/3424-timbergrove-heights/sale_40531170,,MLS# 40531170,No Private Pool,/intown-homes/broker_INTW01,Intown Homes,/emily-wang/agent_emwang,Emily Wang,.3 Stories,,Built in 2019,77008,3,1,Attached


#### The garage multi-label data might require some cleanup down the road...

In [4156]:
df2.garage_type.unique()

array([' Attached/Detached', ' Attached,Tandem', ' Attached', ' Detached',
       ' Attached,Oversized', nan, ' Attached,Detached',
       ' Attached,Oversized,Tandem', ' Attached/Detached,Oversized',
       ' Detached,Oversized', ' Attached,Attached/Detached', ' Oversized',
       ' Tandem', ' Attached/Detached,Detached,Oversized',
       ' Attached,Detached,Oversized', ' Attached/Detached,Detached',
       ' Attached/Detached,Tandem', ' Detached,Tandem',
       ' Attached,Attached/Detached,Oversized',
       ' Detached,Oversized,Tandem',
       ' Attached,Attached/Detached,Detached'], dtype=object)

# IMG URL Column

In [4157]:
df.img_url[0]

'background-image:url(https://photos.harstatic.com/172553809/lr/img-1.jpeg?ts=2019-04-17T10:23:50.137);'

In [4158]:
def cleanURL(img_url):
    if '?' in img_url:
        img_url = img_url.replace(img_url[img_url.index('?'):],'')
    return img_url

In [4159]:
df.img_url = df.img_url.astype(str)
df.img_url = df.img_url.map(lambda x: x.lstrip('background-image:url('))
df.img_url = df.img_url.apply(cleanURL)

In [4160]:
# Turn off truncating of URLs
pd.set_option('display.max_colwidth', -1)

In [4161]:
df.img_url[0:5]

0    https://photos.harstatic.com/172553809/lr/img-1.jpeg
1    https://photos.harstatic.com/172364270/lr/img-1.jpeg
2    https://photos.harstatic.com/172320148/lr/img-1.jpeg
3    https://photos.harstatic.com/171900008/lr/img-1.jpeg
4    https://photos.harstatic.com/171615142/lr/img-1.jpeg
Name: img_url, dtype: object

# List Price Column

In [4162]:
df.list_price[0:5]

0     $ 475,000   
1     $ 475,000   
2     $ 475,000   
3     $ 475,000   
4     $ 475,000   
Name: list_price, dtype: object

In [4163]:
df.list_price = df.list_price.map(lambda x: x.replace('$','').replace(',',''))
df.list_price = df.list_price.astype(int)

In [4164]:
df.list_price[0:5]

0    475000
1    475000
2    475000
3    475000
4    475000
Name: list_price, dtype: int64

In [4165]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['list_price'])
df.list_price.astype(int);

# List Status Column

In [4166]:
df.list_status.unique()

array(['For Sale', 'Under Contract - Pending',
       'Under Contract - Option Pending',
       'Under Contract - Pending Continue to Show'], dtype=object)

# List URL Column

In [4167]:
df.list_url[0:5]

0    /11630-ashcroft/sale_69756841          
1    /18111-ponte-vecchio/sale_45037765     
2    /5910-caddo-terrace-lane/sale_52704869 
3    /14722-cindywood-drive/sale_29834219   
4    /3424-timbergrove-heights/sale_40531170
Name: list_url, dtype: object

In [4168]:
df.list_url = df.list_url.map(lambda x: 'http://www.har.com'+x)

In [4169]:
df.list_url[0:5]

0    http://www.har.com/11630-ashcroft/sale_69756841          
1    http://www.har.com/18111-ponte-vecchio/sale_45037765     
2    http://www.har.com/5910-caddo-terrace-lane/sale_52704869 
3    http://www.har.com/14722-cindywood-drive/sale_29834219   
4    http://www.har.com/3424-timbergrove-heights/sale_40531170
Name: list_url, dtype: object

# Lot Sqft Column

In [4170]:
df.lot_sqft[0:5]

0    8,344 Lot Sqft.
1    NaN            
2    9,410 Lot Sqft.
3    8,064 Lot Sqft.
4    NaN            
Name: lot_sqft, dtype: object

In [4171]:
df.lot_sqft = df.lot_sqft.astype(str)
df.lot_sqft = df.lot_sqft.map(lambda x: x.replace('Lot Sqft.', ''))
df.lot_sqft = df.lot_sqft.map(lambda x: x.replace(',', ''))
df.lot_sqft = df.lot_sqft.astype(float)

In [4172]:
df.lot_sqft[0:5]

0    8344.0
1   NaN    
2    9410.0
3    8064.0
4   NaN    
Name: lot_sqft, dtype: float64

In [4173]:
# Must be float to pick up NaN.  How many homes don't have sqft listed?
df.lot_sqft = df.lot_sqft.astype(float)
df.lot_sqft.isna().sum()

568

There are quite a few homes that don't list lot sqft.  We'll have to decide what to do with this later...

# MLS Number Column

In [4174]:
df.mls_num[0:5]

0    MLS# 69756841
1    MLS# 45037765
2    MLS# 52704869
3    MLS# 29834219
4    MLS# 40531170
Name: mls_num, dtype: object

In [4175]:
df.mls_num = df.mls_num.map(lambda x: x.replace('MLS# ', ''))

In [4176]:
df.mls_num[0:5]

0    69756841
1    45037765
2    52704869
3    29834219
4    40531170
Name: mls_num, dtype: object

# Pool Column

In [4177]:
df.pool.unique()

array(['No Private Pool ', ' Has Private Pool'], dtype=object)

In [4178]:
# Remove leading white space from ' Has Private Pool' category
df.pool = df.pool.str.lstrip()

In [4179]:
df.pool[0:5]

0    No Private Pool 
1    No Private Pool 
2    Has Private Pool
3    No Private Pool 
4    No Private Pool 
Name: pool, dtype: object

# Realtor Company Link Column

In [4180]:
df.real_comp_link[0:5]

0    /compass-re-texas-llc/broker_CMTX01                
1    /keller-williams-realty-the-woodlands/broker_KWWD01
2    /keller-williams-realty-metropolitan/broker_KWHM01 
3    /realty-associates-------------/broker_PBME01      
4    /intown-homes/broker_INTW01                        
Name: real_comp_link, dtype: object

In [4181]:
df.real_comp_link = df.real_comp_link.map(lambda x: 'http://www.har.com'+x)

In [4182]:
df.real_comp_link[0:5]

0    http://www.har.com/compass-re-texas-llc/broker_CMTX01                
1    http://www.har.com/keller-williams-realty-the-woodlands/broker_KWWD01
2    http://www.har.com/keller-williams-realty-metropolitan/broker_KWHM01 
3    http://www.har.com/realty-associates-------------/broker_PBME01      
4    http://www.har.com/intown-homes/broker_INTW01                        
Name: real_comp_link, dtype: object

# Realtor Company Name Column

In [4183]:
df.real_comp_name[0:5]

0    Compass RE Texas, LLC               
1    Keller Williams Realty The Woodlands
2    Keller Williams Realty Metropolitan 
3    Realty Associates                   
4    Intown Homes                        
Name: real_comp_name, dtype: object

# Realtor Link

In [4184]:
df.real_link[0:5]

0    /stephanie-finch/agent_SFinch  
1    /garland-bennett/agent_GBENNETT
2    /lorna-ramsay/agent_cornhill   
3    /richard-gannon/agent_RGANNON  
4    /emily-wang/agent_emwang       
Name: real_link, dtype: object

In [4185]:
df.real_link = df.real_link.map(lambda x: 'http://www.har.com'+x)

In [4186]:
df.real_link[0:5]

0    http://www.har.com/stephanie-finch/agent_SFinch  
1    http://www.har.com/garland-bennett/agent_GBENNETT
2    http://www.har.com/lorna-ramsay/agent_cornhill   
3    http://www.har.com/richard-gannon/agent_RGANNON  
4    http://www.har.com/emily-wang/agent_emwang       
Name: real_link, dtype: object

# Realtor Name

In [4187]:
df.real_name[0:5]

0    Stephanie Finch
1    Garland Bennett
2    Lorna Ramsay   
3    Richard Gannon 
4    Emily Wang     
Name: real_name, dtype: object

# Stories Column

In [4188]:
df.stories.unique()

array(['.2 Stories', '.1 Stories', '.3 Stories', '.4 Stories',
       '.1.5 Stories', '.2.5 Stories'], dtype=object)

In [4189]:
# Cast column to string to we can use strip operations
df.stories = df.stories.astype(str)
df.stories = df.stories.map(lambda x: x.lstrip('.').rstrip('Stories'))
# Cast column back to float
df.stories = df.stories.astype(float)

In [4190]:
df.stories[:5]

0    2.0
1    1.0
2    2.0
3    1.0
4    3.0
Name: stories, dtype: float64

In [4191]:
df.stories.unique()

array([2. , 1. , 3. , 4. , 1.5, 2.5])

# Unknown Column

Might decide to return and use acres...

In [4192]:
df.unknown_data.unique()

array([nan, '1.92 Acre(s)', '0.43 Acre(s)', '2.71 Acre(s)',
       '1.64 Acre(s)', '0.55 Acre(s)', '0.96 Acre(s)', '0.61 Acre(s)',
       '2 Acre(s)', '1.01 Acre(s)', '1.06 Acre(s)', '4.81 Acre(s)',
       '0.69 Acre(s)', '1.37 Acre(s)', '0.18 Acre(s)', '2.13 Acre(s)',
       '0.16 Acre(s)', '0.56 Acre(s)', '0.07 Acre(s)', '0.13 Acre(s)',
       '0.30 Acre(s)', '1.30 Acre(s)', '1 Acre(s)', '1.26 Acre(s)',
       '1.10 Acre(s)', '2.83 Acre(s)', '1.02 Acre(s)', '1.60 Acre(s)',
       '1.66 Acre(s)', '2.24 Acre(s)', '1.22 Acre(s)', '1.42 Acre(s)',
       '1.88 Acre(s)', '5 Acre(s)', '2.47 Acre(s)', '1.08 Acre(s)',
       '2.98 Acre(s)', '1.21 Acre(s)', '1.09 Acre(s)', '0.04 Acre(s)',
       '0.11 Acre(s)', '1.04 Acre(s)'], dtype=object)

# Year Built Column

In [4193]:
df.year_built[0:5]

0    Built in 1960
1    Built in 2018
2    Built in 2003
3    Built in 1969
4    Built in 2019
Name: year_built, dtype: object

In [4194]:
df.year_built = df.year_built.astype(str)
df.year_built = df.year_built.map(lambda x: x.replace('Built in ', ''))

In [4195]:
df.year_built[0:5]

0    1960
1    2018
2    2003
3    1969
4    2019
Name: year_built, dtype: object

In [4196]:
# Must be float to pick up NaN.  How many homes don't have sqft listed?
df.year_built = df.year_built.astype(float)
df.year_built.isna().sum()

77

Not too many homes that don't have year built data.  Let's get rid of these...

In [4197]:
# Remove NaN's and Set Final Form
df.dropna(inplace=True, subset=['year_built'])
df.year_built.astype(int);

# Sanity Check!

In [4198]:
print(df.shape)
df.head()

(10673, 23)


Unnamed: 0,address,bedrooms,building_sqft,desc,garages,img_url,list_price,list_status,list_url,lot_sqft,mls_num,pool,real_comp_link,real_comp_name,real_link,real_name,stories,unknown_data,year_built,zipcode,full_baths,half_baths,garage_type
0,"11630 Ashcroft, Houston, TX 77035",4.0,2747.0,"Single-Family Property, Traditional Style in Westbury Subdivision in Brays Oaks (Market Area)",2,https://photos.harstatic.com/172553809/lr/img-1.jpeg,475000,For Sale,http://www.har.com/11630-ashcroft/sale_69756841,8344.0,69756841,No Private Pool,http://www.har.com/compass-re-texas-llc/broker_CMTX01,"Compass RE Texas, LLC",http://www.har.com/stephanie-finch/agent_SFinch,Stephanie Finch,2.0,,1960.0,77035,2,1,Attached/Detached
1,"18111 Ponte Vecchio, Houston, TX 77044",4.0,3500.0,"Single-Family Property, Traditional Style in Bridges On Lake Houston Subdivision in Atascocita South (Market Area)",3,https://photos.harstatic.com/172364270/lr/img-1.jpeg,475000,For Sale,http://www.har.com/18111-ponte-vecchio/sale_45037765,,45037765,No Private Pool,http://www.har.com/keller-williams-realty-the-woodlands/broker_KWWD01,Keller Williams Realty The Woodlands,http://www.har.com/garland-bennett/agent_GBENNETT,Garland Bennett,1.0,,2018.0,77044,3,1,"Attached,Tandem"
2,"5910 Caddo Terrace Lane, Houston, TX 77041",4.0,3346.0,"Single-Family Property, Traditional Style in Lakes On Eldridge North Sec Subdivision in Eldridge North (Market Area)",2,https://photos.harstatic.com/172320148/lr/img-1.jpeg,475000,For Sale,http://www.har.com/5910-caddo-terrace-lane/sale_52704869,9410.0,52704869,Has Private Pool,http://www.har.com/keller-williams-realty-metropolitan/broker_KWHM01,Keller Williams Realty Metropolitan,http://www.har.com/lorna-ramsay/agent_cornhill,Lorna Ramsay,2.0,,2003.0,77041,3,1,Attached
3,"14722 Cindywood Drive, Houston, TX 77079",4.0,2308.0,"Single-Family Property, Traditional Style in Westchester Sec 02 Subdivision in Memorial West (Market Area)",1,https://photos.harstatic.com/171900008/lr/img-1.jpeg,475000,For Sale,http://www.har.com/14722-cindywood-drive/sale_29834219,8064.0,29834219,No Private Pool,http://www.har.com/realty-associates-------------/broker_PBME01,Realty Associates,http://www.har.com/richard-gannon/agent_RGANNON,Richard Gannon,1.0,,1969.0,77079,2,1,Detached
4,"3424 Timbergrove Heights, Houston, TX 77008",3.0,2259.0,"Single-Family Property, Contemporary/Modern,French,Traditional Style in Timbergrove Heights Subdivision in Timbergrove/Lazybrook (Market Area)",2,https://photos.harstatic.com/171615142/lr/img-1.jpeg,475000,Under Contract - Pending,http://www.har.com/3424-timbergrove-heights/sale_40531170,,40531170,No Private Pool,http://www.har.com/intown-homes/broker_INTW01,Intown Homes,http://www.har.com/emily-wang/agent_emwang,Emily Wang,3.0,,2019.0,77008,3,1,Attached


# Save New CSV

In [4199]:
df.to_csv(output_dir+'CLEAN_'+filename+'.csv', index=False)

# Done!