# Scraping Apartment Listings for Apartment hunting

This is an attempt for me to learn how to web scrape, clean my own data, and generate a predictive model of apartments that selects apartments that I would like based on price, size, etc.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Initial analysis was done on the df variable, but I intend to apply cleanup process on other 3

In [2]:
df = pd.read_csv("Output.csv")

sd = pd.read_csv("San_Diego.csv")
oc = pd.read_csv("Orange_County.csv")
la = pd.read_csv("Los_Angeles.csv")

In [3]:
df.columns

Index(['Unnamed: 0', 'Address', 'Beds', 'Phone', 'Price', 'Title'], dtype='object')

Very minimal dataset, but as I improve my cleanup and analysis skills, I will scrape more content and perform more relevant transformations.

Proof of concept to start.

In [2]:
# set variable price to dataframe column 'Price', which is just a pandas series
price = df["Price"]
price

NameError: name 'df' is not defined

In [5]:
type(df["Price"][0])

str

In [6]:
# Remove the $ symbol
df["Price"] = df["Price"].str[1:]
price

0       2,152 - 6,912
1       1,910 - 3,757
2       2,075 - 4,425
3       2,149 - 5,205
4       1,744 - 3,249
5       1,463 - 2,503
6       2,425 - 4,350
7       1,985 - 3,760
8       1,875 - 4,430
9       1,793 - 4,972
10      2,250 - 4,500
11     1,950 - 12,680
12      2,340 - 4,495
13      1,895 - 4,255
14      2,011 - 4,385
15      2,870 - 4,815
16      2,300 - 6,070
17      1,975 - 6,770
18      1,835 - 6,135
19      2,040 - 3,480
20     2,448 - 12,793
21      2,644 - 6,892
22      1,913 - 4,966
23      2,039 - 3,472
24     2,051 - 20,000
25      2,010 - 6,180
26      1,870 - 3,445
27      1,925 - 5,145
28     1,700 - 10,000
29      1,480 - 3,074
            ...      
224     1,643 - 2,606
225     1,784 - 2,874
226     1,873 - 2,889
227     2,220 - 4,855
228     1,953 - 2,043
229     2,055 - 2,505
230     1,640 - 2,085
231     1,762 - 2,556
232     1,618 - 2,520
233    4,345 - 11,302
234     2,695 - 3,325
235     1,323 - 2,104
236     2,395 - 3,895
237     1,739 - 2,345
238     1,

In [7]:
# Remove the , 
price = price.str.replace(',','')
price

0       2152 - 6912
1       1910 - 3757
2       2075 - 4425
3       2149 - 5205
4       1744 - 3249
5       1463 - 2503
6       2425 - 4350
7       1985 - 3760
8       1875 - 4430
9       1793 - 4972
10      2250 - 4500
11     1950 - 12680
12      2340 - 4495
13      1895 - 4255
14      2011 - 4385
15      2870 - 4815
16      2300 - 6070
17      1975 - 6770
18      1835 - 6135
19      2040 - 3480
20     2448 - 12793
21      2644 - 6892
22      1913 - 4966
23      2039 - 3472
24     2051 - 20000
25      2010 - 6180
26      1870 - 3445
27      1925 - 5145
28     1700 - 10000
29      1480 - 3074
           ...     
224     1643 - 2606
225     1784 - 2874
226     1873 - 2889
227     2220 - 4855
228     1953 - 2043
229     2055 - 2505
230     1640 - 2085
231     1762 - 2556
232     1618 - 2520
233    4345 - 11302
234     2695 - 3325
235     1323 - 2104
236     2395 - 3895
237     1739 - 2345
238     1550 - 2350
239     3490 - 4950
240     1545 - 1860
241     1635 - 2500
242     2050 - 2150


In [8]:
# split the two values into separate columns...a high and low
price = price.str.split("-", expand=True)
price

Unnamed: 0,0,1
0,2152,6912
1,1910,3757
2,2075,4425
3,2149,5205
4,1744,3249
5,1463,2503
6,2425,4350
7,1985,3760
8,1875,4430
9,1793,4972


In [9]:
type(price[1][0])

str

In [10]:
# rename the columns
price.rename(index=str, columns={0: 'min_price', 1: 'max_price'}, inplace=True)

In [11]:
# notice that the individual string has whitespace...let's remove
price['min_price'][0]

'2152 '

In [12]:
# use the aforementioned function on the dataframe
price_trimmed = price.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [13]:
price_trimmed['min_price'][0]

'2152'

In [14]:
price_trimmed

Unnamed: 0,min_price,max_price
0,2152,6912
1,1910,3757
2,2075,4425
3,2149,5205
4,1744,3249
5,1463,2503
6,2425,4350
7,1985,3760
8,1875,4430
9,1793,4972


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254 entries, 0 to 253
Data columns (total 6 columns):
Unnamed: 0    254 non-null int64
Address       254 non-null object
Beds          254 non-null object
Phone         254 non-null object
Price         254 non-null object
Title         254 non-null object
dtypes: int64(1), object(5)
memory usage: 12.0+ KB


In [24]:
price_trimmed

Unnamed: 0,min_price,max_price
0,2152,6912
1,1910,3757
2,2075,4425
3,2149,5205
4,1744,3249
5,1463,2503
6,2425,4350
7,1985,3760
8,1875,4430
9,1793,4972


In [26]:
len(df)

254

In [27]:
len(price_trimmed)

254

In [29]:
df.index

RangeIndex(start=0, stop=254, step=1)

In [33]:
price_trimmed.index

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '244', '245', '246', '247', '248', '249', '250', '251', '252', '253'],
      dtype='object', length=254)

In [34]:
test_df = df.reset_index()

In [36]:
test_df.index

RangeIndex(start=0, stop=254, step=1)

In [37]:
test_price = price_trimmed.reset_index()

In [39]:
test_price.index

RangeIndex(start=0, stop=254, step=1)

In [45]:
df1 = pd.concat([test_df, test_price], axis=1)

In [56]:
df1 = df1.drop(columns=['index'], axis=1)

In [59]:
df1 = df1.drop(columns=['Unnamed: 0'])

In [62]:
df1.head()

Unnamed: 0,Address,Beds,Phone,Price,Title,min_price,max_price
0,"5305-5305 Toscana Way, San Diego, CA 92122",1-3 Bed,844-360-6531,"2,152 - 6,912",Valentia,2152,6912
1,"8725 Ariva Ct, San Diego, CA 92123",Studio - 3 Bed,844-781-5203,"1,910 - 3,757",Vive on the Park,1910,3757
2,"4223 Texas St, San Diego, CA 92104",Studio - 2 Bed,949-245-6587,"2,075 - 4,425",Broadstone North Park,2075,4425
3,"1601 Broadway, San Diego, CA 92101",Studio - 2 Bed,619-817-8362,"2,149 - 5,205",Broadstone Maker's Quarter,2149,5205
4,"1100 Dennery Rd, San Diego, CA 92154",1-3 Bed,866-239-7071,"1,744 - 3,249",Casoleil,1744,3249


In [63]:
# sort by min values
df_sort = df1.sort_values(by=['min_price'])

In [66]:
# reset index
df_sort = df_sort.reset_index()

In [68]:
df_sort['min_price'].astype(float)

ValueError: could not convert string to float: 'all for Rent'