In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
# Load the merged rental CSV; the path is relative to the notebook's working directory.
rental_csv_path = '../final_rental_merged.csv'
df = pd.read_csv(rental_csv_path)

In [56]:
# Display the column index of the loaded frame to see which fields are available.
df.columns

Index(['Unnamed: 0', 'listedAt', 'daysOnMarket', 'availableFrom', 'street',
       'price', 'borough', 'neighborhood', 'zip', 'propertyType', 'beds',
       'baths', 'latitude', 'longitude', 'amenities', 'builtIn', 'description',
       'hasVideo', 'PhotosNum', 'state', 'isUndisclosedAddress',
       'isFeaturedListing', 'duplicate', 'Total Population', 'Median Age',
       'Male Population', 'Female Population', 'White Alone',
       'Black or African American Alone', 'Asian Alone', 'Hispanic or Latino',
       'Number of Households', 'Median Household Income', 'Per Capita Income',
       'Population Below Poverty Level', 'Employed', 'Unemployed',
       'Total Income Distribution', 'Median Gross Rent', 'Median Home Value',
       'Occupied Housing Units', 'Vacant Housing Units',
       'Owner-Occupied Units (value < $100,000)', 'Monthly Housing Costs',
       'High School Graduate (Age 25+)', 'Bachelor’s Degree (Age 25+)',
       'Graduate or Professional Degree (Age 25+)', 'English 

In [57]:
# Drop the CSV's saved index, which read_csv loaded as an ordinary 'Unnamed: 0' column.
# errors='ignore' keeps the cell re-runnable: a second run finds the column already
# gone and does nothing instead of raising KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [58]:
# Report how many rows the frame holds.
print("Number of rows:", df.shape[0])

Number of rows: 7842


## Analysing and cleaning columns

In [59]:
# Summarise every column: print its number of distinct non-null values and,
# for low-cardinality columns (fewer than 50), the distinct values themselves.
for col_name in df.columns:
    col_values = df[col_name]
    distinct_count = col_values.nunique()  # NaN is not counted, same as value_counts()
    print("-------------------------------------------")
    print(f"column : {col_name}")
    print(f"Number of Unique values: {distinct_count}")
    if distinct_count < 50:
        # unique() keeps NaN, so the printed list can be one longer than the count above
        print(f"values list: {list(col_values.unique())}")

-------------------------------------------
column : listedAt
Number of Unique values: 177
-------------------------------------------
column : daysOnMarket
Number of Unique values: 204
-------------------------------------------
column : availableFrom
Number of Unique values: 309
-------------------------------------------
column : street
Number of Unique values: 7842
-------------------------------------------
column : price
Number of Unique values: 1378
-------------------------------------------
column : borough
Number of Unique values: 5
values list: ['Brooklyn', 'Manhattan', 'Queens', 'Bronx', 'Staten Island']
-------------------------------------------
column : neighborhood
Number of Unique values: 163
-------------------------------------------
column : zip
Number of Unique values: 166
-------------------------------------------
column : propertyType
Number of Unique values: 9
values list: ['rental', 'coop', 'house', 'condo', 'townhouse', 'Apartment', 'Townhouse', 'House', 'Con

In [60]:
# daysOnMarket: missing-value count followed by summary statistics
days_on_market = df['daysOnMarket']
print("Number of null values:", days_on_market.isna().sum())

print("\nStatistics for daysOnMarket:")
print(days_on_market.describe())

Number of null values: 0

Statistics for daysOnMarket:
count    7842.000000
mean       18.371716
std        35.188656
min        -1.000000
25%         2.000000
50%         9.000000
75%        22.000000
max       958.000000
Name: daysOnMarket, dtype: float64


In [61]:
# Isolate the rows whose daysOnMarket is negative and preview them.
# Observation from the preview: daysOnMarket appears to be -1 where listedAt is NaN.
is_negative_dom = df['daysOnMarket'] < 0
negative_days_on_market_df = df[is_negative_dom]
print("rows with negative daysOnMarket:", len(negative_days_on_market_df))
negative_days_on_market_df.head()

rows with negative daysOnMarket: 1239


Unnamed: 0,listedAt,daysOnMarket,availableFrom,street,price,borough,neighborhood,zip,propertyType,beds,...,Graduate or Professional Degree (Age 25+),English Only,Spanish,Never Married,Currently Married,Divorced,Precinct,schools_in_precinct,nearby_subway_stations,crime_rate
2798,,-1,2024-11-18 00:00:00,511 Van Buren St #3R,"$2,400/mo",Brooklyn,,11221,Apartment,1,...,458.0,,,21906.0,11845.0,2080.0,81,33,3,1.214623
2799,,-1,,607 Gates Ave #1E,"$2,700/mo",Brooklyn,,11221,Apartment,2,...,458.0,,,21906.0,11845.0,2080.0,79,46,0,1.591764
2801,,-1,,460 Throop Ave APT 10,"$2,400/mo",Brooklyn,,11221,Apartment,1,...,458.0,,,21906.0,11845.0,2080.0,79,46,0,1.591764
2803,,-1,2024-11-20 00:00:00,509 Van Buren St #5R,"$2,500/mo",Brooklyn,,11221,Apartment,1,...,458.0,,,21906.0,11845.0,2080.0,81,33,3,1.214623
2817,,-1,2024-12-01 00:00:00,499 Evergreen Ave APT 3B,"$3,050/mo",Brooklyn,,11221,Apartment,1,...,458.0,,,21906.0,11845.0,2080.0,83,37,0,1.37311


In [62]:
# Replace negative daysOnMarket values with NaN so they are treated as missing.
# Series.mask builds a new Series (upcast to float when anything is masked), which
# avoids writing NaN into an integer column through .loc - an upcasting setitem that
# pandas >= 2.1 deprecates.
df['daysOnMarket'] = df['daysOnMarket'].mask(df['daysOnMarket'] < 0)

# Sanity check: no negatives should remain, and the NaN count should equal the rows replaced.
print("Number of negative values:", len(df[df['daysOnMarket'] < 0]))
print("Number of NaN values:", df['daysOnMarket'].isnull().sum())

Number of negative values: 0
Number of NaN values: 1239


In [63]:
# price: missing-value count, then the frequency of each distinct value
price_column = df['price']
print("Number of null values:", price_column.isna().sum())
print(price_column.value_counts())

Number of null values: 0
price
$3,000/mo    182
$3,500/mo    155
$2,800/mo    127
$3,200/mo    125
$2,600/mo     97
            ... 
3296           1
3346           1
3126           1
4159           1
$1,625/mo      1
Name: count, Length: 1378, dtype: int64


In [64]:
# Strip the '$' sign, thousands separators and the literal '/mo' suffix, then convert to float.
# The pattern is an alternation rather than a character class, so only the exact '/mo'
# token is removed instead of every stray 'm', 'o' or '/' character.
# astype(str) makes the .str accessor valid even if the column is already numeric
# (e.g. when this cell is re-run); a malformed value still raises in astype(float).
df['price'] = (
    df['price']
    .astype(str)
    .str.replace(r'\$|,|/mo', '', regex=True)
    .astype(float)
)

# Verify the cleaning
print("After cleaning:")
print("Number of null values:", df['price'].isnull().sum())
print("\nBasic statistics:")
print(df['price'].describe())

After cleaning:
Number of null values: 0

Basic statistics:
count      7842.000000
mean       4638.869421
std        5976.426868
min        1199.000000
25%        2625.000000
50%        3250.000000
75%        4295.000000
max      125000.000000
Name: price, dtype: float64


In [65]:
# zip: missing-value count, then the numeric range
zip_codes = df['zip']
print("Number of null values:", zip_codes.isna().sum())  # 0 in the recorded output
# The minimum is positive, so there are no negative zips.
# NOTE(review): the maximum of 12345 does not look like an NYC zip code - confirm.
print(zip_codes.describe())


Number of null values: 0
count     7842.000000
mean     10651.059551
std        577.309660
min      10001.000000
25%      10025.000000
50%      10467.000000
75%      11221.000000
max      12345.000000
Name: zip, dtype: float64


In [66]:
# propertyType: missing-value count and category frequencies
property_type = df['propertyType']
print("Number of null values:", property_type.isna().sum())  # 0 in the recorded output
# The same categories appear in different letter cases, so they need lowercasing.
print(property_type.value_counts())

Number of null values: 0
propertyType
Apartment    4713
rental       1973
condo         507
coop          236
House         201
Townhouse     116
house          59
Condo          22
townhouse      15
Name: count, dtype: int64


In [67]:
# Lowercase propertyType so case variants (e.g. 'House' / 'house') collapse into one category.
# The vectorised .str accessor leaves missing values as NaN, whereas calling x.lower()
# through apply would raise AttributeError on a NaN entry.
df['propertyType'] = df['propertyType'].str.lower()
print(df['propertyType'].value_counts()) # done

propertyType
apartment    4713
rental       1973
condo         529
house         260
coop          236
townhouse     131
Name: count, dtype: int64


In [68]:
# beds: missing-value count and the distribution of bedroom counts
beds_column = df['beds']
print("Number of null values:", beds_column.isna().sum())  # 0 in the recorded output
# A value of 0 beds possibly denotes a studio - not verified here.
print(beds_column.value_counts())

Number of null values: 0
beds
1     2401
2     2356
3     1665
0      934
4      390
5       70
6       20
8        4
10       1
7        1
Name: count, dtype: int64


In [69]:
# latitude: missing values, non-positive values, and value frequencies
latitude = df['latitude']
print("Number of null values:", latitude.isna().sum())
# The test is <= 0, so the count labelled "negative values" would also include zeros.
print("negative values:", (latitude <= 0).sum())
print(latitude.value_counts())


Number of null values: 0
negative values: 0
latitude
40.710201    6
40.746300    6
40.761299    5
40.707901    5
40.750198    5
            ..
40.701070    1
40.699270    1
40.685238    1
40.695530    1
40.709740    1
Name: count, Length: 7360, dtype: int64


In [70]:
# longitude: missing values, non-positive values, and value frequencies
longitude = df['longitude']
print("Number of null values:", longitude.isna().sum())
# Every row is expected to be counted here: NYC lies west of Greenwich, so its longitudes are negative.
print("negative values:", (longitude <= 0).sum())
print(longitude.value_counts())

Number of null values: 0
negative values: 7842
longitude
-73.992302    7
-73.989304    6
-73.984200    6
-73.972702    6
-73.966904    5
             ..
-73.890145    1
-73.877762    1
-73.876597    1
-73.894056    1
-74.006620    1
Name: count, Length: 7002, dtype: int64


In [71]:
# amenities: missing-value count and the frequency of each amenity list
amenities = df['amenities']
# More than half of the rows are null in the recorded output - candidate for dropping.
print("Number of null values:", amenities.isna().sum())
print(amenities.value_counts())

Number of null values: 5052
amenities
['fios_available']                                                                                                                                                                                                                                                                                                                       118
['fios_available', 'hardwood_floors']                                                                                                                                                                                                                                                                                                     54
['fios_available', 'hardwood_floors', 'pets']                                                                                                                                                                                                                                                           

In [72]:
# builtIn: missing-value count and number of non-positive entries
built_in = df['builtIn']
# More than half of the rows are null in the recorded output - candidate for dropping.
print("Number of null values:", built_in.isna().sum())
# The test is <= 0, so the count labelled "negative values" covers zeros as well as negatives;
# either way such entries are another reason to drop the column.
print("negative values:", (built_in <= 0).sum())

Number of null values: 5052
negative values: 77


In [73]:
# hasVideo: missing-value count and the raw value frequencies
has_video = df['hasVideo']
print("Number of null values:", has_video.isna().sum())  # 0 in the recorded output
# The values mix 'False', '1' and 'True', so the column needs normalising.
print(has_video.value_counts())

Number of null values: 0
hasVideo
False    5049
1        2790
True        3
Name: count, dtype: int64


In [77]:
# Encode hasVideo as 0/1. The raw column mixes 'False', '1' and 'True', so each value is
# normalised through its lowercase string form before matching. Unlike comparing against
# the string literals 'True'/'1' directly, this also recognises real booleans and integers,
# so re-running the cell on its own 0/1 output no longer turns every 1 into 0.
# Anything not recognised as truthy (including NaN) maps to 0.
truthy_tokens = {'true', '1', '1.0'}
df['hasVideo'] = (
    df['hasVideo']
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(truthy_tokens)
    .astype(int)
)
print(df['hasVideo'].value_counts()) # done

hasVideo
0    5049
1    2793
Name: count, dtype: int64


In [79]:
# PhotosNum: missing values, non-positive values, and value frequencies
photos_num = df['PhotosNum']
print("Number of null values:", photos_num.isna().sum())  # 0 in the recorded output
# The test is <= 0; in the recorded output the count equals the frequency of the value 0,
# i.e. the rows flagged as "negative" are zeros rather than true negatives.
print("negative values:", (photos_num <= 0).sum())
print(photos_num.value_counts())

Number of null values: 0
negative values: 3793
PhotosNum
0      3793
480     304
560     285
640     250
720     242
       ... 
35        1
51        1
73        1
37        1
31        1
Name: count, Length: 63, dtype: int64


In [81]:
# state: missing-value count and value frequencies
state_column = df['state']
# The recorded output shows 2790 nulls, so this column is NOT free of missing values.
print("Number of null values:", state_column.isna().sum())
# Per the author's note, all listings come from NYC; every non-null value is 'NY',
# so the column carries no information and can simply be dropped.
print(state_column.value_counts())

Number of null values: 2790
state
NY    5052
Name: count, dtype: int64


In [83]:
# isUndisclosedAddress: missing-value count and value frequencies
undisclosed_address = df['isUndisclosedAddress']
# The recorded output shows 6552 nulls - most of the column is missing.
print("Number of null values:", undisclosed_address.isna().sum())
# The only non-null value is False; with this many nulls the column is a drop candidate.
print(undisclosed_address.value_counts())

Number of null values: 6552
isUndisclosedAddress
False    1290
Name: count, dtype: int64


In [85]:
# isFeaturedListing: missing-value count and value frequencies
featured_listing = df['isFeaturedListing']
print("Number of null values:", featured_listing.isna().sum())
print(featured_listing.value_counts())  # drop candidate

Number of null values: 2790
isFeaturedListing
True     3729
False    1323
Name: count, dtype: int64


In [87]:
# duplicate: missing-value count and value frequencies
duplicate_flag = df['duplicate']
print("Number of null values:", duplicate_flag.isna().sum())
# Every row is False in the recorded output, so the column is constant - drop candidate.
print(duplicate_flag.value_counts())

Number of null values: 0
duplicate
False    7842
Name: count, dtype: int64


In [88]:
# Total Population: missing-value count and value frequencies
total_population = df['Total Population']
print("Number of null values:", total_population.isna().sum())
print(total_population.value_counts())

Number of null values: 870
Total Population
54369.0    207
53877.0    181
91236.0    176
58418.0    158
65511.0    147
          ... 
42792.0      2
43517.0      2
0.0          1
7365.0       1
47230.0      1
Name: count, Length: 131, dtype: int64


In [91]:
# Which zip codes do the rows lacking 'Total Population' fall in, and how many rows each?
population_missing = df['Total Population'].isna()
df.loc[population_missing, 'zip'].value_counts()

zip
10128    106
11101     87
11102     84
11435     79
11105     74
11106     72
11103     62
11432     59
10065     51
11249     43
10075     32
11104     29
11434     18
11433     11
10282      7
10280      7
11421      5
11422      4
11429      4
11412      4
11436      4
10044      4
11426      3
11420      3
11423      3
11109      2
11413      2
11427      2
11419      2
11415      2
10069      1
11418      1
12345      1
11416      1
11411      1
Name: count, dtype: int64