In [1]:
# Dependencies and Setup
import pandas as pd
import os

In [2]:
# File to Load (Remember to change the path if needed.)
cities = os.path.join("Resources", "Cities_Along_Raritan.csv")

# Read the School Data and Student Data and store into a Pandas DataFrame
cities_df = pd.read_csv(cities)
cities_df.head()

Unnamed: 0,zipcode,city,county,flood_zone,Latitude,Longitude
0,8901,New Brunswick,Middlesex,A,40.48603,-74.431
1,8904,Highland park,Middlesex,AE,40.49333,-74.4338
2,8876,Somerville,Somerset,AE,40.55752,-74.6154
3,8805,Bound Brook,Somerset,X,40.56078,-74.5339
4,8807,Bridgewater,Somerset,0,40.54642,-74.6706


In [3]:
cities_df.dtypes

zipcode         int64
city           object
county         object
flood_zone     object
Latitude      float64
Longitude     float64
dtype: object

In [4]:
# Clean cities file Convert zipcode to string
cities_df['zipcode'] = cities_df['zipcode'].apply(str)

# Fix zipcodes in cities file to left pad with 0
cities_df['zipcode']=cities_df.zipcode.str.pad(5,side='left',fillchar='0')

In [5]:
cities_df.head()

Unnamed: 0,zipcode,city,county,flood_zone,Latitude,Longitude
0,8901,New Brunswick,Middlesex,A,40.48603,-74.431
1,8904,Highland park,Middlesex,AE,40.49333,-74.4338
2,8876,Somerville,Somerset,AE,40.55752,-74.6154
3,8805,Bound Brook,Somerset,X,40.56078,-74.5339
4,8807,Bridgewater,Somerset,0,40.54642,-74.6706


In [6]:
# Read the Zillow Data for Real Estate List Price History

homeprice = os.path.join("Resources", "ZipcodePricealltypeshouse.csv")

# Read the RealEstateList Price Data and store into a Pandas DataFrame
homeprice_df = pd.read_csv(homeprice)
homeprice_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1/31/2000,...,8/31/2021,9/30/2021,10/31/2021,11/30/2021,12/31/2021,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022
0,61639,0,10025,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,325181.0,...,1124632.0,1135008.0,1139408.0,1138788.0,1139359.0,1145264.0,1157044.0,1175799.0,1192501.0,1206638.0
1,84654,1,60657,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,314617.0,...,523400.0,526000.0,528598.0,531407.0,533414.0,534807.0,535022.0,537713.0,539815.0,544880.0
2,61637,2,10023,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,507069.0,...,1483316.0,1499147.0,1507813.0,1512079.0,1512394.0,1523018.0,1532777.0,1549793.0,1562975.0,1581574.0
3,91982,3,77494,Zip,TX,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,225949.0,...,414215.0,423395.0,431406.0,438534.0,444306.0,450440.0,460032.0,471785.0,485634.0,497150.0
4,84616,4,60614,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,409525.0,...,665692.0,668892.0,672821.0,677488.0,680424.0,682344.0,682106.0,685214.0,687665.0,693485.0


In [7]:
homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName      int64
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [8]:
# Read the Rainfall Data for New Brunswick
rainfall = os.path.join("Resources", "new_brunswick_precip.csv")

# Read the rainfall Data and store into a Pandas DataFrame
rainfall_df = pd.read_csv(rainfall)
rainfall_df.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
0,1893,2.61,5.73,3.22,5.61,3.95,3.28,2.31,10.7,3.31,3.61,3.91,Mo,Ma
1,1894,2.22,3.99,1.59,3.41,6.57,2.25,2.5,1.96,10.85,5.36,3.55,3.86,48.11
2,1895,5.44,0.97,Mn,4.61,2.82,2.79,4.26,4.5,Mu,3.9,3.41,2.68,Mb
3,1896,,,,,,,,,,,,,
4,1897,,,,,,,,,,,,,


In [9]:
rainfall_df.tail()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
131,Mean,3.43,3.04,3.8,3.76,4.04,3.89,4.91,4.7,4.11,3.58,3.45,3.75,46.27
132,Median,3.03,2.78,3.53,3.25,3.93,3.39,4.35,3.98,3.31,3.24,3.14,3.48,45.84
133,Min,0.5,0.64,0.75,0.65,0.29,0.02,0.68,0.76,0.04,0.25,0.34,0.17,27.51
134,Max,9.18,6.67,11.13,12.38,9.38,10.04,11.39,17.43,12.68,12.32,8.72,9.91,66.11
135,Count,118.0,119.0,118.0,119.0,119.0,118.0,119.0,119.0,117.0,118.0,118.0,116.0,114.0


In [10]:
rainfall_df.dtypes

Year      object
Jan       object
Feb       object
Mar       object
Apr       object
May       object
Jun       object
Jul       object
Aug       object
Sep       object
Oct       object
Nov       object
Dec       object
Annual    object
dtype: object

In [11]:
# Clean rainfall
# Remove Rows where Year is not numeric
rainfall_df['Year'] = pd.to_numeric(rainfall_df['Year'], errors='coerce')

# Drop NaN's
rainfall_df.dropna(how='any',inplace=True)

In [12]:
# Convert year from float to integer
rainfall_df.Year = rainfall_df.Year.astype(int)

In [13]:
# Filter only years 2000-2021
clean_rainfall_df = rainfall_df[rainfall_df.Year > 2012] 
clean_rainfall_df = clean_rainfall_df[clean_rainfall_df.Year < 2022] 

# Drop Annual Column
clean_rainfall_df=clean_rainfall_df.drop('Annual',1)

  clean_rainfall_df=clean_rainfall_df.drop('Annual',1)


In [14]:
# Convert months from object to float
clean_rainfall_df[['Jan', 'Feb', 'Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] = clean_rainfall_df[['Jan', 'Feb', 'Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']].astype(float)

In [15]:
clean_rainfall_df.dtypes

Year      int32
Jan     float64
Feb     float64
Mar     float64
Apr     float64
May     float64
Jun     float64
Jul     float64
Aug     float64
Sep     float64
Oct     float64
Nov     float64
Dec     float64
dtype: object

In [16]:
#Add Quarterly rainfall totals
clean_rainfall_df['Q1_rainfall'] = clean_rainfall_df['Jan'] + clean_rainfall_df['Feb'] + clean_rainfall_df['Mar']
clean_rainfall_df['Q2_rainfall'] = clean_rainfall_df['Apr'] + clean_rainfall_df['May'] + clean_rainfall_df['Jun']
clean_rainfall_df['Q3_rainfall'] = clean_rainfall_df['Jul'] + clean_rainfall_df['Aug'] + clean_rainfall_df['Sep']
clean_rainfall_df['Q4_rainfall'] = clean_rainfall_df['Oct'] + clean_rainfall_df['Nov'] + clean_rainfall_df['Dec']

# Add city
clean_rainfall_df['City'] = 'New Brunswick'

In [17]:
clean_rainfall_df.to_csv('Resources/rainfall.csv', index=False)

In [18]:
# Clean home price file
# Filter on State=NJ

nj_homeprice_df = homeprice_df[homeprice_df.State == 'NJ']


In [19]:
nj_homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName      int64
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [20]:
nj_homeprice_df['RegionName'] = nj_homeprice_df['RegionName'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nj_homeprice_df['RegionName'] = nj_homeprice_df['RegionName'].apply(str)


In [21]:
nj_homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName     object
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [22]:
# left Pad Zipcode with 0 if length < 5
nj_homeprice_df['RegionName']=nj_homeprice_df.RegionName.str.pad(5,side='left',fillchar='0')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nj_homeprice_df['RegionName']=nj_homeprice_df.RegionName.str.pad(5,side='left',fillchar='0')


In [23]:
nj_homeprice_df

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1/31/2000,...,8/31/2021,9/30/2021,10/31/2021,11/30/2021,12/31/2021,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022
66,60545,67,07030,Zip,NJ,NJ,Hoboken,New York-Newark-Jersey City,Hudson County,251364.0,...,749486.0,747117.0,746338.0,744986.0,745995.0,746177.0,747799.0,754807.0,765987.0,775344.0
82,61148,83,08701,Zip,NJ,NJ,Lakewood Township,New York-Newark-Jersey City,Ocean County,170574.0,...,459463.0,468158.0,477602.0,484454.0,490099.0,496232.0,506064.0,516967.0,527549.0,539501.0
112,60639,113,07302,Zip,NJ,NJ,Jersey City,New York-Newark-Jersey City,Hudson County,197412.0,...,817980.0,813051.0,811324.0,809523.0,810357.0,810901.0,814109.0,823253.0,834215.0,843868.0
200,61169,202,08753,Zip,NJ,NJ,Toms River,New York-Newark-Jersey City,Ocean County,141372.0,...,390684.0,396863.0,402540.0,406570.0,409384.0,412563.0,418127.0,424441.0,430778.0,437603.0
225,60518,227,07002,Zip,NJ,NJ,Bayonne,New York-Newark-Jersey City,Hudson County,172571.0,...,468545.0,469831.0,472676.0,474605.0,477434.0,479031.0,481218.0,487852.0,496014.0,504295.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26524,61048,30068,08321,Zip,NJ,NJ,Downe Township,Vineland-Bridgeton,Cumberland County,,...,137854.0,145477.0,147901.0,147696.0,142621.0,140984.0,136546.0,137178.0,139296.0,141122.0
26596,61115,30222,08561,Zip,NJ,NJ,Robbinsville Township,Trenton,Mercer County,,...,372909.0,378435.0,382689.0,385907.0,390567.0,392838.0,396247.0,400309.0,408311.0,416051.0
26762,61041,30617,08314,Zip,NJ,NJ,Maurice River Township,Vineland-Bridgeton,Cumberland County,,...,106109.0,105339.0,106293.0,108270.0,112179.0,114999.0,115392.0,117089.0,120914.0,123450.0
26875,61055,30892,08329,Zip,NJ,NJ,Commercial Township,Vineland-Bridgeton,Cumberland County,,...,162774.0,169330.0,175013.0,176788.0,179026.0,182009.0,184741.0,188614.0,192775.0,197366.0


In [24]:
for i in nj_homeprice_df:
    print(i)
    

RegionID
SizeRank
RegionName
RegionType
StateName
State
City
Metro
CountyName
1/31/2000
2/29/2000
3/31/2000
4/30/2000
5/31/2000
6/30/2000
7/31/2000
8/31/2000
9/30/2000
10/31/2000
11/30/2000
12/31/2000
1/31/2001
2/28/2001
3/31/2001
4/30/2001
5/31/2001
6/30/2001
7/31/2001
8/31/2001
9/30/2001
10/31/2001
11/30/2001
12/31/2001
1/31/2002
2/28/2002
3/31/2002
4/30/2002
5/31/2002
6/30/2002
7/31/2002
8/31/2002
9/30/2002
10/31/2002
11/30/2002
12/31/2002
1/31/2003
2/28/2003
3/31/2003
4/30/2003
5/31/2003
6/30/2003
7/31/2003
8/31/2003
9/30/2003
10/31/2003
11/30/2003
12/31/2003
1/31/2004
2/29/2004
3/31/2004
4/30/2004
5/31/2004
6/30/2004
7/31/2004
8/31/2004
9/30/2004
10/31/2004
11/30/2004
12/31/2004
1/31/2005
2/28/2005
3/31/2005
4/30/2005
5/31/2005
6/30/2005
7/31/2005
8/31/2005
9/30/2005
10/31/2005
11/30/2005
12/31/2005
1/31/2006
2/28/2006
3/31/2006
4/30/2006
5/31/2006
6/30/2006
7/31/2006
8/31/2006
9/30/2006
10/31/2006
11/30/2006
12/31/2006
1/31/2007
2/28/2007
3/31/2007
4/30/2007
5/31/2007
6/30/2007
7

In [25]:
# Start home zip price series by year
startyear = 2021
homezip_df = pd.DataFrame(columns = ['Zip', 'City', 'County','Year','Q1_avg_homeprice','Q2_avg_homeprice',
                                     'Q3_avg_homeprice','Q4_avg_homeprice'])

In [26]:
homezip_df.head()

Unnamed: 0,Zip,City,County,Year,Q1_avg_homeprice,Q2_avg_homeprice,Q3_avg_homeprice,Q4_avg_homeprice


In [27]:
range(len(nj_homeprice_df))

range(0, 564)

In [28]:
for c in range(len(nj_homeprice_df)):
    homezip_df = homezip_df.append({'Zip' : nj_homeprice_df.iloc[c,2], 
                    'City' : nj_homeprice_df.iloc[c,6], 
                    'County' : nj_homeprice_df.iloc[c,8], 
                    'Year' : startyear,
                    'Jan' : nj_homeprice_df.iloc[c,261],
                    'Feb' : nj_homeprice_df.iloc[c,262],
                   'Mar' : nj_homeprice_df.iloc[c,263],
                   'Apr' : nj_homeprice_df.iloc[c,264],
                   'May' : nj_homeprice_df.iloc[c,265],
                   'Jun' : nj_homeprice_df.iloc[c,266],
                   'Jul' : nj_homeprice_df.iloc[c,267],
                   'Aug' : nj_homeprice_df.iloc[c,268],
                   'Sep' : nj_homeprice_df.iloc[c,269],
                   'Oct' : nj_homeprice_df.iloc[c,270],
                    'Nov' : nj_homeprice_df.iloc[c,271],               
                    'Dec' : nj_homeprice_df.iloc[c,272],                                                  
                    'Q1_avg_homeprice': ((nj_homeprice_df.iloc[c,261] + nj_homeprice_df.iloc[c,262] + nj_homeprice_df.iloc[c,263])/3),
                   'Q2_avg_homeprice': ((nj_homeprice_df.iloc[c,264] + nj_homeprice_df.iloc[c,265] + nj_homeprice_df.iloc[c,266])/3),
                   'Q3_avg_homeprice': ((nj_homeprice_df.iloc[c,267] + nj_homeprice_df.iloc[c,268] + nj_homeprice_df.iloc[c,269])/3),
                   'Q4_avg_homeprice': ((nj_homeprice_df.iloc[c,270] + nj_homeprice_df.iloc[c,271] + nj_homeprice_df.iloc[c,272])/3)
                                   },ignore_index = True)


In [29]:
homezip_df

Unnamed: 0,Zip,City,County,Year,Q1_avg_homeprice,Q2_avg_homeprice,Q3_avg_homeprice,Q4_avg_homeprice,Apr,Aug,Dec,Feb,Jan,Jul,Jun,Mar,May,Nov,Oct,Sep
0,07030,Hoboken,Hudson County,2021,727520.000000,727778.333333,7.461107e+05,7.457730e+05,723597.0,749486.0,745995.0,729251.0,728266.0,741729.0,734759.0,725043.0,724979.0,744986.0,746338.0,747117.0
1,08701,Lakewood Township,Ocean County,2021,393416.666667,421827.333333,4.585370e+05,4.840517e+05,409735.0,459463.0,490099.0,393127.0,387174.0,447990.0,435089.0,399949.0,420658.0,484454.0,477602.0,468158.0
2,07302,Jersey City,Hudson County,2021,796599.333333,799803.333333,8.144750e+05,8.104013e+05,796351.0,817980.0,810357.0,797549.0,794881.0,812394.0,805822.0,797368.0,797237.0,809523.0,811324.0,813051.0
3,08753,Toms River,Ocean County,2021,314477.000000,351892.333333,3.894740e+05,4.061647e+05,335735.0,390684.0,409384.0,314065.0,307811.0,380875.0,369303.0,321555.0,350639.0,406570.0,402540.0,396863.0
4,07002,Bayonne,Hudson County,2021,436341.666667,446857.333333,4.664423e+05,4.749050e+05,442018.0,468545.0,477434.0,437268.0,431577.0,460951.0,453195.0,440180.0,445359.0,474605.0,472676.0,469831.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
559,08321,Downe Township,Cumberland County,2021,119277.666667,120020.333333,1.374077e+05,1.460727e+05,117935.0,137854.0,142621.0,120718.0,118356.0,128892.0,123503.0,118759.0,118623.0,147696.0,147901.0,145477.0
560,08561,Robbinsville Township,Mercer County,2021,344873.666667,360448.000000,3.731407e+05,3.863877e+05,356401.0,372909.0,390567.0,344486.0,339474.0,368078.0,364009.0,350661.0,360934.0,385907.0,382689.0,378435.0
561,08314,Maurice River Township,Cumberland County,2021,98480.666667,103061.000000,1.055297e+05,1.089140e+05,101365.0,106109.0,112179.0,97359.0,98879.0,105141.0,104474.0,99204.0,103344.0,108270.0,106293.0,105339.0
562,08329,Commercial Township,Cumberland County,2021,,,1.616890e+05,1.769423e+05,,162774.0,179026.0,,,152963.0,147241.0,,139134.0,176788.0,175013.0,169330.0


In [30]:
homezip_df.to_csv('Resources/njhomeprice.csv', index=False)

In [None]:
# Upload all DF to Postgres DB

In [None]:
# Perform inner join on cities and rainfall

In [None]:
# Export to new rainfall DF (will only have cities that have rainfall data)

In [None]:
# Perform inner join on cities and homeprice

In [None]:
# Export to new homeprice DF (will only have cities that have oneprice data)

In [None]:
# Merge rainfall and homeprice DF
