In [1]:
# Dependencies and Setup
import pandas as pd
import os

In [2]:
# File to Load (Remember to change the path if needed.)
cities = os.path.join("Resources", "Cities_Along_Raritan.csv")

# Read the School Data and Student Data and store into a Pandas DataFrame
cities_df = pd.read_csv(cities)
cities_df.head()

Unnamed: 0,zipcode,city,county,flood_zone,Latitude,Longitude
0,8901,New Brunswick,Middlesex,A,40.48603,-74.431
1,8904,Highland park,Middlesex,AE,40.49333,-74.4338
2,8876,Somerville,Somerset,AE,40.55752,-74.6154
3,8805,Bound Brook,Somerset,X,40.56078,-74.5339
4,8807,Bridgewater,Somerset,0,40.54642,-74.6706


In [3]:
cities_df.dtypes

zipcode         int64
city           object
county         object
flood_zone     object
Latitude      float64
Longitude     float64
dtype: object

In [4]:
# Clean cities file Convert zipcode to string
cities_df['zipcode'] = cities_df['zipcode'].apply(str)

# Fix zipcodes in cities file to left pad with 0
cities_df['zipcode']=cities_df.zipcode.str.pad(5,side='left',fillchar='0')

In [5]:
cities_df.head()

Unnamed: 0,zipcode,city,county,flood_zone,Latitude,Longitude
0,8901,New Brunswick,Middlesex,A,40.48603,-74.431
1,8904,Highland park,Middlesex,AE,40.49333,-74.4338
2,8876,Somerville,Somerset,AE,40.55752,-74.6154
3,8805,Bound Brook,Somerset,X,40.56078,-74.5339
4,8807,Bridgewater,Somerset,0,40.54642,-74.6706


In [6]:
# Read the Zillow Data for Real Estate List Price History

homeprice = os.path.join("Resources", "ZipcodePricealltypeshouse.csv")

# Read the RealEstateList Price Data and store into a Pandas DataFrame
homeprice_df = pd.read_csv(homeprice)
homeprice_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1/31/2000,...,8/31/2021,9/30/2021,10/31/2021,11/30/2021,12/31/2021,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022
0,61639,0,10025,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,325181.0,...,1124632.0,1135008.0,1139408.0,1138788.0,1139359.0,1145264.0,1157044.0,1175799.0,1192501.0,1206638.0
1,84654,1,60657,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,314617.0,...,523400.0,526000.0,528598.0,531407.0,533414.0,534807.0,535022.0,537713.0,539815.0,544880.0
2,61637,2,10023,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,507069.0,...,1483316.0,1499147.0,1507813.0,1512079.0,1512394.0,1523018.0,1532777.0,1549793.0,1562975.0,1581574.0
3,91982,3,77494,Zip,TX,TX,Katy,Houston-The Woodlands-Sugar Land,Harris County,225949.0,...,414215.0,423395.0,431406.0,438534.0,444306.0,450440.0,460032.0,471785.0,485634.0,497150.0
4,84616,4,60614,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,409525.0,...,665692.0,668892.0,672821.0,677488.0,680424.0,682344.0,682106.0,685214.0,687665.0,693485.0


In [7]:
homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName      int64
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [8]:
# Read the Rainfall Data for New Brunswick
rainfall = os.path.join("Resources", "new_brunswick_precip.csv")

# Read the rainfall Data and store into a Pandas DataFrame
rainfall_df = pd.read_csv(rainfall)
rainfall_df.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
0,1893,2.61,5.73,3.22,5.61,3.95,3.28,2.31,10.7,3.31,3.61,3.91,Mo,Ma
1,1894,2.22,3.99,1.59,3.41,6.57,2.25,2.5,1.96,10.85,5.36,3.55,3.86,48.11
2,1895,5.44,0.97,Mn,4.61,2.82,2.79,4.26,4.5,Mu,3.9,3.41,2.68,Mb
3,1896,,,,,,,,,,,,,
4,1897,,,,,,,,,,,,,


In [9]:
rainfall_df.tail()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,Annual
131,Mean,3.43,3.04,3.8,3.76,4.04,3.89,4.91,4.7,4.11,3.58,3.45,3.75,46.27
132,Median,3.03,2.78,3.53,3.25,3.93,3.39,4.35,3.98,3.31,3.24,3.14,3.48,45.84
133,Min,0.5,0.64,0.75,0.65,0.29,0.02,0.68,0.76,0.04,0.25,0.34,0.17,27.51
134,Max,9.18,6.67,11.13,12.38,9.38,10.04,11.39,17.43,12.68,12.32,8.72,9.91,66.11
135,Count,118.0,119.0,118.0,119.0,119.0,118.0,119.0,119.0,117.0,118.0,118.0,116.0,114.0


In [10]:
# Clean rainfall
# Remove Rows where Year is not numeric
rainfall_df['Year'] = pd.to_numeric(rainfall_df['Year'], errors='coerce')

# Drop NaN's
rainfall_df.dropna(how='any',inplace=True)

In [11]:
# Filter only years 2000-2021
clean_rainfall_df = rainfall_df[rainfall_df.Year > 2012]

In [16]:
clean_rainfall_df['City'] = 'New Brunswick'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_rainfall_df['City'] = 'New Brunswick'


In [17]:
clean_rainfall_df.to_csv('Resources/rainfall.csv', index=False)

In [18]:
# Clean home price file
# Filter on State=NJ

nj_homeprice_df = homeprice_df[homeprice_df.State == 'NJ']


In [19]:
nj_homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName      int64
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [20]:
nj_homeprice_df['RegionName'] = nj_homeprice_df['RegionName'].apply(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nj_homeprice_df['RegionName'] = nj_homeprice_df['RegionName'].apply(str)


In [21]:
nj_homeprice_df.dtypes

RegionID        int64
SizeRank        int64
RegionName     object
RegionType     object
StateName      object
               ...   
1/31/2022     float64
2/28/2022     float64
3/31/2022     float64
4/30/2022     float64
5/31/2022     float64
Length: 278, dtype: object

In [22]:
# left Pad Zipcode with 0 if length < 5
nj_homeprice_df['RegionName']=nj_homeprice_df.RegionName.str.pad(5,side='left',fillchar='0')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nj_homeprice_df['RegionName']=nj_homeprice_df.RegionName.str.pad(5,side='left',fillchar='0')


In [23]:
nj_homeprice_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1/31/2000,...,8/31/2021,9/30/2021,10/31/2021,11/30/2021,12/31/2021,1/31/2022,2/28/2022,3/31/2022,4/30/2022,5/31/2022
66,60545,67,7030,Zip,NJ,NJ,Hoboken,New York-Newark-Jersey City,Hudson County,251364.0,...,749486.0,747117.0,746338.0,744986.0,745995.0,746177.0,747799.0,754807.0,765987.0,775344.0
82,61148,83,8701,Zip,NJ,NJ,Lakewood Township,New York-Newark-Jersey City,Ocean County,170574.0,...,459463.0,468158.0,477602.0,484454.0,490099.0,496232.0,506064.0,516967.0,527549.0,539501.0
112,60639,113,7302,Zip,NJ,NJ,Jersey City,New York-Newark-Jersey City,Hudson County,197412.0,...,817980.0,813051.0,811324.0,809523.0,810357.0,810901.0,814109.0,823253.0,834215.0,843868.0
200,61169,202,8753,Zip,NJ,NJ,Toms River,New York-Newark-Jersey City,Ocean County,141372.0,...,390684.0,396863.0,402540.0,406570.0,409384.0,412563.0,418127.0,424441.0,430778.0,437603.0
225,60518,227,7002,Zip,NJ,NJ,Bayonne,New York-Newark-Jersey City,Hudson County,172571.0,...,468545.0,469831.0,472676.0,474605.0,477434.0,479031.0,481218.0,487852.0,496014.0,504295.0


In [None]:
# Upload all DF to Postgres DB

In [None]:
# Perform inner join on cities and rainfall

In [None]:
# Export to new rainfall DF (will only have cities that have rainfall data)

In [None]:
# Perform inner join on cities and homeprice

In [None]:
# Export to new homeprice DF (will only have cities that have oneprice data)

In [None]:
# Merge rainfall and homeprice DF
