## Merging Owner Information and Sales Data Into The Main DataFrame

In [1]:
# Dependencies
import pandas as pd
import numpy as np

In [2]:
# read new owner data file
file = pd.read_csv('Resources/new_owner.csv')
owner_info = pd.DataFrame(file)
owner_info.head()

Unnamed: 0.1,Unnamed: 0,ACCOUNT,NEW_OWNER_DATE_2019,NEW_OWNER_DATE_2018
0,0,21750000003,2009-10-13 00:00:00.000,2009-10-13 00:00:00.000
1,1,21750000013,1995-08-02 00:00:00.000,1995-08-02 00:00:00.000
2,2,21750000018,2018-06-29 00:00:00.000,2018-06-29 00:00:00.000
3,3,21750000019,2001-01-01 00:00:00.000,2001-01-01 00:00:00.000
4,4,41320000003,2014-12-12 00:00:00.000,2014-12-12 00:00:00.000


In [3]:
# obtain sales dates from new owner data column
# then separate sales in 2019 from other years
owner_info["Sale Year"] = owner_info["NEW_OWNER_DATE_2019"].str.split('-').str[0]
owner_info['2019 Sales']=np.where(owner_info['Sale Year']=='2019',1,0)

owner_info.head()

Unnamed: 0.1,Unnamed: 0,ACCOUNT,NEW_OWNER_DATE_2019,NEW_OWNER_DATE_2018,Sale Year,2019 Sales
0,0,21750000003,2009-10-13 00:00:00.000,2009-10-13 00:00:00.000,2009,0
1,1,21750000013,1995-08-02 00:00:00.000,1995-08-02 00:00:00.000,1995,0
2,2,21750000018,2018-06-29 00:00:00.000,2018-06-29 00:00:00.000,2018,0
3,3,21750000019,2001-01-01 00:00:00.000,2001-01-01 00:00:00.000,2001,0
4,4,41320000003,2014-12-12 00:00:00.000,2014-12-12 00:00:00.000,2014,0


In [4]:
# group account numbers to have one account id per row prior to merging with the main dataframe
owner_info = owner_info.groupby(['ACCOUNT']).count()

In [5]:
owner_info

Unnamed: 0_level_0,Unnamed: 0,NEW_OWNER_DATE_2019,NEW_OWNER_DATE_2018,Sale Year,2019 Sales
ACCOUNT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
21750000003,1,1,1,1,1
21750000013,1,1,1,1,1
21750000018,1,1,1,1,1
21750000019,1,1,1,1,1
41320000003,1,1,1,1,1
...,...,...,...,...,...
1391080010001,1,1,1,1,1
1391080010002,1,1,1,1,1
1391700010001,1,1,1,1,1
1391700010002,1,1,1,1,1


In [6]:
# make the 2019 sales column into a separate dataframe to merge with the main dataframe
sales = owner_info["2019 Sales"]
sales_2019 = pd.DataFrame(sales)

In [7]:
# read the main dataframe to be used in the results calculation
file2 = pd.read_csv("Resources/combined_data_without_charter_schools.csv")
data = pd.DataFrame(file2)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26253 entries, 0 to 26252
Data columns (total 38 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Address                                     26253 non-null  object 
 1   Latitude                                    26253 non-null  float64
 2   Longitude                                   26253 non-null  float64
 3   Flood Description                           20837 non-null  object 
 4   Flood Zone                                  26253 non-null  object 
 5   Flood Risk                                  26253 non-null  int64  
 6   SITE_ADDR_1                                 26253 non-null  object 
 7   ACCOUNT                                     26253 non-null  int64  
 8   Zip Code                                    26253 non-null  int64  
 9   NEIGHBORHOOD_CODE                           26253 non-null  float64
 10  NEIGHBORHO

In [8]:
# merge the 2019 sales with the main dataframe
merged_data = pd.merge(data,sales_2019,on='ACCOUNT',how='left')
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26253 entries, 0 to 26252
Data columns (total 39 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Address                                     26253 non-null  object 
 1   Latitude                                    26253 non-null  float64
 2   Longitude                                   26253 non-null  float64
 3   Flood Description                           20837 non-null  object 
 4   Flood Zone                                  26253 non-null  object 
 5   Flood Risk                                  26253 non-null  int64  
 6   SITE_ADDR_1                                 26253 non-null  object 
 7   ACCOUNT                                     26253 non-null  int64  
 8   Zip Code                                    26253 non-null  int64  
 9   NEIGHBORHOOD_CODE                           26253 non-null  float64
 10  NEIGHBORHO

In [9]:
# output the merged data
merged_data.to_csv("Output/all_data_merged.csv")