# 01 Importing Libraries

# 02 Creating Dataframes

# 03 Merging dataframes

# 04 Wrangling Data

# 05 Updating main dataset

## 01 Importing Libraries

In [1]:
# Import Analysis Libraries
import pandas as pd
import numpy as np
import os

## 02 Creating Dataframes

In [2]:
# Defining the main project path
proj = r'C:\Users\bfd_6\Documents\Career Foundry Project\Course Part 2\A6 Adv Analytics and Dashboards\Real Estate Project'

In [5]:
# Creating a data frame for the 2012 dataset
df_re_12 = pd.read_pickle(os.path.join(proj, '02 Data', 'Prepared Data', 'real_estate_data_2012.pkl'))

In [6]:
# Creating a data frame for the 2022 dataset
df_re_22 = pd.read_pickle(os.path.join(proj, '02 Data', 'Prepared Data', 'real_estate_data_2022.pkl'))

In [7]:
df_re_22.shape

(3222, 85)

In [8]:
df_re_12.shape

(3221, 84)

## 03 Merging dataframes

In [9]:
df_re_merged = df_re_22.merge(df_re_12, how = 'inner', on = ['geo_id', 'us_county'])

In [10]:
df_re_merged.shape

(3191, 167)

## 04 Wrangling Data

In [15]:
# Determining format of us_county field
df_re_merged.get('us_county')

0                          Autauga County, Alabama
1                          Baldwin County, Alabama
2                          Barbour County, Alabama
3                             Bibb County, Alabama
4                           Blount County, Alabama
5                          Bullock County, Alabama
6                           Butler County, Alabama
7                          Calhoun County, Alabama
8                         Chambers County, Alabama
9                         Cherokee County, Alabama
10                         Chilton County, Alabama
11                         Choctaw County, Alabama
12                          Clarke County, Alabama
13                            Clay County, Alabama
14                        Cleburne County, Alabama
15                          Coffee County, Alabama
16                         Colbert County, Alabama
17                         Conecuh County, Alabama
18                           Coosa County, Alabama
19                       Coving

In [12]:
# Splitting county and state information from us_county
df_re_merged[['county', 'state']] = df_re_merged['us_county'].str.split(', ', expand = True)

In [13]:
df_re_merged.shape

(3191, 169)

In [14]:
# Determining which states/entities are included in the data
pd.set_option('display.max_rows', None)
df_re_merged['state'].unique()

array(['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Delaware', 'District of Columbia', 'Florida',
       'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa',
       'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland',
       'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
       'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
       'New Jersey', 'New Mexico', 'New York', 'North Carolina',
       'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
       'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
       'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
       'West Virginia', 'Wisconsin', 'Wyoming', 'Puerto Rico'],
      dtype=object)

Puerto Rico is not in the scope of our questions. All rows for Puerto Rico
should be dropped. 

In [17]:
# Dropping rows for counties in Puerto Rico
index_state = df_re_merged[df_re_merged['state'] == 'Puerto Rico'].index
df_re_merged.drop(index_state, inplace = True)

In [20]:
# Checking that all rows for Puerto Rico were dropped
print(df_re_merged.loc[(df_re_merged['state'] == 'Puerto Rico')])

Empty DataFrame
Columns: [geo_id, us_county, 2022 total housing units, 2022 occupied housing units, 2022 vacant housing units, 2022 num_units -1_detached, 2022 num_units-1_attached, 2022 2 units, 2022 3-4 units, 2022 5-9 units, 2022 10-19 units, 2022 20+ units, 2022 mobile home, 2022 boat_rv_van_etc, 2022 built 2020+, 2022 built 2010-2019, 2022 built 2000-2009, 2022 built 1990-1999, 2022 built 1980-1989, 2022 built 1970-1979, 2022 built 1960-1969, 2022 built 1950-1959, 2022 built 1940-1949, 2022 built before 1940, 2022 1 room, 2022 2 rooms, 2022 3 rooms, 2022 4 rooms, 2022 5 rooms, 2022 6 rooms, 2022 7 rooms, 2022 8 rooms, 2022 9+ rooms, 2022 no bedroom, 2022 1 bedroom, 2022 2 bedrooms, 2022 3 bedrooms, 2022 4 bedrooms, 2022 5+ bedrooms, 2022 owner-occupied, 2022 renter-occupied, 2022 move-in 2021+, 2022 move-in 2018-2020, 2022 move-in 2010-2017, 2022 move-in 2000-2009, 2022 move-in 1990-1999, 2022 move-in before 1990, 2022 no vehicles available, 2022 1 vehicle available, 2022 2 vehicl

The dataset is empty; therefore, all rows with Puerto Rico as the state were dropped.

In [21]:
df_re_merged.shape

(3129, 169)

In [26]:
# Adding a column for state abbreviations
# This process starts with creating a dictionary with state names and abbreviations
# This dictionary is public domain and taken from https://gist.github.com/rogerallen/1583593
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
   }

In [23]:
# The next step is to add the column using the map() function
df_re_merged['st_abbr'] = df_re_merged['state'].map(us_state_to_abbrev)

In [24]:
df_re_merged['st_abbr'].head(50)

0     AL
1     AL
2     AL
3     AL
4     AL
5     AL
6     AL
7     AL
8     AL
9     AL
10    AL
11    AL
12    AL
13    AL
14    AL
15    AL
16    AL
17    AL
18    AL
19    AL
20    AL
21    AL
22    AL
23    AL
24    AL
25    AL
26    AL
27    AL
28    AL
29    AL
30    AL
31    AL
32    AL
33    AL
34    AL
35    AL
36    AL
37    AL
38    AL
39    AL
40    AL
41    AL
42    AL
43    AL
44    AL
45    AL
46    AL
47    AL
48    AL
49    AL
Name: st_abbr, dtype: object

In [25]:
df_re_merged['st_abbr'].tail(30)

3099    WI
3100    WI
3101    WI
3102    WI
3103    WI
3104    WI
3105    WI
3106    WY
3107    WY
3108    WY
3109    WY
3110    WY
3111    WY
3112    WY
3113    WY
3114    WY
3115    WY
3116    WY
3117    WY
3118    WY
3119    WY
3120    WY
3121    WY
3122    WY
3123    WY
3124    WY
3125    WY
3126    WY
3127    WY
3128    WY
Name: st_abbr, dtype: object

In [54]:
# Creating a Region flag to address a project question. This will be a two part process. 
# The first step is to create a list for each region. The second step will be to create a new flag column based on those lists.
# Creating lists for each region.
ne_list = ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey']
mw_list = ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']
sth_list = ['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana']
we_list = ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii']

In [55]:
# Adding region flag based on state column
df_re_merged.loc[df_re_merged['state'].isin(ne_list), 'region'] = "Northeast" 
df_re_merged.loc[df_re_merged['state'].isin(mw_list), 'region'] = "Midwest" 
df_re_merged.loc[df_re_merged['state'].isin(sth_list), 'region'] = "South" 
df_re_merged.loc[df_re_merged['state'].isin(we_list), 'region'] = "West" 

In [56]:
# Verifying that the added flag worked and is correct.
df_re_merged['region'].value_counts()

region
South        1422
Midwest      1054
West          444
Northeast     209
Name: count, dtype: int64

In [63]:
pd.set_option('display.max_columns', None)
df_re_merged.head()

Unnamed: 0,geo_id,us_county,2022 total housing units,2022 occupied housing units,2022 vacant housing units,2022 num_units -1_detached,2022 num_units-1_attached,2022 2 units,2022 3-4 units,2022 5-9 units,2022 10-19 units,2022 20+ units,2022 mobile home,2022 boat_rv_van_etc,2022 built 2020+,2022 built 2010-2019,2022 built 2000-2009,2022 built 1990-1999,2022 built 1980-1989,2022 built 1970-1979,2022 built 1960-1969,2022 built 1950-1959,2022 built 1940-1949,2022 built before 1940,2022 1 room,2022 2 rooms,2022 3 rooms,2022 4 rooms,2022 5 rooms,2022 6 rooms,2022 7 rooms,2022 8 rooms,2022 9+ rooms,2022 no bedroom,2022 1 bedroom,2022 2 bedrooms,2022 3 bedrooms,2022 4 bedrooms,2022 5+ bedrooms,2022 owner-occupied,2022 renter-occupied,2022 move-in 2021+,2022 move-in 2018-2020,2022 move-in 2010-2017,2022 move-in 2000-2009,2022 move-in 1990-1999,2022 move-in before 1990,2022 no vehicles available,2022 1 vehicle available,2022 2 vehicles available,2022 3+ vehicles available,2022 gas heat,2022 lp container heat,2022 electric heat,2022 oil or kerosene heat,2022 coal or coke heat,2022 wood heat,2022 solar heat,2022 other fuel heat,2022 no fuel heat,2022 no full bathroom,2022 no full kitchen,2022 no landline available,2022 1.00 or less occ per room,2022 1.01 to 1.50 occ per room,2022 1.51 or more occ per room,2022 own_occ < $50k,2022 own_occ $50k < $100k,2022 own_occ $100k < $150k,2022 own_occ $150k < $200k,2022 own_occ $200k <$300k,2022 own_occ $300k - $500k,2022 own_occ $500k - $1m,2022 own_occ $1m+,2022 mortgage,2022 no mortgage,2022 rent producing units,2022 gross rent < $500,2022 gross rent $500 < $1k,2022 gross rent $1k < $1.5k,2022 gross rent $1.5k < $2k,2022 gross rent $2k <$2.5k,2022 gross rent $2.5k <$3k,2022 gross rent $3k+,2022 no rent paid,2012 total housing units,2012 occupied housing units,2012 vacant housing units,2012 num_units -1_detached,2012 num units-1_attached,2012 2 units,2012 3-4 units,2012 5-9 units,2012 10-19 units,2012 20+ units,2012 mobile home,2012 boat_rv_van_etc,2012 built 2010+,2012 built 2000-2009,2012 built 1990-1999,2012 built 1980 -1989,2012 built 1970-1979,2012 built 1960-1969,2012 built 1950-1959,2012 built 1940-1949,2012 built before 1940,2012 1 room,2012 2 rooms,2012 3 rooms,2012 4 rooms,2012 5 rooms,2012 6 rooms,2012 7 rooms,2012 8 rooms,2012 9 rooms+,2012 no bedroom,2012 1 bedroom,2012 2 bedrooms,2012 3 bedrooms,2012 4 bedrooms,2012 5 bedrooms+,2012 owner-occupied,2012 renter-occupied,2012 moved in 2010+,2012 moved in 2000-2009,2012 moved in 1990-1999,2012 moved in 1980-1989,2012 moved in 1970 to 1979,2012 moved before 1970,2012 no vehicles available,2012 1 vehicle available,2012 2 vehicles available,2012 3+ vehicles available,2012 gas heat,2012 lp container heat,2012 electric heat,2012 oil or kerosene heat,2012 coal or coke heat,2012 wood heat,2012 solar heat,2012 other fuel heat,2012 no fuel heat,2012 no full bathroom,2012 no full kitchen,2012 no landline available,2012 1.00 or less occ per room,2012 1.01 to 1.50 occ per room,2012 1.51 or more occ per room,"2012 own_occ < $50,000","2012 own_occ $50,000 < $99,999","2012 own_occ $100,000 < $149,999","2012 own_occ $150,000 < $199,999","2012 own_occ $200,000 < $299,999","2012 own_occ $300,000 < $499,999","2012 own_occ $500,000 < $999,999","2012 own_occ $1,000,000+",2012 mortgage,2012 no mortgage,2012 rent producing units,2012 gross rent less than $200,2012 gross rent $200 < $299,2012 gross rent $300 < $499,2012 gross rent $500 < $749,2012 gross rent $750 < $999,"2012 gross rent $1,000 < $1,499","2012 gross rent $1,500+",2012 no rent paid,county,state,st_abbr,region
0,0500000US01001,"Autauga County, Alabama",24457,22308,2149,18370,273,145,294,510,500,470,3805,90,169,3110,5180,4704,2696,4511,2224,973,371,519,144,110,961,2806,4233,6061,4129,2416,3597,150,844,4109,11715,6468,1171,16832,5476,639,4267,7510,5226,2381,2285,888,7080,7741,6599,6912,1802,13327,13,0,166,0,0,88,111,90,151,22001,160,147,1962,1847,2435,2688,4398,2863,572,67,9676,7156,4826,250,1291,2068,759,338,50,70,650,22077,19934,2143,15485,217,289,354,526,312,272,4609,13,283,5471,5240,3306,3216,2836,909,250,566,47,286,989,2684,4276,5793,3294,2648,2060,67,1055,3393,11695,5266,601,15488,4446,1856,10491,3705,1600,1274,1008,1024,5642,8127,5141,7016,2599,10080,36,0,113,0,0,90,122,135,554,19522,370,42,2084,3085,3500,2593,2558,1470,155,43,10023,5465,4034,0,73,484,1010,1290,810,367,412,Autauga County,Alabama,AL,South
1,0500000US01003,"Baldwin County, Alabama",125113,90802,34311,80703,1676,1966,2606,4004,4482,16248,12752,676,1956,18681,33475,28452,18209,10937,6530,2787,1574,2512,1746,1273,6304,18251,27395,26455,16487,12743,14459,1764,7689,25951,60918,23792,4999,70708,20094,4896,20158,33596,17295,8643,6214,2056,28223,38546,21977,8356,1570,79759,82,0,346,30,11,648,386,800,970,89001,1130,671,4188,3977,5810,8643,18427,18528,8778,2357,41421,29287,17892,889,5251,6964,3418,856,283,231,2202,103984,72751,31233,64222,894,1850,2329,3623,3650,13964,13080,372,497,34487,27755,16347,11304,5366,3756,1784,2688,857,1897,9343,20031,23585,18563,12680,8735,8293,942,5753,23927,54317,16012,3033,53889,18862,9856,40286,13026,4571,2807,2205,2242,21979,32375,16155,9081,2693,60285,92,0,445,0,42,113,409,734,2048,71279,1169,303,5874,7231,8384,10888,11015,6753,3112,632,32749,21140,17139,225,370,720,4341,4888,5348,1247,1723,Baldwin County,Alabama,AL,South
2,0500000US01005,"Barbour County, Alabama",11673,9016,2657,6557,119,661,424,334,41,113,3409,15,74,468,1085,2485,2023,1786,1427,932,473,920,58,317,1379,1953,2652,2276,1336,738,964,68,768,3073,6028,1554,182,5858,3158,279,1311,2836,2123,1249,1218,1058,3002,2887,2069,565,754,7586,0,0,45,0,9,57,20,45,379,8671,283,62,1312,1559,984,700,576,511,187,29,2499,3359,2804,882,1586,336,0,0,0,0,354,11878,9423,2455,7492,157,478,214,296,50,101,3068,22,22,948,2468,2093,2295,1486,1010,502,1054,35,126,741,2020,2521,2816,1751,1007,861,48,619,2658,6487,1770,296,6285,3138,761,4216,2327,959,562,598,940,3361,2892,2230,875,1385,6976,35,0,90,0,0,62,98,107,355,9235,93,95,1563,1830,1007,585,677,465,82,76,3185,3100,2510,110,296,580,834,535,141,14,628,Barbour County,Alabama,AL,South
3,0500000US01007,"Bibb County, Alabama",9046,7216,1830,5426,36,124,90,235,86,150,2765,134,12,630,1417,1620,1502,1490,850,480,345,700,73,172,308,1294,2204,2174,1313,585,923,81,409,2279,4871,1127,279,5465,1751,181,1117,2160,1392,1139,1227,544,1836,2539,2297,1414,684,4828,0,0,236,0,6,48,38,132,125,7122,67,27,1105,1190,895,758,977,416,81,43,1994,3471,1317,420,681,210,6,0,0,0,434,8958,7386,1572,5471,53,88,163,102,28,111,2942,0,6,1105,2110,1640,1564,900,588,289,756,100,60,487,1138,2860,2135,1143,395,640,117,443,1946,5089,1020,343,5979,1407,533,3127,1786,939,535,466,378,2042,2861,2105,1464,687,4948,51,0,163,0,54,19,10,3,276,7326,60,0,1826,1491,879,979,529,212,58,5,3012,2967,1145,56,164,203,572,142,8,0,262,Bibb County,Alabama,AL,South
4,0500000US01009,"Blount County, Alabama",24677,21626,3051,17120,263,443,478,201,97,135,5901,39,41,1679,4890,5125,3634,3593,2266,1265,1039,1145,153,168,1020,3055,5937,5337,3313,2548,3146,202,875,5861,13304,3514,921,17144,4482,825,2821,7197,5252,2962,2569,1047,5375,7072,8132,2743,2286,16144,21,0,299,18,3,112,96,183,288,21115,442,69,1878,3224,2725,3295,3251,2135,539,97,8647,8497,3559,600,2396,465,64,5,21,8,923,23761,21031,2730,17548,102,256,328,288,149,52,5037,1,0,4424,5759,3632,4031,2127,1323,1122,1343,54,155,778,3345,6442,5137,3605,2012,2233,54,602,5398,13930,3229,548,17057,3974,1403,10542,4706,1937,1388,1055,806,5535,7805,6885,2690,2838,14782,56,0,490,1,50,124,60,119,485,20582,332,117,2881,4385,3602,2866,2335,782,120,86,10119,6938,3362,152,231,653,1554,555,174,43,612,Blount County,Alabama,AL,South


In [66]:
# Creating new columns to allow for better comparison between snapshot years
df_re_merged['2022 move-in 2010-2020'] = df_re_merged['2022 move-in 2018-2020'].add(df_re_merged['2022 move-in 2010-2017'])
df_re_merged['2012 move-in before 1980'] = df_re_merged['2012 moved in 1970 to 1979'].add(df_re_merged['2012 moved before 1970'])

In [75]:
# Checking the new column for 2012 data
df_re_merged[['2012 move-in before 1980', '2012 moved in 1970 to 1979', '2012 moved before 1970']]

Unnamed: 0,2012 move-in before 1980,2012 moved in 1970 to 1979,2012 moved before 1970
0,2282,1274,1008
1,5012,2807,2205
2,1160,562,598
3,1001,535,466
4,2443,1388,1055
5,589,461,128
6,1333,678,655
7,6818,3442,3376
8,2767,1347,1420
9,1491,716,775


In [76]:
# Checking the new column for 2022 data
df_re_merged[['2022 move-in 2010-2020', '2022 move-in 2018-2020', '2022 move-in 2010-2017']]

Unnamed: 0,2022 move-in 2010-2020,2022 move-in 2018-2020,2022 move-in 2010-2017
0,11777,4267,7510
1,53754,20158,33596
2,4147,1311,2836
3,3277,1117,2160
4,10018,2821,7197
5,1430,425,1005
6,3093,1052,2041
7,21677,7903,13774
8,6384,2331,4053
9,3873,1545,2328


In order to answer some of the questions, I will need additional data. This was expected.
I selected median income data by county for the years 2012 and 2022 from the US Census Bureau.
Links to datasets: https://data.census.gov/table/ACSST5Y2012.S1903?q=median%20income%20by%20county%202012 
https://data.census.gov/table/ACSST5Y2022.S1903?q=median%20income%20by%20county%202022 

I will clean and wrangle that data, then merge it with this main dataset in the next notebook.

## 05 Updating main dataset

In [78]:
# Exporting pkl file to prepared data folder for the current df_re_merged dataset. This will 
# be picked up in the next notebook.
df_re_merged.to_pickle(os.path.join(proj, '02 Data', 'Prepared Data', 're_merged.pkl'))