<a href="https://colab.research.google.com/github/andrew66882011/qss20_slides_activities/blob/main/activities/03_merging_session1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports 

In [None]:
import pandas as pd
import re 
import numpy as np
import datetime
from datetime import datetime

## repeated printouts: show the value of every bare expression in a cell,
## not only the last one (cells below rely on this to display several tables)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Load and view dataset 1: tax certificates for San Diego businesses

In [None]:
## dataset landing page: https://data.sandiego.gov/datasets/business-listings/

## active tax certificates, read directly from the published CSV
sd_active_biz_url = "https://seshat.datasd.org/ttcs/sd_businesses_active_datasd.csv"
sd_active_biz = pd.read_csv(sd_active_biz_url)


## Step 1 - what are the possible join fields?

- Business-level fields:
    - Owner name
    - Business name (dba_name)
    
- Sector-level fields:
    - naics_sector 
    - naics_code
    - naics_description
    
- Geographic fields:
    - City and state (less interesting in this case)
    - Zip 
    - Bid (business improvement district)
    - Council district

## Step 2- once we've decided on join field, cleaning/deduplicating

Here, first focus on two-digit NAICS codes, or NAICS sector

In [None]:
## sector codes: reasonably clean and no missingness
## (dropna = False would list missing values as their own row)
sector_counts = sd_active_biz['naics_sector'].value_counts(dropna = False)
sector_counts

## crosstab of sector against sector description,
## transposed so descriptions are the rows
sector_by_description = pd.crosstab(sd_active_biz['naics_sector'],
                                    sd_active_biz['naics_description'])
sector_by_description.T

54    13361
81     9588
56     5004
45     4940
62     4877
23     4830
44     3683
72     3600
53     3189
42     2049
48     1944
61     1790
71     1469
52     1219
33      845
51      803
32      325
49      315
31      309
55      234
11      116
22       68
92       11
21        2
Name: naics_sector, dtype: int64

naics_sector,11,21,22,23,31,32,33,42,44,45,...,53,54,55,56,61,62,71,72,81,92
naics_description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACCOMMODATION,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,10,0,0
ACCOMMODATION & FOOD SERVICES,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,29,0,0
ACCOUNTING/TAX PREP/BOOKKEEP/PAYROLL SERVICES,0,0,0,0,0,0,0,0,0,0,...,0,529,0,0,0,0,0,0,0,0
ACTIVITIES RELATED TO CREDIT INTERMEDIATION,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ACTIVITIES RELATED TO REAL ESTATE,0,0,0,0,0,0,0,0,0,0,...,297,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WOOD PRODUCT MFG,0,0,0,0,0,23,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
WRECKING & DEMOLITION CONTRACTORS,0,0,0,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
YOGA INSTRUCTOR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,80,0,0,0,0,0
YOGA STUDIO,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,24,0,0,0,0,0


## Step 3 - repeat those steps with the data I'm joining

### Here, I want to join official census data on naics codes that might have more detail, loading and cleaning


In [None]:
## general site- https://www.census.gov/eos/www/naics/2017NAICS

## two Census workbooks that differ in which code lengths they list:
## the first seems to contain both 2 and 6 digit NAICS codes,
## the second seems to contain only 6-dig codes
naics_26_url = "https://www.census.gov/eos/www/naics/2017NAICS/2-6%20digit_2017_Codes.xlsx"
naics_6_url = "https://www.census.gov/eos/www/naics/2017NAICS/6-digit_2017_Codes.xlsx"

naics_26 = pd.read_excel(naics_26_url)
naics_26.head()
naics_6 = pd.read_excel(naics_6_url)
naics_6.head()

## see that it requires three fixes to make usable:
## (1) column names need cleaning
## (2) first row is just empty 
## (3) cols with unnamed are empty

def clean_naics(one_naics: pd.DataFrame):
    """Return a cleaned copy of one raw Census NAICS table.

    Cleaning steps:
      1. column names are lower-cased and stripped of whitespace and periods
         (e.g. "2017 NAICS US Code" becomes "2017naicsuscode");
      2. the row with index label 0 is skipped (rows from label 1 on are kept);
      3. columns whose cleaned name contains "unnamed" or "seq" are dropped.

    Parameters
    ----------
    one_naics : pd.DataFrame
        Raw table as read from the workbook. It is not modified.

    Returns
    -------
    pd.DataFrame
        An independent frame holding the kept rows and columns, so callers
        can add or overwrite columns without touching ``one_naics``.
    """
    ## first fix cols; raw string so \s and \. reach the regex engine untouched,
    ## and str() so a non-string column label cannot break .lower()
    naics_newcol = [re.sub(r'\s+|\.', '', str(col).lower()) for col in one_naics.columns]

    ## rename on a copy so the caller's frame keeps its original column names
    renamed = one_naics.copy()
    renamed.columns = naics_newcol

    ## skip first row (0 index) and keep col if not unnamed/seq in col
    keep_cols = [col for col in naics_newcol if "unnamed" not in col and "seq" not in col]
    naics_keep = renamed.loc[1:, keep_cols].copy()
    return naics_keep

Unnamed: 0,Seq. No.,2017 NAICS US Code,2017 NAICS US Title,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,,,,,,
1,1.0,11.0,"Agriculture, Forestry, Fishing and Hunting",,,
2,2.0,111.0,Crop Production,,,
3,3.0,1111.0,Oilseed and Grain Farming,,,
4,4.0,11111.0,Soybean Farming,,,


Unnamed: 0,2017 NAICS Code,2017 NAICS Title,Unnamed: 2
0,,,
1,111110.0,Soybean Farming,
2,111120.0,Oilseed (except Soybean) Farming,
3,111130.0,Dry Pea and Bean Farming,
4,111140.0,Wheat Farming,


In [None]:
naics_26_c = clean_naics(naics_26)
naics_6_c = clean_naics(naics_6)

naics_26_c.head()
naics_6_c.head()

## rowbind using pd.concat and deduplicate
## to rowbind, need (1) identical colnames
## (2) good to also have identical types
print("Dtypes for 2-6 code data are:---------------------------")
naics_26_c.dtypes
print("Dtypes for 6 code data are:---------------------------")
naics_6_c.dtypes


## fix dtypes: store the code as a string in BOTH frames
## - the 6-digit file parses the code as float, so its string form ends in ".0";
##   strip only that trailing ".0" (deleting every "." would turn
##   "111110.0" into the seven-character "1111100")
## - the 2-6 digit file holds the code in an object column; cast it to str too so
##   the same code compares equal across the two files when deduplicating
## work on copies so the assignments never write into a slice of the raw tables
naics_6_c = naics_6_c.copy()
naics_6_c['2017naicscode'] = (naics_6_c['2017naicscode']
                              .astype(str)
                              .str.replace(r"\.0$", "", regex = True))
naics_26_c = naics_26_c.copy()
naics_26_c['2017naicsuscode'] = (naics_26_c['2017naicsuscode']
                                 .astype(str)
                                 .str.replace(r"\.0$", "", regex = True))


## use regular expressions to rename cols by subbing out the "us"
### way 1: riskier since could have "us" elsewhere in string
naics_26_cleancol = [re.sub("us", "", one_col) for one_col in naics_26_c.columns]
naics_26_cleancol


Unnamed: 0,2017naicsuscode,2017naicsustitle
1,11,"Agriculture, Forestry, Fishing and Hunting"
2,111,Crop Production
3,1111,Oilseed and Grain Farming
4,11111,Soybean Farming
5,111110,Soybean Farming


Unnamed: 0,2017naicscode,2017naicstitle
1,111110.0,Soybean Farming
2,111120.0,Oilseed (except Soybean) Farming
3,111130.0,Dry Pea and Bean Farming
4,111140.0,Wheat Farming
5,111150.0,Corn Farming


Dtypes for 2-6 code data are:---------------------------


2017naicsuscode     object
2017naicsustitle    object
dtype: object

Dtypes for 6 code data are:---------------------------


2017naicscode     float64
2017naicstitle     object
dtype: object

['2017naicscode', '2017naicstitle']

In [None]:
### way 2: will make sense after datacamp module; tell it where to look
### for the "us" substring and remove that group
### note: alternatives belong inside a group, (?:code|title); square brackets
### would instead match any ONE of the characters c, o, d, e, |, t, i, l
find_us_pattern = r"(.*naics)(us)((?:code|title).*)"

## let's test with 1; re.match returns None when there is no match,
## so only unpack the groups when a match was found
first_match = re.match(find_us_pattern, naics_26_c.columns[0])
if first_match is not None:
    found_g1, found_g2, found_g3 = first_match.groups()
    print(found_g1)
    print(found_g2)
    print(found_g3)

## consolidate into one: drop the "us" group where the pattern matches and
## leave any other column name unchanged (keeps the cell safe to re-run
## after the columns have already been renamed)
cleaned_colpattern = []
for one_col in naics_26_c.columns:
    col_match = re.match(find_us_pattern, one_col)
    if col_match is not None:
        cleaned_colpattern.append(col_match.group(1) + col_match.group(3))
    else:
        cleaned_colpattern.append(one_col)

naics_26_c.columns = cleaned_colpattern

2017naics
us
code


In [None]:
## finally :) 
## we can concatenate!

naics_all_raw = pd.concat([naics_26_c, naics_6_c])
naics_all_raw.shape
## drop exact duplicate rows; .copy() so the new column below is added to an
## independent frame rather than to a filtered view of naics_all_raw
naics_all = naics_all_raw.drop_duplicates().copy()

## flag sector-level rows: codes whose string form is exactly two characters long
## (the comparison already yields booleans, so no np.where is needed)
naics_all['is_twodig_code'] = naics_all['2017naicscode'].astype(str).str.len() == 2


(3253, 2)

# Examples of joins

To make the examples of rows dropping more realistic, we're going to work only with businesses where ownership_type == LP

In [None]:
## number of active certificates per ownership type
sd_active_biz['ownership_type'].value_counts()

SOLE      28719
CORP      18569
LLC        7912
SCORP      4555
H-W        1959
PARTNR     1671
NO/PRF      707
LP          466
TRUST        13
Name: ownership_type, dtype: int64

In [None]:
## limited partnerships only; .copy() gives a frame independent of sd_active_biz
sd_lp = sd_active_biz.loc[sd_active_biz['ownership_type'] == "LP"].copy()

## distinct sector codes in each source
n_sectors_full = len(sd_active_biz['naics_sector'].unique())
n_sectors_lp = len(sd_lp['naics_sector'].unique())
n_sectors_census = len(naics_all.loc[naics_all['is_twodig_code'], '2017naicscode'].unique())

"""The full SD data has {full_unique} naics codes while LPs come from {lp_unique} codes and \
there are {cen_unique} naics codes \
in this census file""".format(full_unique = n_sectors_full,
            lp_unique = n_sectors_lp,
            cen_unique = n_sectors_census)



'The full SD data has 24 naics codes while LPs come from 22 codes and there are 17 naics codes in this census file'

In [None]:

## final check on dtypes: the two join keys must share a type,
## so when they differ cast both sides to string
left_key_dtype = sd_lp['naics_sector'].dtypes
right_key_dtype = naics_all['2017naicscode'].dtypes
if left_key_dtype != right_key_dtype:
    naics_all['2017naicscode'] = naics_all['2017naicscode'].astype(str)
    sd_lp['naics_sector'] = sd_lp['naics_sector'].astype(str)

## "Inner join"- retain only two-dig naics codes in both 

- Filter to two-digit naics codes in the Census data
- Inner join the sd_lp data and these two-digit codes

After the merge, print diagnostics on dropped rows in SD businesses data and dropped NAICS codes

In [None]:
## census rows flagged as two-digit (sector-level) codes, as an independent frame
naics_twodig = naics_all.loc[naics_all['is_twodig_code']].copy()

In [None]:
## inner join: keep only LP rows whose sector code is among the census two-digit codes
sd_cen_inner = sd_lp.merge(naics_twodig,
                           how = "inner",
                           left_on = "naics_sector",
                           right_on = "2017naicscode")

"""In our original data, there were {n_orig} rows; \
now after dropping ones without a naics code \
in our current census data there are {n_new} rows\
""".format(n_orig = len(sd_lp), 
          n_new = len(sd_cen_inner))

## which naics codes got lost:
## rows whose account key is absent from the inner join,
## reduced to the distinct sector / description pairs
was_dropped = ~sd_lp['account_key'].isin(sd_cen_inner['account_key'])
lost_merge = sd_lp.loc[was_dropped, ['naics_sector', 'naics_description']].drop_duplicates()

lost_merge

## see ones like cpa; can also aggregate; seems at least some are misc
## and are probably in census data full 6-dig naics codes
def _join_descriptions(descriptions):
    """Collapse one sector's descriptions into a single '; '-separated string."""
    return "; ".join(descriptions)

lost_merge_dx = lost_merge.groupby('naics_sector').agg({'naics_description': _join_descriptions})

lost_merge_dx


'In our original data, there were 466 rows; now after dropping ones without a naics code in our current census data there are 403 rows'

Unnamed: 0,naics_sector,naics_description
4099,48,MOTOR VEHICLE TOWING
5855,44,GROCERY STORES
6563,49,WAREHOUSING & STORAGE
7489,32,ALL OTHER MISCELLANEOUS WOOD PRODUCT MFG
10796,44,GASOLINE STATIONS
13945,45,ALL OTHER MISC STORE RETAILERS (EXC TOBACCO)
15177,33,ALL OTH ELECTRICAL EQUIPMENT & COMPONENT MFG
15573,45,MISCELLANEOUS STORE RETAILERS
17179,44,USED CAR DEALERS
19656,44,GASOLINE STATIONS WITH CONVENIENCE STORES


Unnamed: 0_level_0,naics_description
naics_sector,Unnamed: 1_level_1
31,COFFEE & TEA MFG; WINERIES; ALL OTHER MISCELLA...
32,ALL OTHER MISCELLANEOUS WOOD PRODUCT MFG; COMM...
33,ALL OTH ELECTRICAL EQUIPMENT & COMPONENT MFG; ...
44,GROCERY STORES; GASOLINE STATIONS; USED CAR DE...
45,ALL OTHER MISC STORE RETAILERS (EXC TOBACCO); ...
48,MOTOR VEHICLE TOWING; OTHER AIR TRANSPORTATION...
49,WAREHOUSING & STORAGE; LOCAL MESSENGERS & LOCA...


## "Left join"- retain all sd businesses even if 2-dig naics code not in census

In [None]:
## first rows of the two-digit census codes used as the right-hand table
naics_twodig.head(n = 5)

Unnamed: 0,2017naicscode,2017naicstitle,is_twodig_code
1,11,"Agriculture, Forestry, Fishing and Hunting",True
132,21,"Mining, Quarrying, and Oil and Gas Extraction",True
180,22,Utilities,True
205,23,Construction,True
926,42,Wholesale Trade,True


In [None]:
## try merging: left join keeps every LP row; the indicator column records
## whether each row found a census match ("both") or not ("left_only")
sd_licensed_wnaics = pd.merge(sd_lp,
                             naics_twodig,
                             left_on = 'naics_sector',
                             right_on = '2017naicscode',
                             how = "left",
                             indicator = "naics_merge_status")
sd_licensed_wnaics.naics_merge_status.value_counts()

## look at sample of ones that didnt merge
unmatched_lp = sd_licensed_wnaics.loc[sd_licensed_wnaics.naics_merge_status == "left_only",
                  ['naics_code', 'naics_description', '2017naicstitle',
                  'dba_name']]

## fixed random_state so Restart & Run All shows the same rows each time;
## cap n at the number of unmatched rows so the sample cannot raise when
## fewer than 20 rows failed to merge
unmatched_lp.sample(n = min(20, len(unmatched_lp)), random_state = 20)


both          403
left_only      63
right_only      0
Name: naics_merge_status, dtype: int64

Unnamed: 0,naics_code,naics_description,2017naicstitle,dba_name
84,44112,USED CAR DEALERS,,CLASSIC ASSETS MOTORSPORTS CENTER
23,4451,GROCERY STORES,,O B QUIK STOP MARKET
24,4931,WAREHOUSING & STORAGE,,AAA ALLIANCE SELF STORAGE
373,45322,"GIFT, NOVELTY & SOUVENIR STORES",,AR WORKSHOP SAN DIEGO
394,4431,ELECTRONICS & APPLIANCE STORES,,VERIZON WIRELESS
17,48841,MOTOR VEHICLE TOWING,,TIC TAC TOW LLP
212,44531,"BEER, WINE & LIQUOR STORES",,HKG DUTY FREE
395,4431,ELECTRONICS & APPLIANCE STORES,,VERIZON WIRELESS
110,44711,GASOLINE STATIONS WITH CONVENIENCE STORES,,4S RANCH GASOLINE & CAR WASH
408,454,NONSTORE RETAILERS,,KCG VENTURES LLC


# Activity

- Going back to the full sd_active_biz data
- Go back to the six-digit NAICS codes and try the following merges:
    
    - Inner join with census data
    - Right join where in the sd_active_biz data, find the # of businesses per NAICS code and then merge that with the Census data
    - Left join with census data retaining all san diego businesses

    
- Using other fields in the SD businesses data like the date the business was started, do some merge diagnostics of whether, for instance, older businesses are more or less likely to be lost in the left join

## Inner join

In [None]:
## inspect the naics-related columns and the dtypes on both sides of the join
naics_cols = [col for col in sd_active_biz.columns if "naics" in col]
sd_active_biz[naics_cols].head()
sd_active_biz[naics_cols].dtypes

naics_all.dtypes

## string version of the SD code to join against the census code
sd_active_biz['naics_code_string'] = sd_active_biz['naics_code'].astype(str)

## pd.merge defaults to an inner join
good_merge = sd_active_biz.merge(naics_all,
                                 left_on = "naics_code_string",
                                 right_on = "2017naicscode")

n_rows_after = good_merge.shape[0]
n_rows_before = sd_active_biz.shape[0]
print("When we do an inner join, we retain {n_post} rows out of {n_pre} rows, or {perc} percent".
     format(n_post = n_rows_after,
           n_pre = n_rows_before,
           perc = round((n_rows_after/n_rows_before)*100)))


Unnamed: 0,naics_sector,naics_code,naics_description,naics_code_str,digits_in_naics,naics_code_string
0,44,442,FURNITURE & HOME FURNISHINGS STORES,442,3,442
1,23,23511,"PLUMBING, HEATING & AC CONTRACTOR",23511,5,23511
2,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
3,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
4,45,4539,OTHER MISCELLANEOUS STORE RETAILERS,4539,4,4539


naics_sector          int64
naics_code            int64
naics_description    object
naics_code_str       object
digits_in_naics       int64
naics_code_string    object
dtype: object

2017naicscode     object
2017naicstitle    object
is_twodig_code      bool
dtype: object

When we do an inner join, we retain 49087 rows out of 64571 rows, or 76 percent


In [None]:
## naics-related columns before adding the digit count
sd_active_biz.filter(like = "naics").head()

## number of characters in each SD naics code, then the same view again
sd_active_biz['digits_in_naics'] = sd_active_biz['naics_code'].astype(str).str.len()
sd_active_biz.filter(like = "naics").head()

sd_active_biz['digits_in_naics'].value_counts()

naics_all.head()

## string version of the code to use as the join key
sd_active_biz['naics_code_string'] = sd_active_biz['naics_code'].astype(str)
sd_active_biz.dtypes

## do inner
sd_inner = sd_active_biz.merge(naics_all,
                               how = "inner",
                               left_on = "naics_code_string",
                               right_on = "2017naicscode")

## share of SD rows retained by the inner join
len(sd_inner)/len(sd_active_biz)

naics_all.head()

Unnamed: 0,naics_sector,naics_code,naics_description,naics_code_str,digits_in_naics,naics_code_string
0,44,442,FURNITURE & HOME FURNISHINGS STORES,442,3,442
1,23,23511,"PLUMBING, HEATING & AC CONTRACTOR",23511,5,23511
2,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
3,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
4,45,4539,OTHER MISCELLANEOUS STORE RETAILERS,4539,4,4539


Unnamed: 0,naics_sector,naics_code,naics_description,naics_code_str,digits_in_naics,naics_code_string
0,44,442,FURNITURE & HOME FURNISHINGS STORES,442,3,442
1,23,23511,"PLUMBING, HEATING & AC CONTRACTOR",23511,5,23511
2,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
3,42,42199,OTHER MISCELLANEOUS DURABLE GOODS WHSLE,42199,5,42199
4,45,4539,OTHER MISCELLANEOUS STORE RETAILERS,4539,4,4539


5    30785
4    12819
6    12107
3     7801
2     1059
Name: digits_in_naics, dtype: int64

Unnamed: 0,2017naicscode,2017naicstitle,is_twodig_code
1,11,"Agriculture, Forestry, Fishing and Hunting",True
2,111,Crop Production,False
3,1111,Oilseed and Grain Farming,False
4,11111,Soybean Farming,False
5,111110,Soybean Farming,False


account_key                int64
account_status            object
account_status_code       object
date_account_creation     object
date_cert_expiration      object
date_cert_effective       object
business_owner_name       object
ownership_type            object
date_business_start       object
dba_name                  object
naics_sector               int64
naics_code                 int64
naics_description         object
address_no                object
address_pd                object
address_road              object
address_sfx               object
address_no_fraction       object
address_city              object
address_state             object
address_zip               object
address_suite             object
address_pmb_box           object
address_po_box           float64
bid                      float64
council_district         float64
lat                      float64
lng                      float64
naics_code_str            object
digits_in_naics            int64
naics_code

0.7602019482430193

Unnamed: 0,2017naicscode,2017naicstitle,is_twodig_code
1,11,"Agriculture, Forestry, Fishing and Hunting",True
2,111,Crop Production,False
3,1111,Oilseed and Grain Farming,False
4,11111,Soybean Farming,False
5,111110,Soybean Farming,False


## Right join

In [None]:
## first, create aggregate of san diego
## by naics code of number of businesses
## (number of distinct account keys per code string)

nbiz_pernaics = sd_active_biz.groupby('naics_code_string').agg({'account_key': 'nunique'}).reset_index()
nbiz_pernaics.columns = ['naics_code', 'n_sdbiz']
nbiz_pernaics.head()

## then, use that as the left table in a
## right join with census all, so every census code is kept
naics_wn_sdbiz = pd.merge(nbiz_pernaics,
                         naics_all,
                         how = "right",
                         left_on = "naics_code",
                         right_on = "2017naicscode",
                        indicator = "nbiz_merge_dx")

naics_wn_sdbiz.head()

## look at status- "right_only" rows are census codes with no SD business
naics_wn_sdbiz.nbiz_merge_dx.value_counts()

## view some with zero sd business
## - drop missing titles so the string join below cannot fail on a non-string
## - cap n at the number available and fix random_state so Restart & Run All
##   prints the same industries each time
no_sdbiz_titles = naics_wn_sdbiz.loc[naics_wn_sdbiz.nbiz_merge_dx == "right_only",
                                     '2017naicstitle'].dropna()
"""
A random sample of industries with no SD businesses with active \
tax certificates are: {naics_ex}
""".format(naics_ex = "; ".join(no_sdbiz_titles.sample(n = min(10, len(no_sdbiz_titles)),
                                                       random_state = 20)))

Unnamed: 0,naics_code,n_sdbiz
0,11,6
1,111,6
2,1111,1
3,111199,2
4,11121,2


Unnamed: 0,naics_code,n_sdbiz,2017naicscode,2017naicstitle,is_twodig_code,nbiz_merge_dx
0,11.0,6.0,11,"Agriculture, Forestry, Fishing and Hunting",True,both
1,111.0,6.0,111,Crop Production,False,both
2,1111.0,1.0,1111,Oilseed and Grain Farming,False,both
3,,,11111,Soybean Farming,False,right_only
4,,,111110,Soybean Farming,False,right_only


right_only    2365
both           888
left_only        0
Name: nbiz_merge_dx, dtype: int64

'\nA random sample of NAICS codes with no SD businesses with active tax certificates are: Employment Placement Agencies ; Meat and Meat Product Merchant Wholesalers ; Toy and Hobby Goods and Supplies Merchant Wholesalers ; Psychiatric and Substance Abuse Hospitals; Ice Manufacturing ; Wheat Farming; Fuel Dealers ; Motor Vehicle Seating and Interior Trim Manufacturing; Paperboard Mills; Other Support Activities for Air Transportation\n'

## Left join and diagnostics 

In [None]:
## left join to retain all SD active businesses

sd_active_biz_wnaics = pd.merge(sd_active_biz,
                    naics_all,
                    left_on = "naics_code_string",
                    right_on = "2017naicscode",
                    how = "left",
                    indicator = "naics_dx_merge")

## check merge indicator - share matched ("both") vs unmatched ("left_only")
sd_active_biz_wnaics.naics_dx_merge.value_counts(normalize = True)


## create binary indicator for lost in merge
## (the comparison already yields booleans, so no np.where is needed)
sd_active_biz_wnaics['is_lost_merge'] = sd_active_biz_wnaics.naics_dx_merge == "left_only"

## do some comparisons by group

### first make numeric version of date_business_start
### as difference between today and that date- just coercing errors 
### and using full timestamp; for actual work, would want to round
### to the day and do with static date
sd_active_biz_wnaics['date_bstart_dt'] =  pd.to_datetime(sd_active_biz_wnaics.date_business_start,
                                                        errors = "coerce")
today = datetime.now()
### tenure in nanoseconds, as a float: dividing by a 1 ns Timedelta keeps
### unparseable dates (NaT after the coercion above) as NaN, which .mean() skips;
### pd.to_numeric would instead turn NaT into the int64 minimum value
### and silently distort the group means
sd_active_biz_wnaics['tenure_asoftoday'] = ((today - sd_active_biz_wnaics.date_bstart_dt)
                                            / pd.Timedelta(1, unit = "ns"))

sd_active_biz_wnaics[[col for col in sd_active_biz_wnaics.columns if 
                        "date" in col] + ['tenure_asoftoday']].head()


## group by match and compare tenure
### a lower mean for is_lost_merge == True means newer businesses
### (shorter tenure) are more likely to be lost in merge
sd_active_biz_wnaics.groupby('is_lost_merge')['tenure_asoftoday'].mean()

## compare some categorical

### share lost in merge within each business improvement district (bid)
pd.crosstab(sd_active_biz_wnaics['is_lost_merge'],
           sd_active_biz_wnaics['bid'], normalize = "columns").T

### share lost in merge within each address city
pd.crosstab(sd_active_biz_wnaics['is_lost_merge'],
           sd_active_biz_wnaics['address_city'], normalize = "columns").T

both          0.760202
left_only     0.239798
right_only    0.000000
Name: naics_dx_merge, dtype: float64

Unnamed: 0,date_account_creation,date_cert_expiration,date_cert_effective,date_business_start,date_bstart_dt,tenure_asoftoday
0,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,1974-07-01 12:00:00,1974-07-01 12:00:00,1477046116345195000
1,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,1974-07-01 12:00:00,1974-07-01 12:00:00,1477046116345195000
2,1974-07-01 12:00:00,2021-06-30 00:00:00,2020-07-01 00:00:00,1974-07-01 12:00:00,1974-07-01 12:00:00,1477046116345195000
3,1974-07-01 12:00:00,2021-06-30 00:00:00,2020-07-01 00:00:00,1974-07-01 12:00:00,1974-07-01 12:00:00,1477046116345195000
4,1974-07-01 12:00:00,2021-06-30 12:00:00,2020-07-01 12:00:00,1974-07-01 12:00:00,1974-07-01 12:00:00,1477046116345195000


is_lost_merge
False    399714644745272640
True     345444528141030400
Name: tenure_asoftoday, dtype: int64

is_lost_merge,False,True
bid,Unnamed: 1_level_1,Unnamed: 2_level_1
4.0,0.827839,0.172161
6.0,0.84127,0.15873
7.0,0.673203,0.326797
10.0,0.728232,0.271768
11.0,0.7375,0.2625
12.0,0.741333,0.258667
13.0,0.816092,0.183908
14.0,0.758112,0.241888
16.0,0.805243,0.194757
17.0,0.760181,0.239819


is_lost_merge,False,True
address_city,Unnamed: 1_level_1,Unnamed: 2_level_1
,0.214286,0.785714
ADDISON,0.600000,0.400000
ADELANTO,0.333333,0.666667
AGUANGA,0.000000,1.000000
AHAHEIM,1.000000,0.000000
...,...,...
YORBA LINDA,0.200000,0.800000
YOUNGSTOWN,0.000000,1.000000
YOUNGSVILLE,1.000000,0.000000
YUMA,0.833333,0.166667
