## Imputation for zip codes
### Fraud Analytics Project I
#### by: Aslan

In [129]:
import pandas as pd
import numpy as np

In [130]:
df = pd.read_csv('NY property data.csv')

In [131]:
df.columns

Index(['RECORD', 'BBLE', 'B', 'BLOCK', 'LOT', 'EASEMENT', 'OWNER', 'BLDGCL',
       'TAXCLASS', 'LTFRONT', 'LTDEPTH', 'EXT', 'STORIES', 'FULLVAL', 'AVLAND',
       'AVTOT', 'EXLAND', 'EXTOT', 'EXCD1', 'STADDR', 'ZIP', 'EXMPTCL',
       'BLDFRONT', 'BLDDEPTH', 'AVLAND2', 'AVTOT2', 'EXLAND2', 'EXTOT2',
       'EXCD2', 'PERIOD', 'YEAR', 'VALTYPE'],
      dtype='object')

In [132]:
df_init = df[
        ['RECORD', 'BBLE', 'B', 'BLOCK', 'BLDGCL',
        'TAXCLASS', 'ZIP',  'STADDR','LTFRONT', 'LTDEPTH', 'STORIES', 
        'BLDFRONT', 'BLDDEPTH', 'FULLVAL', 'AVLAND',
        'AVTOT']]

In [5]:
null_zero = pd.DataFrame(dict(zip(['# of null', '# of zeros'], [df_init.isna().sum(), (df_init == 0).sum()])))
null_zero

Unnamed: 0,# of null,# of zeros
RECORD,0,0
BBLE,0,0
B,0,0
BLOCK,0,0
BLDGCL,0,0
TAXCLASS,0,0
ZIP,29890,0
STADDR,676,0
LTFRONT,0,169108
LTDEPTH,0,170128


In [6]:
df_init[(df_init.STADDR.isna())].ZIP.isna().sum()

676

In [7]:
uni = df_init.groupby(['B', 'BLOCK'])['ZIP'].unique()

In [8]:
cou = df_init.groupby(['B', 'BLOCK'])['ZIP'].count()

In [9]:
## here I group records by boro and block, and unique code columns correspond to unique zip codes in each group,
## represented in a list, and group size is the count of non-null values in each group

uni_cou = pd.DataFrame(dict(zip(['unique code', 'group size'], [uni, cou])))
uni_cou.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique code,group size
B,BLOCK,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,[10004.0],2
1,2,[10004.0],2
1,3,[10004.0],4
1,4,[10004.0],53
1,5,[10004.0],9
1,6,[10004.0],2
1,7,[10004.0],15
1,8,[10004.0],6
1,9,[10004.0],4
1,10,[10004.0],8


In [10]:
uni_cou['group size'].sum()

1041104

In [11]:
## I would like to know groups that only contain null values

uni_cou['null group'] = uni_cou['unique code'].apply(lambda x: 'Yes' if (len(x) == 1) & (pd.isnull(x[0])) else 'No')

In [12]:
null_uni_cou = uni_cou[uni_cou['null group'] == 'Yes']

In [13]:
null_uni_cou

Unnamed: 0_level_0,Unnamed: 1_level_0,unique code,group size,null group
B,BLOCK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,36,[nan],0,Yes
1,108,[nan],0,Yes
1,115,[nan],0,Yes
1,120,[nan],0,Yes
1,213,[nan],0,Yes
...,...,...,...,...
5,7770,[nan],0,Yes
5,7774,[nan],0,Yes
5,7806,[nan],0,Yes
5,7987,[nan],0,Yes


In [14]:
## I found that for these groups, most of them actually have street information, which is great
## so I checked below for those that don't have st information
## it turns out only 4 B-BLOCK groups with 5 records in total don't
## NICE

In [15]:
## B-BLOCK Groups that have *no* non-null ADDRESS values
B = df_init['B'].unique()

print("null zips with no address:\n")
for i in B:
    b, Block = zip(*null_uni_cou.loc[[i]].index)
    for j in Block:
        temp_df = df_init[(df['B'] == i) & (df['BLOCK'] == j)]
        if temp_df.STADDR.isna().sum() == temp_df.shape[0]:
            print("borough: {}, block: {}".format(i,j))

null zips with no address:

borough: 3, block: 4577
borough: 3, block: 4578
borough: 3, block: 4579
borough: 5, block: 6415


In [None]:
## so there are 4 records that have no zip and no street information at the same time

In [18]:
#df_init[(df_init['B'] == 2) & (df_init['BLOCK'] == 3231)]

In [19]:
df_init[(df['B'] == 3) & (df['BLOCK'] == 4577)]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
400057,400058,3045770001,3,4577,U5,3,,,0,0,,0,0,0.0,0.0,0.0


In [20]:
df_init[(df['B'] == 1) & (df['BLOCK'] == 108)]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
9019,9020,1001080004,1,108,V9,4,,SOUTH STREET,118,691,,0,0,55426140.0,24941763.0,24941763.0


In [22]:
#! pip install -U googlemaps

In [23]:
## i found that googlemap api can be a good tool to use
import googlemaps
gmaps = googlemaps.Client(key='AIzaSyDfDeD9wp9GmqjeZnwK5ZOyD19qhyp_xRs')

In [24]:
## define a function that return a zip code given an address; if no zip could be found, return nothing
def return_zip(add):
    boro = {1 : 'MANHATTAN', 2 : 'BRONX', 3 : 'BROOKLYN', 4 : 'QUEENS', 5 : 'STATEN ISLAND' }
    boro_num = add[0]
    ad = add[1]
    try:
        address = ad + boro[boro_num] + 'NY'
        result = gmaps.geocode(address)
        for dicts in result[0]['address_components']:
            if ''.join(dicts['types']) == 'postal_code':
                return int(dicts['long_name'])
                break
    except:
        return np.nan

In [None]:
## from now on, my codes seperate into two parts, working on two dataframes basically,
## one whould be the frame capturing B-BLOCK groups that have no non-null values
## another would be the frame capturing B-BLOCK groups that have at least on non-null values

In [25]:
new = pd.DataFrame()
for b_idx, bl_idx in null_uni_cou.index:
    new = new.append(df_init[(df_init['B'] == b_idx) & (df_init['BLOCK'] == bl_idx)])

In [26]:
new.head()

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
6092,6093,1000360012,1,36,Y7,4,,PIER 9,534,604,,139,604,21500000.0,9180000.0,9675000.0
6093,6094,1000360018,1,36,T2,4,,PIER11 SOUTH STREET,328,547,1.0,14,34,14025000.0,6030000.0,6311250.0
6094,6095,1000360025,1,36,T2,4,,PIER13 SOUTH STREET,352,607,1.0,98,24,13300000.0,5535000.0,5985000.0
6095,6096,1000360030,1,36,T2,4,,PIER14 SOUTH STREET,273,606,1.0,50,488,11600000.0,4770000.0,5220000.0
9019,9020,1001080004,1,108,V9,4,,SOUTH STREET,118,691,,0,0,55426140.0,24941763.0,24941763.0


In [27]:
new.shape

(2448, 16)

In [28]:
new.STADDR.isna().sum()

15

In [29]:
null_zip_add = new.copy()

In [30]:
## drop rows with no address

null_zip_add = null_zip_add[np.logical_not((null_zip_add.B == 3) & (null_zip_add.BLOCK == 4577))]
null_zip_add = null_zip_add[np.logical_not((null_zip_add.B == 3) & (null_zip_add.BLOCK == 4578))]
null_zip_add = null_zip_add[np.logical_not((null_zip_add.B == 3) & (null_zip_add.BLOCK == 4579))]
null_zip_add = null_zip_add[np.logical_not((null_zip_add.B == 5) & (null_zip_add.BLOCK == 6415))]

In [31]:
null_zip_add.shape ## precisely delete 5 records

(2443, 16)

In [32]:
## these 10 records are the cases when the group "B" "BLOCK" has *at least one* non-null STADDR values
null_zip_add[null_zip_add.STADDR.isna()]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
170213,170214,2032310001E,2,3231,U9,3,,,0,0,,0,0,0.0,0.0,0.0
170214,170215,2032310001F,2,3231,U1,3,,,0,0,,0,0,0.0,0.0,0.0
249673,249674,2057530135E,2,5753,U6,3,,,0,0,,0,0,0.0,0.0,0.0
380021,380022,3038150001E,3,3815,U7,3,,,149,570,,0,0,0.0,0.0,0.0
380023,380024,3038150030E,3,3815,U7,3,,,22,69,,0,0,0.0,0.0,0.0
380025,380026,3038150034E,3,3815,U7,3,,,22,26,,0,0,0.0,0.0,0.0
573689,573690,3088130050E,3,8813,U9,3,,,60,115,,0,0,0.0,0.0,0.0
603583,603584,4009170100E,4,917,U1,3,,,0,0,,0,0,0.0,0.0,0.0
922761,922762,4156980001E,4,15698,U9,3,,,50,360,,0,0,200.0,90.0,90.0
1052616,1052617,5063280100E,5,6328,U9,3,,,50,80,,0,0,0.0,0.0,0.0


In [33]:
temp = null_zip_add.copy()

In [34]:
## take really long time....

temp['ZIP'] = temp[['B', 'STADDR']].apply(return_zip, axis = 1)

In [35]:
temp.head()

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
6092,6093,1000360012,1,36,Y7,4,10038.0,PIER 9,534,604,,139,604,21500000.0,9180000.0,9675000.0
6093,6094,1000360018,1,36,T2,4,10005.0,PIER11 SOUTH STREET,328,547,1.0,14,34,14025000.0,6030000.0,6311250.0
6094,6095,1000360025,1,36,T2,4,10038.0,PIER13 SOUTH STREET,352,607,1.0,98,24,13300000.0,5535000.0,5985000.0
6095,6096,1000360030,1,36,T2,4,10038.0,PIER14 SOUTH STREET,273,606,1.0,50,488,11600000.0,4770000.0,5220000.0
9019,9020,1001080004,1,108,V9,4,,SOUTH STREET,118,691,,0,0,55426140.0,24941763.0,24941763.0


In [36]:
temp.ZIP.isna().sum()

1199

In [38]:
temp.ZIP.unique()

array([10038., 10005.,    nan, 10002., 10012., 10014., 10013., 10011.,
       10451., 10035., 10454., 10474., 10457., 10033., 10468., 10473.,
       10465., 10453., 10475., 10471., 11238., 11205., 11211., 11222.,
       11237., 11207., 10463., 11220., 11209., 11224., 11236., 11234.,
       11235., 11101., 11105., 11368., 11693., 11378., 11356., 11357.,
       11367., 11354., 11106., 11361., 11360., 11366., 11364., 11363.,
       11411., 11377., 11001., 11435., 11040., 11414., 11420., 11436.,
       11434., 11691., 11692., 11694., 11697., 10301., 10314., 10304.,
       10305., 10302., 10303., 10306., 11358., 10309., 10308., 31648.,
       10312., 10010., 54151., 10307.])

In [42]:
## i notice that there are two weird numbers: 31648 & 54151, so i looked them up

temp[(temp.ZIP == 54151) | (temp.ZIP == 31648)]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
1046019,1046020,5057660001,5,5766,V0,1B,31648.0,STATENVILLE AVENUE,200,300,,0,0,450000.0,2929.0,2929.0
1046020,1046021,5057670001,5,5767,V0,1B,31648.0,STATENVILLE AVENUE,200,202,,0,0,360000.0,2807.0,2807.0
1046021,1046022,5057680001,5,5768,V0,1B,31648.0,STATENVILLE AVENUE,60,100,,0,0,67500.0,2563.0,2563.0
1046022,1046023,5057680004,5,5768,V0,1B,31648.0,STATENVILLE AVENUE,140,305,,0,0,523000.0,4639.0,4639.0
1046023,1046024,5057690001,5,5769,V0,1B,31648.0,STATENVILLE AVENUE,160,320,,0,0,576000.0,5738.0,5738.0
1046024,1046025,5057690009,5,5769,V0,1B,31648.0,STATENVILLE AVENUE,40,100,,0,0,45000.0,1708.0,1708.0
1046032,1046033,5057710001,5,5771,V0,1B,31648.0,STATENVILLE AVENUE,40,100,,0,0,180000.0,5029.0,5029.0
1046033,1046034,5057710003,5,5771,V0,1B,31648.0,STATENVILLE AVENUE,40,100,,0,0,180000.0,5029.0,5029.0
1046301,1046302,5057970216,5,5797,V0,1B,31648.0,STATENVILLE AVENUE,80,100,,0,0,360000.0,9082.0,9082.0
1046302,1046303,5057970220,5,5797,V0,1B,31648.0,STATENVILLE AVENUE,120,100,,0,0,540000.0,13555.0,13555.0


In [40]:
## it will affect the imputation step since null records in the same group may use that number
temp[(temp.B == 5) & (temp.BLOCK == 7379)]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
1064677,1064678,5073790001,5,7379,V9,4,,CADY AVENUE,240,185,,0,0,874000.0,393300.0,393300.0
1064678,1064679,5073790015,5,7379,V9,4,54151.0,PEMBINE STREET,214,200,,0,0,754400.0,339480.0,339480.0


In [63]:
## therefore, for these records, I manually check their zip codes using http://maps.nyc.gov/doitt/nycitymap/
## where I can input BBLE number and get the exact location
## then I change the zip codes to correct ones

temp.loc[temp.ZIP == 31648, 'ZIP'] = 10312
temp.loc[temp.ZIP == 54151, 'ZIP'] = 10309

In [None]:
## Important Note:

# so i further check some other api-gerenated zip code, by comparing with the location shown on this website: 
# http://maps.nyc.gov/doitt/nycitymap/ and google the zip for that location. It turns out for some records
# there is disagreement bewteen nycitymap and google.
# it could be the reason that the street number are wrong, maybe in real world, we shoule go for BBLE and scrap
# location from the website (which seems to be more authoritive), but this right now goes beyong my knowledge and 
# it would take my a long time to figure it out. Therefore, I decide to stay with google api. 
# fortunately, even though zip may not be 100% accurate, they are very close.

## end

In [None]:
## triple check:

## now I want to do a final check with the api-returned zip codes
## for those returned records, if the returned zip falls out of the list of zip codes in that borough, I will replace
## it with the min zip in that borough. (here we can use other replacment)


In [78]:
## i copy and paste lists of zip codes for each borough; could do web sracping, but I'll save some energy.......

zip_1 = [10026, 10027, 10030, 10037, 10039,10001, 10011, 10018, 10019, 10020, 10036,10029, 10035,10010, 10016, 10017, 10022,10012, 10013, 10014,10004, 10005, 10006, 10007, 10038, 10280,10002, 10003, 10009,10021, 10028, 10044, 10065, 10075, 10128,10023, 10024, 10025,10031, 10032, 10033, 10034, 10040]

zip_2 = [10453, 10457, 10460,10458, 10467, 10468,10451, 10452, 10456,10454, 10455, 10459, 10474,10463, 10471,10466, 10469, 10470, 10475,10461, 10462,10464, 10465, 10472, 10473]

zip_3 = [11212, 11213, 11216, 11233, 11238,11209, 11214, 11228,11204, 11218, 11219, 11230,11234, 11236, 11239,11223, 11224, 11229, 11235,11201, 11205, 11215, 11217, 11231,11203, 11210, 11225, 11226,11207, 11208,11211, 11222,11220, 11232,11206, 11221, 11237]

zip_4 = [11361, 11362, 11363, 11364,11354, 11355, 11356, 11357, 11358, 11359, 11360,11365, 11366, 11367,11412, 11423, 11432, 11433, 11434, 11435, 11436,11101, 11102, 11103, 11104, 11105, 11106,11374, 11375, 11379, 11385,11691, 11692, 11693, 11694, 11695, 11697,11004, 11005, 11411, 11413, 11422, 11426, 11427, 11428, 11429,11414, 11415, 11416, 11417, 11418, 11419, 11420, 11421,11368, 11369, 11370, 11372, 11373, 11377, 11378]

zip_5 = [10302, 10303, 10310,10306, 10307, 10308, 10309, 10312,10301, 10304, 10305,10314]

In [85]:
zip_dict = {
    'zip_1': zip_1,
    'zip_2': zip_2,
    'zip_3': zip_3,
    'zip_4': zip_4,
    'zip_5': zip_5
}

In [98]:
def final_check(x):
    boro = x[0]
    z = x[1]
    
    if pd.isnull(z):
        return z
    else:
        boro = int(boro)
        z = int(z)
        if z not in zip_dict['zip_' + str(boro)]:
            return min(zip_dict['zip_' + str(boro)])
        else:
            return z

In [99]:
temp['ZIP'] = temp[['B', 'ZIP']].apply(final_check, axis = 1)

In [100]:
temp.ZIP.unique()

array([10038., 10005.,    nan, 10002., 10012., 10014., 10013., 10011.,
       10001., 10035., 10454., 10474., 10457., 10451., 10468., 10473.,
       10465., 10453., 10475., 10471., 11238., 11205., 11211., 11222.,
       11237., 11207., 11201., 11220., 11209., 11224., 11236., 11234.,
       11235., 11101., 11105., 11368., 11693., 11378., 11356., 11357.,
       11367., 11354., 11106., 11361., 11360., 11366., 11364., 11363.,
       11411., 11377., 11004., 11435., 11414., 11420., 11436., 11434.,
       11691., 11692., 11694., 11697., 10301., 10314., 10304., 10305.,
       10302., 10303., 10306., 10309., 10308., 10312., 10307.])

In [101]:
temp_uni = temp.groupby(['B', 'BLOCK'])['ZIP'].unique()
temp_cou = temp.groupby(['B', 'BLOCK'])['ZIP'].count()

temp_uni_cou = pd.DataFrame(dict(zip(['unique code', 'group size'], [temp_uni, temp_cou])))
temp_uni_cou['null group'] = temp_uni_cou['unique code'].apply(lambda x: 'Yes' if (len(x) == 1) & (pd.isnull(x[0])) else 'No')
temp_uni_cou.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unique code,group size,null group
B,BLOCK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,36,"[10038.0, 10005.0]",4,No
1,108,[nan],0,Yes
1,115,[nan],0,Yes
1,120,[10038.0],1,No
1,213,[nan],0,Yes


In [102]:
## so there are still groups that have no non-null address values
null_temp_uni_cou = temp_uni_cou[temp_uni_cou['null group'] == 'Yes']
null_temp_uni_cou

Unnamed: 0_level_0,Unnamed: 1_level_0,unique code,group size,null group
B,BLOCK,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,108,[nan],0,Yes
1,115,[nan],0,Yes
1,213,[nan],0,Yes
1,316,[nan],0,Yes
1,340,[nan],0,Yes
...,...,...,...,...
5,7746,[nan],0,Yes
5,7747,[nan],0,Yes
5,7748,[nan],0,Yes
5,7749,[nan],0,Yes


In [103]:
## first I want to use median as imputed value
## but it creates decimal places
## so I change it to most frequent
## update:

def zip_fill(x):
    b = x[0]
    bl = x[1]
    zip_code = x[2]
    
    if pd.isnull(zip_code):
        try: 
            return temp[(temp.B == b) & (temp.BLOCK == bl)]['ZIP'].value_counts().idxmax()
        except:
            return np.nan   ## some group may have empty sequence
    else:
        return zip_code

In [104]:
temp['ZIP'] = temp[['B', 'BLOCK', 'ZIP']].apply(zip_fill, axis = 1)

In [105]:
temp.head()

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
6092,6093,1000360012,1,36,Y7,4,10038.0,PIER 9,534,604,,139,604,21500000.0,9180000.0,9675000.0
6093,6094,1000360018,1,36,T2,4,10005.0,PIER11 SOUTH STREET,328,547,1.0,14,34,14025000.0,6030000.0,6311250.0
6094,6095,1000360025,1,36,T2,4,10038.0,PIER13 SOUTH STREET,352,607,1.0,98,24,13300000.0,5535000.0,5985000.0
6095,6096,1000360030,1,36,T2,4,10038.0,PIER14 SOUTH STREET,273,606,1.0,50,488,11600000.0,4770000.0,5220000.0
9019,9020,1001080004,1,108,V9,4,,SOUTH STREET,118,691,,0,0,55426140.0,24941763.0,24941763.0


In [106]:
## 776 records that google map could not locate the zip & B-BLOCK group don't have non-null values.
temp.ZIP.isna().sum()

776

In [None]:
## Note:

## first, I think we should not drop any null records
## again, for these null values, we could use the website to figure them out, but it takes lot of time
## so alternatively, I will just assign them the corresponding boro min zip

In [107]:
def stubborn_nulls(x):
    boro = x[0]
    z = x[1]
    
    if pd.isnull(z):
        boro = int(boro)
        return min(zip_dict['zip_' + str(boro)])
    else:
        return z

In [108]:
temp['ZIP'] = temp[['B','ZIP']].apply(stubborn_nulls, axis = 1)

In [109]:
temp.ZIP.isna().sum()

0

In [110]:
## this is the dataframe where each B-BLOCK group have at least on non-null zip values
old = df_init.copy()
old.drop(new.index, axis = 0, inplace = True)

In [111]:
## 1068546 + 2448 = 1070994
old.shape

(1068546, 16)

In [112]:
old.ZIP.isna().sum()

27442

In [113]:
## we know for this dataset, there will be no empty group, so we simply impute with most frequent zip in each group
def zip_fill_2(x):
    b = x[0]
    bl = x[1]
    zip_code = x[2]
    
    if pd.isnull(zip_code):
        return old[(old.B == b) & (old.BLOCK == bl)]['ZIP'].value_counts().idxmax()
    else:
        return zip_code

In [114]:
old['ZIP'] = old[['B', 'BLOCK', 'ZIP']].apply(zip_fill_2, axis = 1)

In [115]:
old.ZIP.isna().sum()

0

In [116]:
## total number check

temp.shape[0] + old.shape[0] + 5

1070994

In [117]:
temp.shape

(2443, 16)

In [118]:
## remember that we exclude 5 records earlier?
## need to add them back and apply stubborn_nulls again

temp = temp.append(new[(new.B == 3) & (new.BLOCK == 4577)],ignore_index=True)
temp = temp.append(new[(new.B == 3) & (new.BLOCK == 4578)],ignore_index=True)
temp = temp.append(new[(new.B == 3) & (new.BLOCK == 4579)],ignore_index=True)
temp = temp.append(new[(new.B == 5) & (new.BLOCK == 6415)],ignore_index=True)

In [119]:
temp.shape ## yeah, we added them back successfully!

(2448, 16)

In [120]:
temp['ZIP'] = temp[['B','ZIP']].apply(stubborn_nulls, axis = 1)

In [121]:
temp.ZIP.isna().sum()

0

In [122]:
## FINALLY, we can combine them together and end this imputation...

imputed_zip = old.append(temp, ignore_index = True)
imputed_zip.head()

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
0,1,1000010101,1,1,P7,4,10004.0,1 LIBERTY ISLAND,500,1046,,0,0,21400000.0,4225500.0,9630000.0
1,2,1000010201,1,1,Z9,4,10004.0,1 ELLIS ISLAND,27,0,,0,0,193800000.0,14310000.0,87210000.0
2,3,1000020001,1,2,Y7,4,10004.0,MARGINAL STREET,709,564,3.0,709,564,104686000.0,39008700.0,47108700.0
3,4,1000020023,1,2,T2,4,10004.0,PIER 6,793,551,2.0,85,551,39200000.0,15255000.0,17640000.0
4,5,1000030001,1,3,Q1,4,10004.0,BATTERY PARK,323,1260,1.0,89,57,272300000.0,121050000.0,122535000.0


In [123]:
imputed_zip.ZIP.value_counts()

10314.0    25651
11234.0    20364
10312.0    20224
10306.0    18041
10462.0    17070
           ...  
11359.0        1
11352.0        1
10162.0        1
11005.0        1
11241.0        1
Name: ZIP, Length: 196, dtype: int64

In [124]:
imputed_zip.ZIP.unique()

array([10004., 10280., 10281., 10282., 10007., 10006., 10005., 10038.,
       10048., 10013., 10002., 10009., 10003., 10012., 10014., 10001.,
       10011., 10016., 10128., 10030., 10019., 10018., 10010., 10036.,
       10020., 10023., 10028., 10024., 10069., 10025., 10017., 10022.,
       10044., 10065., 10021., 10075., 10162., 10026., 10029., 10035.,
       10027., 10037., 10039., 10031., 10040., 10032., 10033., 10034.,
       10463., 10454., 10451., 10455., 10456., 10468., 10458., 10459.,
       10452., 10474., 10465., 11370., 10472., 10457., 10453., 10460.,
       10464., 10470., 10467., 10466., 10473., 10462., 10461., 10469.,
       10475., 10803., 10471., 11201., 11217., 11242., 11241., 11231.,
       11215., 11230., 11232., 11218., 11220., 11238., 11226., 11205.,
       11225., 11216., 11213., 11207., 11203., 11233., 11234., 11236.,
       11212., 11221., 11206., 11211., 11227., 11243., 11222., 11223.,
       11237., 11209., 11208., 11416., 11239., 11210., 11219., 11228.,
      

In [126]:
imputed_zip.shape

(1070994, 16)

In [134]:
## update:
## note there is one 33803 zip, this is from the original zip, we I've checked is not matching with the actual location

imputed_zip[imputed_zip.ZIP == 33803]

Unnamed: 0,RECORD,BBLE,B,BLOCK,BLDGCL,TAXCLASS,ZIP,STADDR,LTFRONT,LTDEPTH,STORIES,BLDFRONT,BLDDEPTH,FULLVAL,AVLAND,AVTOT
502699,502995,3070210014,3,7021,G7,4,33803.0,WEST 16 STREET,20,118,,0,0,59900.0,26955.0,26955.0
502700,502996,3070210015,3,7021,G7,4,33803.0,WEST 16 STREET,20,118,,0,0,95000.0,42750.0,42750.0
502701,502997,3070210016,3,7021,V0,1B,33803.0,WEST 16 STREET,40,118,,0,0,475000.0,6566.0,6566.0


In [135]:
imputed_zip.loc[imputed_zip.ZIP == 33803, 'ZIP'] = 11224

In [136]:
imputed_zip.ZIP.unique()

array([10004., 10280., 10281., 10282., 10007., 10006., 10005., 10038.,
       10048., 10013., 10002., 10009., 10003., 10012., 10014., 10001.,
       10011., 10016., 10128., 10030., 10019., 10018., 10010., 10036.,
       10020., 10023., 10028., 10024., 10069., 10025., 10017., 10022.,
       10044., 10065., 10021., 10075., 10162., 10026., 10029., 10035.,
       10027., 10037., 10039., 10031., 10040., 10032., 10033., 10034.,
       10463., 10454., 10451., 10455., 10456., 10468., 10458., 10459.,
       10452., 10474., 10465., 11370., 10472., 10457., 10453., 10460.,
       10464., 10470., 10467., 10466., 10473., 10462., 10461., 10469.,
       10475., 10803., 10471., 11201., 11217., 11242., 11241., 11231.,
       11215., 11230., 11232., 11218., 11220., 11238., 11226., 11205.,
       11225., 11216., 11213., 11207., 11203., 11233., 11234., 11236.,
       11212., 11221., 11206., 11211., 11227., 11243., 11222., 11223.,
       11237., 11209., 11208., 11416., 11239., 11210., 11219., 11228.,
      

In [138]:
imputed_zip.ZIP.isna().sum()

0

In [139]:
## END: Thanks for watching

imputed_zip.to_csv('Imputed_Zip_Data_New.csv')

In [None]:
## should have sorted by record ##