In [91]:
import pandas as pd
import numpy as np
import time
import json

In [92]:
pd.__version__

'0.22.0'

In [93]:
def read_income_file(csv_file):
    df_income = pd.read_csv(csv_file)
    df_income = df_income[['GEO.display-label','GEO.id2','HC02_EST_VC02']]
    row_to_drop = []
    for index, row in df_income.iterrows():
        if row['HC02_EST_VC02'].isdigit() == False or row['GEO.id2'] < 2101 or row['GEO.id2'] > 2467:
            row_to_drop.append(index)
    df_income = df_income.drop(index=row_to_drop).reset_index(drop=True)
    selected_zipcode = set()
    for index, row in df_income.iterrows():
        df_income.loc[index,'GEO.id2'] = '0' + str(row['GEO.id2'])
        selected_zipcode.add(df_income.loc[index,'GEO.id2'])
    return df_income, selected_zipcode

In [94]:
def read_race_file(csv_file, selected_zipcode):
    df_race = pd.read_csv(csv_file)
    df_race = df_race.drop(columns=['GEO.id'])
    row_to_drop = []
    for index, row in df_race.iterrows():
        if row['GEO.display-label'][-5:] not in selected_zipcode:
            row_to_drop.append(index)
    df_race = df_race.drop(index=row_to_drop).reset_index(drop=True)
    df_race = df_race.drop(columns=['GEO.id2', 'GEO.display-label'])
    return df_race

In [95]:
def concat_df(df_income, df_race):
    df = pd.concat([df_income, df_race], axis=1, join_axes=[df_income.index])
    df = df.drop(columns=['GEO.display-label', 'HD02_VD01', 'HD02_VD02', 
                     'HD02_VD03', 'HD02_VD04', 'HD02_VD05', 'HD02_VD06',
                     'HD02_VD07', 'HD02_VD08', 'HD02_VD09', 'HD02_VD10'])
    df = df.rename(index=str, columns={'GEO.id2': 'Zip Code',
                             'HC02_EST_VC02': 'Household Mean Income',
                             'HD01_VD01': 'Total',
                             'HD01_VD02': 'White alone',
                             'HD01_VD03': 'Black or African American alone',
                             'HD01_VD04': 'American Indian and Alaska Native alone',
                             'HD01_VD05': 'Asian alone',
                             'HD01_VD06': 'Native Hawaiian and Other Pacific Islander alone',
                             'HD01_VD07': 'Some other race alone',
                             'HD01_VD08': ' Two or more races',
                             'HD01_VD09': 'Two or more races/ Two races including Some other race',
                             'HD01_VD10': 'Two or more races/ Two races excluding Some other race, and three or more races'})
    return df

In [96]:
df_income_2016, selected_zipcode_2016 = read_income_file('MeanIncome.csv')
df_race_2016 = read_race_file('Race.csv', selected_zipcode_2016)
df_2016 = concat_df(df_income_2016,df_race_2016)
df_2016

Unnamed: 0,Zip Code,Household Mean Income,Total,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races,Two or more races/ Two races including Some other race,"Two or more races/ Two races excluding Some other race, and three or more races"
0,02108,184161,4049,3515,209,26,172,0,31,96,0,96
1,02109,169961,4015,3497,135,0,249,0,68,66,0,66
2,02110,251376,2124,1814,83,0,206,0,0,21,0,21
3,02111,112127,7564,3157,212,0,3926,0,64,205,30,175
4,02113,106761,7495,7046,27,0,206,0,87,129,34,95
5,02114,133075,13001,10294,560,0,1475,0,350,322,7,315
6,02115,67392,28976,19475,2574,49,4386,27,1450,1015,234,781
7,02116,170240,23004,17745,1126,42,3310,0,276,505,60,445
8,02118,117991,27664,14723,4953,121,4140,0,2441,1286,179,1107
9,02119,47250,26833,3964,16043,87,695,0,4721,1323,445,878


In [97]:
df_income_2015, selected_zipcode_2015 = read_income_file('MeanIncome_2015.csv')
df_race_2015 = read_race_file('Race_2015.csv', selected_zipcode_2015)
df_2015 = concat_df(df_income_2016,df_race_2016)
df_2015

Unnamed: 0,Zip Code,Household Mean Income,Total,White alone,Black or African American alone,American Indian and Alaska Native alone,Asian alone,Native Hawaiian and Other Pacific Islander alone,Some other race alone,Two or more races,Two or more races/ Two races including Some other race,"Two or more races/ Two races excluding Some other race, and three or more races"
0,02108,184161,4049,3515,209,26,172,0,31,96,0,96
1,02109,169961,4015,3497,135,0,249,0,68,66,0,66
2,02110,251376,2124,1814,83,0,206,0,0,21,0,21
3,02111,112127,7564,3157,212,0,3926,0,64,205,30,175
4,02113,106761,7495,7046,27,0,206,0,87,129,34,95
5,02114,133075,13001,10294,560,0,1475,0,350,322,7,315
6,02115,67392,28976,19475,2574,49,4386,27,1450,1015,234,781
7,02116,170240,23004,17745,1126,42,3310,0,276,505,60,445
8,02118,117991,27664,14723,4953,121,4140,0,2441,1286,179,1107
9,02119,47250,26833,3964,16043,87,695,0,4721,1323,445,878


In [98]:
df_crime = pd.read_csv('crime_filtered.csv')
df_crime_16 = df_crime[df_crime.YEAR == 2016]
df_crime_15 = df_crime[df_crime.YEAR == 2015]
df_crime_16

Unnamed: 0,INCIDENT_NUMBER,OFFENSE_CODE,OFFENSE_CODE_GROUP,OFFENSE_DESCRIPTION,DISTRICT,REPORTING_AREA,SHOOTING,OCCURRED_ON_DATE,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,Location
5466,I162106415,1874,Drug Violation,DRUGS - OTHER,D14,282,,12/31/16 18:50,2016,12,Saturday,18,Part Two,WASHINGTON ST,42.328663,-71.085634,"(42.32866284, -71.08563401)"
5467,I162106406,1845,Drug Violation,DRUGS - POSS CLASS D,B3,,,12/31/16 16:08,2016,12,Saturday,16,Part Two,,,,"(0.00000000, 0.00000000)"
5468,I162106170,1849,Drug Violation,"DRUGS - POSS CLASS B - COCAINE, ETC.",C6,,,12/30/16 18:52,2016,12,Friday,18,Part Two,,42.327097,-71.052536,"(42.32709687, -71.05253559)"
5469,I162106167,1830,Drug Violation,DRUGS - SICK ASSIST - HEROIN,B2,181,,12/30/16 18:12,2016,12,Friday,18,Part Two,WOODWARD AVE,42.325029,-71.073414,"(42.32502870, -71.07341448)"
5470,I162106157,1845,Drug Violation,DRUGS - POSS CLASS D,B3,431,,12/30/16 14:57,2016,12,Friday,14,Part Two,ARBUTUS ST,42.287501,-71.088298,"(42.28750082, -71.08829836)"
5471,I162106156,1810,Drug Violation,DRUGS - SALE / MANUFACTURING,C6,177,,12/30/16 14:00,2016,12,Friday,14,Part Two,MASSACHUSETTS AVE,42.331521,-71.070853,"(42.33152148, -71.07085307)"
5472,I162106156,1815,Drug Violation,DRUGS - POSSESSION,C6,177,,12/30/16 14:00,2016,12,Friday,14,Part Two,MASSACHUSETTS AVE,42.331521,-71.070853,"(42.33152148, -71.07085307)"
5473,I162106127,1842,Drug Violation,"DRUGS - POSS CLASS A - HEROIN, ETC.",D4,172,,12/30/16 15:20,2016,12,Friday,15,Part Two,ALBANY ST,42.335206,-71.071103,"(42.33520569, -71.07110315)"
5474,I162106118,1849,Drug Violation,"DRUGS - POSS CLASS B - COCAINE, ETC.",D14,282,,12/30/16 13:49,2016,12,Friday,13,Part Two,WASHINGTON ST,42.328663,-71.085634,"(42.32866284, -71.08563401)"
5475,I162106095,1874,Drug Violation,DRUGS - OTHER,D14,783,,12/30/16 13:37,2016,12,Friday,13,Part Two,WARREN ST,42.349886,-71.145230,"(42.34988575, -71.14522971)"


In [125]:
lat_crime_16 = df_crime_16['Lat'].values
long_crime_16  = df_crime_16['Long'].values
location_crime_16 = [[str(lat_crime_16[i]), str(long_crime_16[i])] for i in range(len(lat_crime_16))]
lat_crime_15 = df_crime_15['Lat'].values
long_crime_15  = df_crime_15['Long'].values
location_crime_15 = [[str(lat_crime_15[i]), str(long_crime_15[i])] for i in range(len(lat_crime_15))]

In [126]:
print(len(location_crime_15))
print(len(location_crime_16))

3299
5288


In [127]:
def drop_nan(location_crime_15):
    loc_to_ignore_15 = []
    for idx, loc in enumerate(location_crime_15):
        if loc[0] == 'nan':
            loc_to_ignore_15.append(idx)
    df_2015 = pd.DataFrame(location_crime_15)
    #print(len(loc_to_ignore_15))
    df_2015 = df_2015.drop(loc_to_ignore_15)
    lat = df_2015[0].values
    long = df_2015[1].values
    location_crime_15 = [[lat[i], long[i]] for i in range(len(lat))]
    #print(len(location_crime_15))
    return location_crime_15

In [128]:
location_crime_15 = drop_nan(location_crime_15)
location_crime_16 = drop_nan(location_crime_16)

In [129]:
print(len(location_crime_15))
print(len(location_crime_16))

3061
4679


In [147]:
from geopy.geocoders import Nominatim
def get_zipcode(locations):
    crime_zipcode = []
    geolocator = Nominatim()
    count = 0
    for i in range(len(locations)):
        count += 1
        if (count == 5):
            time.sleep(2)
            count = 0
        try:
            addr = geolocator.reverse(locations[i][0] + ',' + locations[i][1]).address
            if addr != None:
                zipcode = addr.split(',')[-2]
                crime_zipcode.append(zipcode)
                print('i:', i, 'zip code:', zipcode)
        except:
            print('error')
            continue
    return crime_zipcode

In [148]:
crime_zipcode = get_zipcode(location_crime_16[:500])

i: 0 zip code:  02114
i: 1 zip code:  02114
i: 2 zip code:  02119
i: 3 zip code:  02124
i: 4 zip code:  02114
i: 5 zip code:  02114
i: 6 zip code:  02114
i: 7 zip code:  02114
i: 8 zip code:  02135
i: 9 zip code:  02119
i: 10 zip code:  02119
i: 11 zip code:  02114
i: 12 zip code:  02114
i: 13 zip code:  02114
i: 14 zip code:  02114
i: 15 zip code:  02114
i: 16 zip code:  02114
i: 17 zip code:  02114
i: 18 zip code:  02114
i: 19 zip code:  02134
i: 20 zip code:  02135
i: 21 zip code:  02114
i: 22 zip code:  02119
i: 23 zip code:  02119
i: 24 zip code:  02114
i: 25 zip code:  02114
i: 26 zip code:  02114
i: 27 zip code:  02114
i: 28 zip code:  02114
i: 29 zip code:  02114
i: 30 zip code:  02114
i: 31 zip code:  02126
i: 32 zip code:  02109
i: 33 zip code:  02109
i: 34 zip code:  02109
i: 35 zip code:  02131
i: 36 zip code:  02128
i: 37 zip code:  02128
i: 38 zip code:  02132-3226
i: 39 zip code:  02114
i: 40 zip code:  02114
i: 41 zip code:  02114
i: 42 zip code:  02114
i: 43 zip code: 

i: 344 zip code:  02114
i: 345 zip code:  02114
i: 346 zip code:  02114
i: 347 zip code:  02114
i: 348 zip code:  02114
i: 349 zip code:  02114
i: 350 zip code:  02114
i: 351 zip code:  02125
i: 352 zip code:  02125
i: 353 zip code:  02125
i: 354 zip code:  02125
i: 355 zip code:  02125
i: 356 zip code:  02125
i: 357 zip code:  02114
i: 358 zip code:  02114
i: 359 zip code:  02114
i: 360 zip code:  02114
i: 361 zip code:  MA 02135
i: 362 zip code:  02118
i: 363 zip code:  02114
i: 364 zip code:  02114
i: 365 zip code:  02114
i: 366 zip code:  02119
i: 367 zip code:  02128
i: 368 zip code:  02114
i: 369 zip code:  02114
i: 370 zip code:  02114
i: 371 zip code:  02114
i: 372 zip code:  02114
i: 373 zip code:  02119
i: 374 zip code:  02435
i: 375 zip code:  02114
i: 376 zip code:  02114
i: 377 zip code:  02114
i: 378 zip code:  02128
i: 379 zip code:  02114
i: 380 zip code:  02134
i: 381 zip code:  02114
i: 382 zip code:  02134
i: 383 zip code:  02114
i: 384 zip code:  02126
i: 385 zip co

In [149]:
crime_zipcode2 = get_zipcode(location_crime_16[500:len(location_crime_16)])

i: 0 zip code:  02126
i: 1 zip code:  02114
i: 2 zip code:  02114
i: 3 zip code:  02127
i: 4 zip code:  02114
i: 5 zip code:  02114
i: 6 zip code:  02114
i: 7 zip code:  02127
i: 8 zip code:  02127
i: 9 zip code:  MA 02186
i: 10 zip code:  MA 02186
i: 11 zip code:  02132-3226
i: 12 zip code:  02119
i: 13 zip code:  02118
i: 14 zip code:  02118
i: 15 zip code:  02118
i: 16 zip code:  02118
i: 17 zip code:  02125
error
i: 19 zip code:  02125
i: 20 zip code:  02130
i: 21 zip code:  02114
i: 22 zip code:  02114
i: 23 zip code:  02114
i: 24 zip code:  02114
i: 25 zip code:  02114
i: 26 zip code:  02114
i: 27 zip code:  02125
i: 28 zip code:  02114
i: 29 zip code:  02114
i: 30 zip code:  02126
i: 31 zip code:  02126
i: 32 zip code:  02124
i: 33 zip code:  02128
i: 34 zip code:  02128
i: 35 zip code:  02128
i: 36 zip code:  02114
i: 37 zip code:  02125
i: 38 zip code:  02125
i: 39 zip code:  02114
i: 40 zip code:  02129
i: 41 zip code:  02128
i: 42 zip code:  02114
i: 43 zip code:  02128
i: 4

i: 349 zip code:  02134
i: 350 zip code:  02136-2460
i: 351 zip code:  02114
i: 352 zip code:  02114
i: 353 zip code:  02114
i: 354 zip code:  02119
i: 355 zip code:  02119
i: 356 zip code:  02114
i: 357 zip code:  02114
i: 358 zip code:  02114
i: 359 zip code:  01250
i: 360 zip code:  02152
i: 361 zip code:  02121
i: 362 zip code:  02121
i: 363 zip code:  02126
i: 364 zip code:  02136
i: 365 zip code:  02114
i: 366 zip code:  02135
i: 367 zip code:  02135
i: 368 zip code:  02114
i: 369 zip code:  02125
i: 370 zip code:  02114
i: 371 zip code:  02114
i: 372 zip code:  02114
i: 373 zip code:  02114
i: 374 zip code:  02114
i: 375 zip code:  02114
i: 376 zip code:  MA 02186
i: 377 zip code:  MA 02186
i: 378 zip code:  02121
i: 379 zip code:  02132-3226
i: 380 zip code:  02114
i: 381 zip code:  02171
i: 382 zip code:  02114
i: 383 zip code:  02114
i: 384 zip code:  02114
i: 385 zip code:  02114
i: 386 zip code:  02114
i: 387 zip code:  02114
i: 388 zip code:  02126
i: 389 zip code:  02114


i: 691 zip code:  02119
i: 692 zip code:  02119
i: 693 zip code:  02119
i: 694 zip code:  02119
i: 695 zip code:  02114
i: 696 zip code:  02114
i: 697 zip code:  02114
i: 698 zip code:  02114
i: 699 zip code:  02114
i: 700 zip code:  02114
i: 701 zip code:  02114
i: 702 zip code:  02114
i: 703 zip code:  02114
i: 704 zip code:  02130
i: 705 zip code:  02136-2460
i: 706 zip code:  02136-2460
i: 707 zip code:  02136-2460
i: 708 zip code:  02114
i: 709 zip code:  02124
i: 710 zip code:  02124
i: 711 zip code:  02152
i: 712 zip code:  02152
i: 713 zip code:  02119
i: 714 zip code:  02119
i: 715 zip code:  02119
i: 716 zip code:  02130
i: 717 zip code:  02130
i: 718 zip code:  02114
i: 719 zip code:  02114
i: 720 zip code:  02114
i: 721 zip code:  02114
i: 722 zip code:  02114
i: 723 zip code:  02118
i: 724 zip code:  02118
i: 725 zip code:  02131
i: 726 zip code:  02114
i: 727 zip code:  02114
i: 728 zip code:  02124
i: 729 zip code:  02114
i: 730 zip code:  02114
i: 731 zip code:  02114
i

i: 1034 zip code:  02125
i: 1035 zip code:  02128
i: 1036 zip code:  02114
i: 1037 zip code:  02131-4931
i: 1038 zip code:  02114
i: 1039 zip code:  02114
i: 1040 zip code:  02124
i: 1041 zip code:  02130
i: 1042 zip code:  02130
i: 1043 zip code:  02130
i: 1044 zip code:  02114
i: 1045 zip code:  02124
i: 1046 zip code:  02124
i: 1047 zip code:  02114
i: 1048 zip code:  02114
i: 1049 zip code:  02131-4931
i: 1050 zip code:  02131-4931
i: 1051 zip code:  02130
i: 1052 zip code:  02124
i: 1053 zip code:  02114
i: 1054 zip code:  02134
i: 1057 zip code:  02130
i: 1058 zip code:  02114
i: 1059 zip code:  02132
i: 1060 zip code:  02114
i: 1061 zip code:  02124
i: 1062 zip code:  02124
i: 1063 zip code:  02114
i: 1064 zip code:  02114
i: 1065 zip code:  02114
i: 1066 zip code:  02114
i: 1067 zip code:  02131-3025
i: 1068 zip code:  02114
i: 1069 zip code:  02114
i: 1070 zip code:  02114
i: 1071 zip code:  02136-2460
i: 1072 zip code:  02128
i: 1073 zip code:  02122
i: 1074 zip code:  02114


i: 1365 zip code:  02114
i: 1366 zip code:  02114
i: 1367 zip code:  02114
i: 1368 zip code:  02114
i: 1369 zip code:  02114
i: 1370 zip code:  02121
i: 1371 zip code:  02134-1433
i: 1372 zip code:  02134-1433
i: 1373 zip code:  02114
i: 1374 zip code:  02114
i: 1375 zip code:  02122
i: 1376 zip code:  02126
i: 1377 zip code:  02126
i: 1378 zip code:  02119
i: 1379 zip code:  02114
i: 1380 zip code:  02114
i: 1381 zip code:  02131-4931
i: 1382 zip code:  02124
i: 1383 zip code:  02119
i: 1384 zip code:  02129
i: 1385 zip code:  02121
i: 1386 zip code:  02114
i: 1387 zip code:  02114
i: 1388 zip code:  02114
i: 1389 zip code:  02126
i: 1390 zip code:  02130
i: 1391 zip code:  02152
i: 1392 zip code:  02118
i: 1393 zip code:  02128
i: 1394 zip code:  02128
i: 1395 zip code:  02114
i: 1396 zip code:  02114
i: 1397 zip code:  02114
i: 1398 zip code:  02114
i: 1399 zip code:  02114
i: 1400 zip code:  02114
i: 1401 zip code:  02114
i: 1402 zip code:  02127
i: 1403 zip code:  02119
i: 1404 zi

i: 1692 zip code:  02114
i: 1693 zip code:  02114
i: 1694 zip code:  02118
i: 1695 zip code:  02114
i: 1696 zip code:  02114
i: 1697 zip code:  02121
i: 1698 zip code:  02125
i: 1699 zip code:  02131
i: 1700 zip code:  02131
i: 1701 zip code:  02114
i: 1702 zip code:  02114
i: 1703 zip code:  02124
i: 1704 zip code:  02131-4931
i: 1705 zip code:  02131-4931
i: 1706 zip code:  02136
i: 1707 zip code:  02136
i: 1708 zip code:  02136
i: 1709 zip code:  02114
i: 1710 zip code:  02125
i: 1711 zip code:  02125
i: 1712 zip code:  02125
i: 1713 zip code:  02125
i: 1714 zip code:  02114
i: 1715 zip code:  02114
i: 1716 zip code:  02114
i: 1717 zip code:  02125
i: 1718 zip code:  02122
i: 1719 zip code:  02126
i: 1720 zip code:  02124
i: 1721 zip code:  02118
i: 1722 zip code:  02128
i: 1723 zip code:  02114
i: 1724 zip code:  02171
i: 1725 zip code:  MA 02135
i: 1726 zip code:  02131-3025
i: 1727 zip code:  02128
i: 1728 zip code:  02114
i: 1729 zip code:  02132
i: 1730 zip code:  02114
i: 1731

i: 2016 zip code:  02114
i: 2017 zip code:  02114
i: 2018 zip code:  02114
i: 2019 zip code:  02114
i: 2020 zip code:  02119
i: 2021 zip code:  02136-2460
i: 2022 zip code:  02134
i: 2023 zip code:  02132
i: 2024 zip code:  02114
i: 2025 zip code:  02114
i: 2026 zip code:  02128
i: 2027 zip code:  02128
i: 2028 zip code:  02114
i: 2029 zip code:  02114
i: 2030 zip code:  02130
i: 2031 zip code:  02127
i: 2032 zip code:  02114
i: 2033 zip code:  02114
i: 2034 zip code:  02130
i: 2035 zip code:  02130
i: 2036 zip code:  02126
i: 2037 zip code:  02126
i: 2038 zip code:  02124
i: 2039 zip code:  02132
i: 2040 zip code:  02129
i: 2041 zip code:  02121
i: 2042 zip code:  02130
i: 2043 zip code:  02114
i: 2044 zip code:  02114
i: 2045 zip code:  02124
i: 2046 zip code:  02125
i: 2047 zip code:  02125
i: 2048 zip code:  02114
i: 2049 zip code:  02152
i: 2050 zip code:  02114
i: 2051 zip code:  02114
i: 2052 zip code:  02114
i: 2053 zip code:  02114
i: 2054 zip code:  02114
i: 2055 zip code:  0

i: 2339 zip code:  02114
i: 2340 zip code:  02114
i: 2341 zip code:  02121
i: 2342 zip code:  02114
i: 2343 zip code:  02121
i: 2344 zip code:  02114
i: 2345 zip code:  02114
i: 2346 zip code:  02114
i: 2347 zip code:  02114
i: 2348 zip code:  02114
i: 2349 zip code:  02128
i: 2350 zip code:  02114
i: 2351 zip code:  02114
i: 2352 zip code:  02119
i: 2353 zip code:  MA 02135
i: 2354 zip code:  02114
i: 2355 zip code:  02130
i: 2356 zip code:  02131-4931
i: 2357 zip code:  02131-4931
i: 2358 zip code:  02131-4931
i: 2359 zip code:  02114
i: 2360 zip code:  02114
i: 2361 zip code:  02125
i: 2362 zip code:  02114
i: 2363 zip code:  02114
i: 2364 zip code:  02114
i: 2365 zip code:  02130
i: 2366 zip code:  02130
i: 2367 zip code:  02130
i: 2368 zip code:  02119
i: 2369 zip code:  02124
i: 2370 zip code:  02114
i: 2371 zip code:  02114
i: 2372 zip code:  02114
i: 2373 zip code:  02114
i: 2374 zip code:  02120
i: 2375 zip code:  02114
i: 2376 zip code:  02114
i: 2377 zip code:  02114
i: 2378

i: 2663 zip code:  02132
i: 2664 zip code:  02130
i: 2665 zip code:  02114
i: 2666 zip code:  02120
i: 2667 zip code:  02122
i: 2668 zip code:  02114
i: 2669 zip code:  02114
i: 2670 zip code:  02119
i: 2671 zip code:  02114
i: 2672 zip code:  02114
i: 2673 zip code:  02114
i: 2674 zip code:  02114
i: 2675 zip code:  02126
i: 2676 zip code:  02126
i: 2677 zip code:  02114
i: 2678 zip code:  02114
i: 2679 zip code:  02116
i: 2680 zip code:  02114
i: 2681 zip code:  02126
i: 2682 zip code:  02114
i: 2683 zip code:  02131-3025
i: 2684 zip code:  02124
i: 2685 zip code:  02122
i: 2686 zip code:  02131
i: 2687 zip code:  02130
i: 2688 zip code:  02125
i: 2689 zip code:  02125
i: 2690 zip code:  02124
i: 2691 zip code:  02114
i: 2692 zip code:  01250
i: 2693 zip code:  02136-2460
i: 2694 zip code:  02136-2460
i: 2695 zip code:  02114
i: 2696 zip code:  02122
i: 2697 zip code:  02122
i: 2698 zip code:  02114
i: 2699 zip code:  02114
i: 2700 zip code:  02114
i: 2701 zip code:  02122
i: 2702 zi

i: 2985 zip code:  02114
i: 2986 zip code:  02114
i: 2987 zip code:  02130
i: 2988 zip code:  MA 02135
i: 2989 zip code:  02114
i: 2990 zip code:  02131-4931
i: 2991 zip code:  02114
i: 2992 zip code:  02114
i: 2993 zip code:  02114
i: 2994 zip code:  02118
i: 2995 zip code:  02114
i: 2996 zip code:  02134-1433
i: 2997 zip code:  02132
i: 2998 zip code:  02127
i: 2999 zip code:  02114
i: 3000 zip code:  02122
i: 3001 zip code:  02114
i: 3002 zip code:  02114
i: 3003 zip code:  02122
i: 3004 zip code:  02122
i: 3005 zip code:  02121
i: 3006 zip code:  02132
i: 3007 zip code:  02114
i: 3008 zip code:  02114
i: 3009 zip code:  02121
i: 3010 zip code:  02121
i: 3011 zip code:  02130
i: 3012 zip code:  02114
i: 3013 zip code:  02131
i: 3014 zip code:  02114
i: 3015 zip code:  02126
i: 3016 zip code:  02136-2460
i: 3017 zip code:  02121
i: 3018 zip code:  02121
i: 3019 zip code:  02114
i: 3020 zip code:  02114
i: 3021 zip code:  02114
i: 3022 zip code:  02136
i: 3023 zip code:  02114
i: 3024

error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
error
erro

In [155]:
import copy
crime_zipcode_copy = copy.deepcopy(crime_zipcode)
crime_zipcode_copy2 = copy.deepcopy(crime_zipcode2)

In [159]:
len(crime_zipcode_copy) + len(crime_zipcode_copy2)

3601

In [161]:
crime_zipcode_total = crime_zipcode_copy + crime_zipcode_copy2
len(crime_zipcode_total)

3601

In [163]:
df_crime_total = pd.DataFrame(crime_zipcode_total)
df_crime_total.to_csv('crime_zipcode_16.csv')

In [164]:
crime_zipcode_15 = pd.read_csv('crime_zipcode_15.csv')
crime_zipcode_15

Unnamed: 0.1,Unnamed: 0,0
0,0,02114
1,1,02114
2,2,02114
3,3,02114
4,4,02114
5,5,02210
6,6,02125
7,7,02114
8,8,02130
9,9,02130


In [165]:
crime_zipcode_16 = pd.read_csv('crime_zipcode_16.csv')
crime_zipcode_16

Unnamed: 0.1,Unnamed: 0,0
0,0,02114
1,1,02114
2,2,02119
3,3,02124
4,4,02114
5,5,02114
6,6,02114
7,7,02114
8,8,02135
9,9,02119


In [181]:
crime_zipcode_15_ls = crime_zipcode_15['0'].values
crime_zipcode_15_ls[112][-5:]

'02135'

In [176]:
crime_zipcode_16_ls = crime_zipcode_16['0'].values
crime_zipcode_16_ls[104]

' 02114'

In [192]:
def map_crime_to_zipcode(crimes):
    total_crimes = {}
    for idx, each in enumerate(crimes):
        if len(each) == 6: # '02114'
            if each[1:] in total_crimes:
                total_crimes[each[1:]] += 1
            else:
                total_crimes[each[1:]] = 1
        elif len(each) == 11: #' 02132-3226'
            if each[1:6] in total_crimes:
                total_crimes[each[1:6]] += 1
            else:
                total_crimes[each[1:6]] = 1
        elif len(each) == 9:  #'MA 02118'
            if each[-5:] in total_crimes:
                total_crimes[each[-5:]] += 1
            else:
                total_crimes[each[-5:]] = 1
        else:
            print(each)
    return total_crimes


In [195]:
crime_zipcode_15 = map_crime_to_zipcode(crime_zipcode_15_ls)
crime_zipcode_16 = map_crime_to_zipcode(crime_zipcode_16_ls)

In [201]:
crime_zipcode_15

{'02026': 5,
 '02108': 4,
 '02111': 27,
 '02113': 1,
 '02114': 1474,
 '02115': 6,
 '02116': 11,
 '02118': 99,
 '02119': 62,
 '02120': 13,
 '02121': 82,
 '02122': 55,
 '02124': 188,
 '02125': 189,
 '02126': 162,
 '02127': 27,
 '02128': 60,
 '02129': 43,
 '02130': 134,
 '02131': 160,
 '02132': 52,
 '02134': 38,
 '02135': 48,
 '02136': 67,
 '02152': 16,
 '02163': 1,
 '02171': 5,
 '02186': 12,
 '02210': 1,
 '02215': 2,
 '02435': 3,
 '02445': 2}

In [199]:
crime_zipcode_16

{'01250': 4,
 '02026': 10,
 '02108': 2,
 '02109': 5,
 '02110': 2,
 '02111': 17,
 '02113': 6,
 '02114': 1792,
 '02115': 3,
 '02116': 2,
 '02118': 94,
 '02119': 105,
 '02120': 28,
 '02121': 98,
 '02122': 68,
 '02124': 244,
 '02125': 126,
 '02126': 122,
 '02127': 62,
 '02128': 81,
 '02129': 14,
 '02130': 112,
 '02131': 180,
 '02132': 114,
 '02134': 35,
 '02135': 64,
 '02136': 120,
 '02152': 8,
 '02163': 1,
 '02171': 19,
 '02186': 49,
 '02215': 6,
 '02435': 6,
 '02445': 2}

In [251]:
df_2015 = df_2015.drop(columns=['Total'])
df_2016 = df_2016.drop(columns=['Total'])

In [252]:
def get_x_variables(crime_zipcode, df):
    result = []
    y = []
    for index,row in df.iterrows():
        tmp = []
        if row['Zip Code'] in crime_zipcode:
            tmp.append(int(list(row)[1]))
            tmp += list(row)[2:]
            result.append(tmp)
            y.append(crime_zipcode[row['Zip Code']])
    return result,y

In [253]:
x_15,y_15 = get_x_variables(crime_zipcode_15, df_2015)

In [254]:
x_15

[[184161, 3515, 209, 26, 172, 0, 31, 96, 0, 96],
 [112127, 3157, 212, 0, 3926, 0, 64, 205, 30, 175],
 [106761, 7046, 27, 0, 206, 0, 87, 129, 34, 95],
 [133075, 10294, 560, 0, 1475, 0, 350, 322, 7, 315],
 [67392, 19475, 2574, 49, 4386, 27, 1450, 1015, 234, 781],
 [170240, 17745, 1126, 42, 3310, 0, 276, 505, 60, 445],
 [117991, 14723, 4953, 121, 4140, 0, 2441, 1286, 179, 1107],
 [47250, 3964, 16043, 87, 695, 0, 4721, 1323, 445, 878],
 [49732, 6838, 3854, 27, 2533, 0, 1638, 432, 105, 327],
 [42292, 2309, 19867, 64, 237, 0, 4103, 1449, 380, 1069],
 [71605, 9279, 7602, 5, 4520, 0, 2357, 861, 192, 669],
 [68300, 11989, 34100, 573, 3269, 0, 1519, 1536, 304, 1232],
 [69068, 12427, 10803, 174, 4649, 0, 6445, 1855, 595, 1260],
 [61664, 2429, 25436, 33, 260, 0, 795, 578, 121, 457],
 [105843, 29704, 2307, 45, 1829, 0, 1688, 738, 244, 494],
 [67755, 28877, 1341, 474, 1617, 16, 1498, 12385, 705, 11680],
 [127385, 13785, 2029, 0, 1401, 0, 707, 392, 37, 355],
 [100381, 25718, 4696, 95, 2425, 0, 4739, 

In [262]:
x_16,y_16 = get_x_variables(crime_zipcode_16, df_2016)

In [267]:
import statsmodels.api as sm
from pandas.core import datetools

# X = np.array(x[:20])
# Y = np.array(y[:20])
X_15 = np.array(x_15)
Y_15 = np.array(y_15)

X_16 = np.array(x_16)
Y_16 = np.array(y_16)

X = np.concatenate((X_15, X_16), axis=0)
Y = np.concatenate((Y_15, Y_16), axis=0)

est = sm.OLS(Y,X)

est = est.fit()

est.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.15
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,1.016
Date:,"Thu, 12 Apr 2018",Prob (F-statistic):,0.44
Time:,01:09:44,Log-Likelihood:,-431.14
No. Observations:,61,AIC:,880.3
Df Residuals:,52,BIC:,899.3
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,0.0008,0.001,1.267,0.211,-0.000,0.002
x2,0.0011,0.006,0.178,0.859,-0.011,0.013
x3,0.0042,0.007,0.580,0.564,-0.010,0.019
x4,0.1546,0.509,0.303,0.763,-0.868,1.177
x5,-0.0027,0.023,-0.118,0.907,-0.049,0.043
x6,-0.6111,3.738,-0.163,0.871,-8.112,6.889
x7,0.0480,0.054,0.889,0.378,-0.060,0.156
x8,-0.2224,0.238,-0.934,0.354,-0.700,0.255
x9,-0.4735,0.509,-0.930,0.357,-1.495,0.548

0,1,2,3
Omnibus:,96.625,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1641.789
Skew:,4.849,Prob(JB):,0.0
Kurtosis:,26.492,Cond. No.,2.27e+17


In [265]:
a = np.array([[1, 2, 3], [4,5,6]])
b = np.array([[7,8,9]])
np.concatenate((a, b),axis=0)            

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])