In [1]:
import pandas as pd
import numpy as np

In [2]:
sf_data = pd.read_csv("/scratch/public/kushk/sf_with_homeloc.csv")


In [3]:
print("Original size: ", len(sf_data))

Original size:  47723945


In [4]:
homedropped = len(sf_data[sf_data["home_tract"].isna()])
print("Number of tracts without home_tract: " ,  homedropped)

Number of tracts without home_tract:  7360


In [5]:
tractdropped = (sf_data[sf_data["tract"].isna()]) 
print("Number of tracts without tract: " , tractdropped )

Number of tracts without tract:            sf_with_homeloc.csv           u_id        lat         lon  \
184             9.840597e-294  1.112537e-308  37.798620 -122.483460   
415             2.117199e-293  1.112537e-308  37.815344 -122.354951   
446             2.117197e-293  1.112537e-308  37.817001 -122.478217   
753             1.074382e-293  1.112537e-308  37.798620 -122.483460   
764             9.619576e-294  1.112537e-308  37.906298 -122.339094   
1079            6.296646e-294  1.112537e-308  37.759762 -122.371300   
1088            6.212982e-294  1.112537e-308  37.815344 -122.354951   
1162            2.114877e-293  1.112537e-308  37.826767 -122.421280   
1807            1.317894e-293  1.112537e-308  37.462800 -122.067817   
2885            2.116476e-293  1.112537e-308  37.807692 -122.387763   
2888            1.425875e-293  1.112537e-308  36.958137 -122.017767   
3242            1.028468e-293  1.112537e-308  37.798295 -122.344938   
3346            2.107359e-293  1.112537e-308

In [6]:
sf_data = sf_data[~sf_data["home_tract"].isna()]
sf_data = sf_data[~sf_data["tract"].isna()]
sf_data = sf_data[["date", "tract" , "home_tract"]]

In [7]:
print("Final size after dropping nulls: ", len(sf_data))

Final size after dropping nulls:  47653523


In [8]:
nn_data = pd.read_csv("/scratch/public/kushk/sfbay_13county_nearest_neigbbor_new.csv")

In [9]:
nn_data.head()

Unnamed: 0,SRC_GEOID,NBR_GEOID,LENGTH,NODE_COUNT
0,6001400100,6001404300,11.573519,0
1,6001400100,6001404400,4424.828741,0
2,6001400100,6001421600,477.810654,0
3,6001400100,6001422600,1282.939488,0
4,6001400100,6001422700,553.272946,0


In [10]:
sf_data.head()

Unnamed: 0,date,tract,home_tract
0,2012-07-26T00:49:46Z,6081613000.0,6085509000.0
1,2012-07-26T00:50:45Z,6067008000.0,6067008000.0
2,2012-07-26T00:50:50Z,6095253000.0,6095253000.0
3,2012-07-26T00:50:56Z,6081601000.0,6081603000.0
4,2012-07-26T00:50:58Z,6085501000.0,6085501000.0


In [11]:
neighbors = {}
for src,dest in zip(nn_data["SRC_GEOID"] , nn_data["NBR_GEOID"]):
    if src not in neighbors:
        neighbors[src] = set()
    neighbors[src].add(dest)

In [12]:
def processNeighbors(row):
    tract = int(row["tract"])
    home_tract = int(row["home_tract"])
    if(tract == home_tract):
        return "home"
    elif(home_tract in neighbors and tract in neighbors[home_tract]):
        return "neighbor"
    elif(home_tract in neighbors):
        return "non-neighbor"
    else:
        return "not in set?"

In [13]:
%%time
sf_data["locality"] = sf_data.apply(processNeighbors, axis=1)
sf_data["locality"].value_counts()

CPU times: user 40min 43s, sys: 14.4 s, total: 40min 57s
Wall time: 40min 59s


In [14]:
sf_data = sf_data[sf_data["locality"] !=  "not in set?"]

In [15]:
sf_data["locality"].value_counts()

home            28990907
non-neighbor    14188849
neighbor         3717728
Name: locality, dtype: int64

In [16]:
sf_data.head()

Unnamed: 0,date,tract,home_tract,locality
0,2012-07-26T00:49:46Z,6081613000.0,6085509000.0,non-neighbor
1,2012-07-26T00:50:45Z,6067008000.0,6067008000.0,home
2,2012-07-26T00:50:50Z,6095253000.0,6095253000.0,home
3,2012-07-26T00:50:56Z,6081601000.0,6081603000.0,non-neighbor
4,2012-07-26T00:50:58Z,6085501000.0,6085501000.0,home


In [17]:
overall = sf_data.groupby(["home_tract" , "locality" ]).size().reset_index().rename(columns={0:'count'})

In [18]:
sf_data['hour'] = sf_data['date'].str[11:13]
dayvalues = ['04', '05', '06', '07', '08', '09', '10', '11', '12', '13','14', '15', '16', '17', '18']
sf_data['day'] = sf_data.hour.map(lambda x: 1 if x in dayvalues else 0)

In [19]:
dayNight = sf_data.groupby(["home_tract" , "locality", "day"]).size().reset_index().rename(columns={0:'count'})
dayNight.head()

Unnamed: 0,home_tract,locality,day,count
0,6001400000.0,home,0,719
1,6001400000.0,home,1,753
2,6001400000.0,neighbor,0,202
3,6001400000.0,neighbor,1,116
4,6001400000.0,non-neighbor,0,1060


In [20]:
sf_data['date_helper'] = sf_data['date'].str[0:10]
sf_data['weekday'] = sf_data.date_helper.map(lambda x: 0 if pd.to_datetime(x).weekday() > 4 else 1)


In [21]:
weekDay = sf_data.groupby(["home_tract" , "locality", "weekday"]).size().reset_index().rename(columns={0:'count'})
weekDay.head()

Unnamed: 0,home_tract,locality,weekday,count
0,6001400000.0,home,0,378
1,6001400000.0,home,1,1094
2,6001400000.0,neighbor,0,71
3,6001400000.0,neighbor,1,247
4,6001400000.0,non-neighbor,0,586


In [22]:
tracts = list(neighbors.keys())

#sf_data.

In [23]:
%%time
counts = [[] , [] , []]
dayCounts = [[[], []] , [[],[]] , [[],[]]]
weekDayCounts = [[[], []] , [[],[]] , [[],[]]]
locs = ["home" , "neighbor" , "non-neighbor"]


for tract in tracts:
    one_tract = overall[overall["home_tract"] == tract]
    i=0
    for loc in locs:
        one_loc = one_tract[one_tract["locality"] == loc]
        if(len(one_loc) > 0):
            counts[i].append(one_loc["count"].iloc[0])
        else:
            counts[i].append(0)
        i+=1
        
for tract in tracts:
    one_tract = dayNight[dayNight["home_tract"] == tract]
    i=0
    for loc in locs:
        one_loc = one_tract[one_tract["locality"] == loc]
        for val in [0,1]:
            one_val = one_loc[one_loc["day"] == val]
            if(len(one_val) > 0):
                dayCounts[i][val].append(one_val["count"].iloc[0])
            else:
                dayCounts[i][val].append(0)
        i+=1
                         
for tract in tracts:
    one_tract = weekDay[weekDay["home_tract"] == tract]
    i=0
    for loc in locs:
        one_loc = one_tract[one_tract["locality"] == loc]
        for val in [0,1]:
            one_val = one_loc[one_loc["weekday"] == val]
            if(len(one_val) > 0):
                weekDayCounts[i][val].append(one_val["count"].iloc[0])
            else:
                weekDayCounts[i][val].append(0)
        i+=1

CPU times: user 51.2 s, sys: 164 ms, total: 51.4 s
Wall time: 51.4 s


In [24]:
sf_final = pd.DataFrame({"tract": pd.Series(tracts)})
i=0
for loc in locs:
    sf_final[loc] = pd.Series(counts[i])
    sf_final[loc + " day"] = pd.Series(dayCounts[i][1])
    sf_final[loc + " night"] = pd.Series(dayCounts[i][0])
    sf_final[loc + " weekday"] = pd.Series(weekDayCounts[i][1])
    sf_final[loc + " weekend"] = pd.Series(weekDayCounts[i][0])
    i+=1

In [25]:
sf_final

Unnamed: 0,tract,home,home day,home night,home weekday,home weekend,neighbor,neighbor day,neighbor night,neighbor weekday,neighbor weekend,non-neighbor,non-neighbor day,non-neighbor night,non-neighbor weekday,non-neighbor weekend
0,6081607000,2791,1699,1092,1861,930,243,122,121,192,51,966,430,536,634,332
1,6097152000,7667,3707,3960,5155,2512,1097,438,659,747,350,13026,6183,6843,8745,4281
2,6067003400,3931,2419,1512,3011,920,292,165,127,187,105,1925,893,1032,1326,599
3,6067007501,31714,21086,10628,23011,8703,2097,1046,1051,1545,552,7637,3731,3906,5213,2424
4,6067007503,6972,4245,2727,4993,1979,880,495,385,614,266,3036,1658,1378,2193,843
5,6067007504,2700,1478,1222,2141,559,427,186,241,332,95,3468,1497,1971,2317,1151
6,6013354001,1065,532,533,810,255,103,27,76,69,34,567,206,361,374,193
7,6013354002,5051,3015,2036,3649,1402,375,140,235,260,115,1614,768,846,1046,568
8,6067009317,18637,11765,6872,13492,5145,1783,846,937,1276,507,7668,3563,4105,5059,2609
9,6085508204,2963,1778,1185,2102,861,359,178,181,265,94,2377,931,1446,1705,672


In [26]:
sf_final.to_csv("Absolute_new_1013_13county.csv")

In [None]:
sum(sf_final["home"]) + sum(sf_final["non-neighbor"])