In [1]:
# Notebook metadata: author, organization, and project name.
Name, Organization, Project = (
    'Catherine Bui',
    'Center For Community Innovation',
    'Twitter Displacement Study',
)

In [None]:
%%time
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import subprocess
from pathlib import Path
import csv
import xlrd
import numpy as np
import dask.dataframe as das
import sqlite3
import datetime



### READING THE CSV FILE 
1. Clean and prepare the data

In [2]:
%%time
# Paths to the one-tweet-per-day extract (not read in this cell) and to the raw tweet table.
file = "/scratch/public/catherinebui/sf_with_homeloc_one_tweet_per_day.csv"
rawfile = "/scratch/public/catherinebui/sf_with_homeloc.csv"
# Open the raw CSV with dask and expose the column named 'sf_with_homeloc.csv' as 'id'.
df = das.read_csv(rawfile).rename(columns={'sf_with_homeloc.csv': 'id'})

CPU times: user 3min 5s, sys: 24.3 s, total: 3min 29s
Wall time: 3min 30s


In [5]:
# Drop tweets that lack a home tract or a tweet tract, then store both tract
# ids as 64-bit integers. The width is spelled out because the tract ids this
# notebook compares against (e.g. 6001000000) do not fit in 32 bits, and a
# bare 'int' is only 32 bits wide on some platforms.
df = df.dropna(subset=['home_tract', 'tract'])
df['tract'] = df['tract'].astype('int64')
df['home_tract'] = df['home_tract'].astype('int64')

## Creating the neighbor binary variable for each tweet 

In [6]:
# Load the nearest-neighbor table used to map each source tract (SRC_GEOID column)
# to its neighboring tracts (NBR_GEOID column).
sf_nn = pd.read_csv('sfbay_13county_nearest_neigbbor_new.csv')
def neighbors(src):
    """Return the NBR_GEOID values of every `sf_nn` row whose SRC_GEOID equals `src`.

    The rows are selected with a single boolean mask rather than by scanning
    the unique SRC_GEOID values in a Python loop on every call.

    Returns a NumPy array (in table order), or an empty list when `src` does
    not occur in `sf_nn['SRC_GEOID']`.
    """
    matches = sf_nn.loc[sf_nn['SRC_GEOID'] == src, 'NBR_GEOID']
    if matches.empty:
        return []
    return np.array(matches)
# Build {source tract: array of neighbor tracts} with one grouped pass over
# sf_nn instead of re-scanning the whole table once per source tract.
# sort=False keeps the source tracts in order of first appearance.
# NOTE(review): groupby skips rows whose SRC_GEOID is missing - confirm the table has none.
_by_src = sf_nn.groupby('SRC_GEOID', sort=False)['NBR_GEOID']
n_ = [np.array(nbr_values) for _, nbr_values in _by_src]
nn = pd.DataFrame({'SRC_GEOID': [src_id for src_id, _ in _by_src]})
nn['neighbors'] = n_
neighbormap = dict(zip(nn['SRC_GEOID'], nn['neighbors']))

#Function that adds a 1 or 0 to each tweet if a tract is a neighbor or not of home_tract
def neighbor_check(x):
    """Classify one tweet row against the neighbor map.

    `x` must support `x['home_tract']` and `x['tract']`.

    Returns
    -------
    1 if x['tract'] is listed among the neighbors of x['home_tract'],
    0 if the home tract has neighbors but x['tract'] is not one of them,
    2 if `neighbormap` holds no neighbor array for the home tract.
    """
    # Look the home tract up once and test for absence with `is None`;
    # comparing an array to None with `!=` is evaluated element by element.
    home_neighbors = neighbormap.get(x['home_tract'])
    if home_neighbors is None or len(home_neighbors) == 0:
        return 2
    return 1 if x['tract'] in home_neighbors else 0
# Flag every tweet with neighbor_check. The function returns the integers
# 0/1/2, so the dask meta declares an int64 series named 'neighbor' instead of
# 'float64'; the function is passed directly rather than through a lambda.
df['neighbor'] = df.apply(neighbor_check, axis=1, meta=('neighbor', 'int64'))
# Keep only tweets whose tweet tract and home tract both appear in the neighbor map.
df = df[(df["tract"].isin(neighbormap.keys())) & (df["home_tract"].isin(neighbormap.keys()))]
# Non-neighbor table: tweets flagged 0 that were not sent from the home tract itself.
other = df[(df['neighbor'] == 0) & (df['tract'] != df['home_tract'])]

In [21]:
# Tweets per tweet tract in the non-neighbor table, BEFORE merging with the demographic variables.
(
    other
    .groupby(['tract'])
    .aggregate({'id': 'count'})
    .reset_index()
    .to_csv('newnn2_bayarea_totaltweets_*.csv')
)

['newnn2_bayarea_totaltweets_0.csv']

In [43]:
# Demographic variables of the Bay Area; the 'geo_fips' column is exposed as
# 'home_tract' so it can serve as the join key for the tweet table.
demo_var = pd.read_csv('home_tract_variables.csv').rename(columns={'geo_fips': 'home_tract'})

In [46]:
# Attach the home-tract variables (demo_var) to the non-neighbor table.
# The inner join keeps only tweets whose home_tract has a row in demo_var.
other = other.merge(demo_var, how='inner', on='home_tract')

In [99]:
# Tweets per tweet tract in the non-neighbor table AFTER the demographic merge.
(
    other
    .groupby(['tract'])
    .aggregate({'id': 'count'})
    .reset_index()
    .to_csv('new_nn_bayarea_totaltweets_*.csv')
)

['new_nn_bayarea_totaltweets_0.csv']

In [100]:
# Tweets per tweet tract among tweets flagged neighbor == 1.
# These rows come from `df`, not from the table merged with demo_var.
(
    df[df['neighbor'] == 1]
    .groupby(['tract'])
    .aggregate({'id': 'count'})
    .reset_index()
    .to_csv('neighborbayarea_totaltweets_*.csv')
)

['neighborbayarea_totaltweets_0.csv']

In [None]:
# Tweets per tweet tract among local tweets (tweet tract equals home tract).
# These rows come from `df`, not from the table merged with demo_var.
(
    df[df['tract'] == df['home_tract']]
    .groupby(['tract'])
    .aggregate({'id': 'count'})
    .reset_index()
    .to_csv('localbayarea_totaltweets_*.csv')
)

## Aggregation: 
Creating csv files for all the numbers and percentages of tweets sent from non-neighbor users in a tract with specific demographic characteristics. 

In [47]:
%%time
# aboverm_per_col15: count non-neighbor tweets per (tract, aboverm_per_col15)
# pair, then pivot so each category of the variable becomes a column indexed by tract.
aboverm_per_col15 = (
    other
    .groupby(['tract', 'aboverm_per_col15'])
    .aggregate({'u_id': 'count'})
    .reset_index()
    .categorize(columns='aboverm_per_col15')
    .pivot_table(index='tract',
                 columns='aboverm_per_col15',
                 values='u_id')
)

CPU times: user 56min 18s, sys: 4min 28s, total: 1h 47s
Wall time: 54min 4s


In [48]:
#Function to take the variable and return the csv of the counts of non-neighbor tweets with that condition
def create_csv(demo_variable, table=None):
    """Write per-tract tweet counts split by one demographic variable.

    Rows are grouped by ('tract', demo_variable) and the 'u_id' entries of
    each group are counted. The counts are then pivoted so that every
    category of `demo_variable` becomes a column indexed by tract, and the
    result is written to '<demo_variable>_*.csv'.

    Parameters
    ----------
    demo_variable : str
        Name of the column to split the counts by.
    table : optional
        Frame to aggregate; it must provide the dask DataFrame methods used
        below. Defaults to the notebook-level non-neighbor table `other`, so
        existing one-argument calls behave as before.

    Returns
    -------
    The value returned by `to_csv` (the outputs recorded in this notebook
    show a list of the written file names).
    """
    if table is None:
        table = other
    g = table.groupby(['tract', demo_variable]).aggregate(
        {'u_id': 'count'}).reset_index()
    # Make the split column categorical before pivoting on it.
    g = g.categorize(columns=demo_variable)
    g = g.pivot_table(values='u_id',
                      columns=demo_variable,
                      index='tract')
    return g.to_csv(demo_variable + '_*.csv')

In [49]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_per_nonwhite15'.
create_csv('aboverm_per_nonwhite15')


CPU times: user 1h 52min 31s, sys: 8min 37s, total: 2h 1min 9s
Wall time: 1h 48min 3s


In [50]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_empd14'.
create_csv('aboverm_empd14')

CPU times: user 1h 51min 14s, sys: 8min 5s, total: 1h 59min 19s
Wall time: 1h 46min 22s


In [51]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_per_nhblk15'.
create_csv('aboverm_per_nhblk15')

CPU times: user 1h 45min 4s, sys: 7min 10s, total: 1h 52min 14s
Wall time: 1h 40min 12s


In [52]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_per_asian15'.
create_csv('aboverm_per_asian15')

CPU times: user 1h 48min 4s, sys: 7min 52s, total: 1h 55min 56s
Wall time: 1h 43min 18s


In [53]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_density15'.
create_csv('aboverm_density15')

CPU times: user 1h 48min 27s, sys: 7min 53s, total: 1h 56min 21s
Wall time: 1h 43min 40s


In [54]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'aboverm_per_hisp15'.
create_csv('aboverm_per_hisp15')

CPU times: user 1h 46min 54s, sys: 7min 35s, total: 1h 54min 30s
Wall time: 1h 42min 9s


In [55]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'LI_under80AMI'.
create_csv('LI_under80AMI')

CPU times: user 1h 47min 39s, sys: 7min 48s, total: 1h 55min 28s
Wall time: 1h 42min 59s


In [56]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'HI_above120AMI'.
create_csv('HI_above120AMI')

CPU times: user 1h 46min 25s, sys: 7min 32s, total: 1h 53min 58s
Wall time: 1h 41min 41s


In [57]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'MI_80_120AMI'.
create_csv('MI_80_120AMI')

CPU times: user 1h 48min 9s, sys: 7min 33s, total: 1h 55min 43s
Wall time: 1h 43min 18s


In [58]:
%%time
# Write the per-tract non-neighbor tweet counts split by 'disp_type'.
create_csv('disp_type')

CPU times: user 1h 47min 47s, sys: 7min 38s, total: 1h 55min 25s
Wall time: 1h 42min 50s


In [59]:
%%time
# Write the aboverm_per_col15 pivot built earlier in the notebook; it was
# computed inline rather than through create_csv, so it is saved separately here.
aboverm_per_col15.to_csv('aboverm_per_col15_*.csv')

CPU times: user 53min 8s, sys: 3min 37s, total: 56min 46s
Wall time: 50min 41s


['aboverm_per_col15_0.csv']

In [60]:
# First-partition file name expected for every demographic variable (all demo_var columns except the key).
[
    column_name + '_0.csv'
    for column_name in demo_var.columns
    if column_name != 'home_tract'
]

['aboverm_per_col15_0.csv',
 'aboverm_per_nonwhite15_0.csv',
 'aboverm_empd14_0.csv',
 'aboverm_per_nhblk15_0.csv',
 'aboverm_per_asian15_0.csv',
 'aboverm_density15_0.csv',
 'aboverm_per_hisp15_0.csv',
 'disp_type_0.csv',
 'LI_under80AMI_0.csv',
 'HI_above120AMI_0.csv',
 'MI_80_120AMI_0.csv']

## OUTPUT FILE
Creating the final result table. 

Labeling the columns with their correct names

Dividing each count by the tract's total non-neighbor tweets to get percentages

In [7]:
# Read the per-variable count tables back in, one pandas frame per variable.
# Only the '_0' file of each output is read.
# NOTE(review): this assumes every output was written as a single file - confirm.
col15 = pd.read_csv('aboverm_per_col15_0.csv') # 0 and 1
nonwhite = pd.read_csv('aboverm_per_nonwhite15_0.csv') # 0 and 1
empd = pd.read_csv( 'aboverm_empd14_0.csv') # 0 and 1
nhblk = pd.read_csv( 'aboverm_per_nhblk15_0.csv') # 0 and 1
asian = pd.read_csv('aboverm_per_asian15_0.csv') # 0 and 1
density = pd.read_csv('aboverm_density15_0.csv') # 0 and 1
hisp = pd.read_csv('aboverm_per_hisp15_0.csv') # 0 and 1
disptype = pd.read_csv('disp_type_0.csv') # 9 variables
under80 = pd.read_csv('LI_under80AMI_0.csv') # 0 and 1
above120 = pd.read_csv('HI_above120AMI_0.csv') # 0 and 1
MI = pd.read_csv('MI_80_120AMI_0.csv') # 0 and 1
# Per-tract tweet totals of the non-neighbor table after the demographic merge.
totaltweets= pd.read_csv('new_nn_bayarea_totaltweets_0.csv')

In [8]:
# Reduce the three income-band tables to the tract key plus their '1' column,
# and give every count column a descriptive 'ct_othertweets_*' name.
# NOTE(review): the keys below ('1'/'0' vs '1.0'/'0.0') must match the CSV
# headers exactly; rename() ignores keys that are not present - confirm against the files.
MI = MI[['tract', '1']]
totaltweets = totaltweets[['tract', 'id']].rename({'id': 'total_tweets'}, axis = 1)
MI = MI.rename({'1': 'ct_othertweets_MI_80_120AMI'}, axis =1)
above120 = above120[['tract', '1']].rename({'1': 'ct_othertweets_HI_above_120AMI'}, axis =1)
under80 = under80[['tract', '1']].rename({'1': 'ct_othertweets_LI_under80AMI'}, axis =1)
# Prefix the displacement-type columns by name. A positional relabel would
# only be correct while 'tract' happens to be the first column of the file.
disptype = disptype.rename(
    columns=lambda label: label if label == 'tract' else 'ct_othertweets_' + label)
hisp = hisp.rename({'1.0': 'ct_othertweets_aboverm_per_hisp15', 
                   '0.0': 'ct_othertweets_underm_per_hisp15'}, axis =1)
density = density.rename({'1': 'ct_othertweets_aboverm_density15', 
                   '0': 'ct_othertweets_underm_density15'}, axis =1)
asian = asian.rename({'1.0': 'ct_othertweets_aboverm_per_asian15',
                     '0.0': 'ct_othertweets_underm_per_asian15'}, axis =1)
nhblk = nhblk.rename({'1.0': 'ct_othertweets_aboverm_per_nhblk15',
                     '0.0': 'ct_othertweets_underm_per_nhblk15'}, axis = 1)
empd = empd.rename({'1': 'ct_othertweets_aboverm_empd14', 
                   '0': 'ct_othertweets_underm_empd14'}, axis = 1)
nonwhite = nonwhite.rename({'1.0': 'ct_othertweets_aboverm_per_nonwhite15',
                     '0.0': 'ct_othertweets_underm_per_nonwhite15'}, axis = 1)
col15 = col15.rename({'1.0': 'ct_othertweets_aboverm_per_col15',
                     '0.0': 'ct_othertweets_underm_per_col15'}, axis =1)

In [9]:
# Inner-join every per-variable count table onto the MI table by tract, in the
# same left-to-right order as a single chained merge, ending with the totals.
twitter_demo_sf = MI
for right_table in (above120, under80, disptype, hisp, density,
                    asian, nhblk, empd, nonwhite, col15, totaltweets):
    twitter_demo_sf = twitter_demo_sf.merge(right_table, on='tract', how='inner')

In [10]:
# Missing counts mean "no tweets", so treat them as zero.
twitter_demo_sf = twitter_demo_sf.fillna(0)
# Denominator: the three income-band counts added together.
twitter_demo_sf['total_nonneighbortweets'] = (
    twitter_demo_sf['ct_othertweets_HI_above_120AMI']
    + twitter_demo_sf['ct_othertweets_LI_under80AMI']
    + twitter_demo_sf['ct_othertweets_MI_80_120AMI']
)
# A zero denominator is masked to NaN so the share comes out as NaN, not inf.
share_denominator = twitter_demo_sf['total_nonneighbortweets']
share_denominator = share_denominator.where(share_denominator != 0)
# 'total_tweets' is a total, not a count column: skipping it avoids the stray
# '%_tal_tweets' column that slicing its name would create.
not_a_count = ['total_nonneighbortweets', 'tract', 'total_tweets']
for i in twitter_demo_sf.columns:
    if i not in not_a_count:
        # NOTE(review): i[2:] keeps the underscore after 'ct', so the names read
        # '%__othertweets_...'; kept unchanged in case other files rely on them.
        twitter_demo_sf['%_' + i[2:]] = twitter_demo_sf[i] / share_denominator

In [73]:
# Save the final per-tract table of counts and shares.
twitter_demo_sf.to_csv('Twitter_NonNeighborBayArea_demog_10_23.csv')

In [16]:
#Tweets from outside of Bay Area in each tract
# NOTE(review): no cell shown in this notebook writes this file - confirm where it is produced.
outside = pd.read_csv('nnfromoutside_bayarea_totaltweets_0.csv')

## Filtering raw data for Alameda County

In [4]:
# Collect the source tracts whose id lies in [6001000000, 6002000000).
# NOTE(review): presumably the Alameda County range, per the section heading - confirm.
sf_nn = pd.read_csv('sfbay_13county_nearest_neigbbor_new.csv')
# A plain chained comparison replaces `6001000000 <= i & i < 6002000000`:
# `&` binds tighter than the comparisons, so that form compared `i & i` and
# only worked because `i & i == i` for integers.
alameda = [geoid for geoid in sf_nn['SRC_GEOID'].unique()
           if 6001000000 <= geoid < 6002000000]


In [5]:
# Tweets whose tweet tract lies in [6001000000, 6002000000).
alamedatwitter = df[(df['tract'] >= 6001000000.0) & (df['tract'] < 6002000000.0)]

In [None]:
%%time
# Write the tract-range subset of the tweet table to 'alamedatwitter_*.csv'.
alamedatwitter.to_csv('alamedatwitter_*.csv')

## Understanding Data with SF Profiles
Includes user description, text, username, etc

In [3]:
%%time
# Collect every CSV in the 'SF Profiles' directory as a string path, then load the first one.
# NOTE(review): glob() does not promise a stable order, so the pro[k] indices
# used in later cells may point at different files on another run - confirm.
# The per-file join with `sf_o` on 'id' and the tract-range filter remain disabled.
pro = [str(csv_path) for csv_path in Path('/scratch/public/catherinebui/SF Profiles').glob('*.csv')]
p0 = pd.read_csv(pro[0])



CPU times: user 15.4 s, sys: 944 ms, total: 16.3 s
Wall time: 16.5 s


In [41]:
# Number of users in p0 that appear with more than one distinct u_location:
# one row per (u_id, u_location) pair, then count those rows per user.
np.count_nonzero(
    p0.groupby(['u_id', 'u_location'])
    .aggregate({'u_id': 'count'})
    .rename(columns={'u_id': 'count'})
    .reset_index()
    .groupby('u_id')['u_location']
    .count()
    > 1
)

962

In [44]:
# Number of p0 rows for each (user, latitude, longitude) combination.
(
    p0.groupby(['u_id', 'lat', 'lon'])
    .aggregate({'u_id': 'count'})
    .rename(columns={'u_id': 'count'})
    .reset_index()
)

Unnamed: 0,u_id,lat,lon,count
0,12,37.616989,-122.391838,1
1,12,37.781066,-122.405316,1
2,12,37.781591,-122.405985,1
3,12,37.781970,-122.406161,1
4,12,37.820302,-122.455271,1
5,15,37.616424,-122.386279,1
6,15,37.760643,-122.421317,1
7,15,37.766449,-122.455650,1
8,15,37.766588,-122.455368,1
9,15,37.770292,-122.440457,4


In [45]:
# Number of p0 rows for each (latitude, longitude) pair.
(
    p0.groupby(['lat', 'lon'])
    .aggregate({'u_id': 'count'})
    .rename(columns={'u_id': 'count'})
    .reset_index()
)

Unnamed: 0,lat,lon,count
0,36.801265,-121.681091,1
1,36.801274,-121.665722,1
2,36.801394,-121.680897,1
3,36.801629,-121.786155,1
4,36.801647,-121.659535,1
5,36.801733,-121.660195,1
6,36.801770,-121.788705,1
7,36.801834,-121.665878,1
8,36.801944,-121.727119,1
9,36.801980,-121.661300,1


In [5]:
# Load profile files 1 and 2 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p1=pd.read_csv(pro[1])

p2=pd.read_csv(pro[2])


In [6]:
# Preview a bounded number of rows instead of displaying the whole frame;
# the rows carry user names, descriptions and tweet text.
p1.head(10)

Unnamed: 0,id,u_id,lat,lon,created_at,type,place_type,u_created_at,u_followers_count,u_location,u_lang,u_statuses_count,u_name,u_screen_name,u_description,urls,text
0,4.978132e+17,269555096,36.959819,-122.027553,1407522882000,llp,city,1300669446000,106,,en,5457,Arianne Nova,ariannenova,I hate people,,@trentistweeting 💖
1,4.978132e+17,744492187,37.783333,-122.416667,1407522885000,llp,city,1344402974000,117,"Los Angeles, CA",en,2511,Jasen Martin,jasenmvaca,Photographer,,"🌁 @ San Francisco, CA http://t.co/MmvHR9enIa"
2,4.978132e+17,22708540,38.555097,-121.429662,1407522888000,llp,city,1236127628000,24887,"Sacramento, CA",en,8954,HoppyBrewing,HoppyBrewing,Great Food • Awesome Beer • Cool People!!!,,Hi #Sacramento! :-) Here's a pic of 2days lunc...
3,4.978131e+17,293143885,38.900242,-121.322048,1407522877000,llp,city,1304542170000,263,,en,19177,ΤΣΦ,joshvp4,12/01-forever,,Stupid ass shit man. Who would buy a whole new...
4,4.978131e+17,858428251,38.342015,-121.964636,1407522868000,llp,city,1349199465000,257,Jevan & Kamie,en,22995,MORE TRASH,lifterofweight,Kevan//Heavy riffs//heavy weight//depth before...,,And just threw away ten bucks bc I forgot abou...
5,4.978132e+17,2236904258,37.748598,-122.168636,1407522878000,llp,city,1386557664000,246,,en,21406,A9K__,ANKatoa,"Oakland, CA",,Dayum ugly fobby ass just kilt my effin day! S...
6,4.978131e+17,23366474,37.850235,-122.270612,1407522871000,llp,city,1236552772000,1417,"Oakland, California",en,11366,Jimi Devine,JimiDevine,"Cannabis, Stand Up Comedy, & Politics. All RT...",,Every Olsen twins straight to VHS movie. #Movi...
7,4.978132e+17,1330392194,37.712584,-122.020670,1407522891000,llp,city,1365209950000,753,,en,20042,Daddy Sav✨,amazing_sav,Life is good✊ but my pussy better || ig: @amaz...,,@junior98ribas I'm not legal either I'm just s...
8,4.978132e+17,141618790,37.616424,-122.386279,1407522895000,llp,admin,1273331487000,237,"Santiago, Chile",en,17809,Camilo Pedrero,Cdmas2,"Living life, working on what I love at W Santi...",,In transit ... back home (@ San Francisco Inte...
9,4.978132e+17,738159110,38.293474,-122.459071,1407522899000,llp,city,1344150336000,107,"Napa, CA",en,880,StrawberrieFox,StrawberrieFox,"Just me, 30 yrs old, chef, CIAGreystone studen...",,#avocadoeggtoast @ Sunflower Caffe Sonoma Vall...


In [6]:
%%time
# Load profile file 3 into pandas and open profile file 4 with dask.
# Unlike the pandas loads around it, p4 is therefore a dask frame.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p3=pd.read_csv(pro[3])

p4=das.read_csv(pro[4])


CPU times: user 12.7 s, sys: 672 ms, total: 13.3 s
Wall time: 13.6 s


In [7]:
# Open profile file 5 with dask (so p5 is a dask frame) and load profile file 6
# into pandas, decoding it as ISO-8859-1.
# The per-file join with `sf_o` on 'id' and the tract-range filter remain disabled.
p5=das.read_csv(pro[5])

p6=pd.read_csv(pro[6], encoding='iso-8859-1')



In [8]:
# Load profile files 7 and 8 into pandas; file 7 is decoded as ISO-8859-1.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p7=pd.read_csv(pro[7], encoding='iso-8859-1')

p8=pd.read_csv(pro[8])


In [9]:
# Load profile files 9 and 10 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p9=pd.read_csv(pro[9])

p10=pd.read_csv(pro[10])



In [10]:
# Load profile files 11 and 12 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p11=pd.read_csv(pro[11])

p12=pd.read_csv(pro[12])


In [11]:
# Load profile files 13 and 14 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p13=pd.read_csv(pro[13])

p14= pd.read_csv(pro[14])



In [12]:
# Load profile files 15 and 16 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p15=pd.read_csv(pro[15])

p16=pd.read_csv(pro[16])

In [13]:
%%time
# Load profile files 17 and 18 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p17=pd.read_csv(pro[17])

p18=pd.read_csv(pro[18])



CPU times: user 17.5 s, sys: 1.24 s, total: 18.8 s
Wall time: 20.4 s


In [14]:
# Load profile files 19 and 20 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p19=pd.read_csv(pro[19])

p20=pd.read_csv(pro[20])



In [15]:
# Load profile files 21 and 22 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p21=pd.read_csv(pro[21])

p22=pd.read_csv(pro[22])



In [16]:
# Load profile files 23 and 24 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p23=pd.read_csv(pro[23])

p24=pd.read_csv(pro[24])



In [17]:
# Load profile files 25 and 26 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p25=pd.read_csv(pro[25])

p26=pd.read_csv(pro[26])


In [18]:
# Load profile files 27 and 28 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p27=pd.read_csv(pro[27])

p28=pd.read_csv(pro[28])



In [19]:
# Load profile files 29 and 30 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p29=pd.read_csv(pro[29])

p30=pd.read_csv(pro[30])


In [20]:
# Load profile files 31 and 32 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p31=pd.read_csv(pro[31])

p32=pd.read_csv(pro[32])


In [21]:
# Load profile files 33 and 34 into pandas.
# The per-file 'id' float cast, join with `sf_o` on 'id', and tract-range filter remain disabled.
p33 = pd.read_csv(pro[33])

p34 = pd.read_csv(pro[34])



# Re-read profile files 17 through 34 from disk, stack them, and save the result.
pd.concat(
    [pd.read_csv(pro[file_idx]) for file_idx in range(17, 35)]
).to_csv('/scratch/public/catherinebui/prof_17_35.csv')

# Re-read profile files 0 through 34 from disk (all decoded as ISO-8859-1), stack them, and save the result.
pd.concat(
    [pd.read_csv(pro[file_idx], encoding='iso-8859-1') for file_idx in range(0, 35)]
).to_csv('/scratch/public/catherinebui/sf_alameda_twitterprofile.csv')

# Stack the 35 profile tables already held in memory and write them as one CSV.
# p4 and p5 are opened with dask (das.read_csv) in this notebook, and pd.concat
# only accepts pandas objects, so any frame exposing .compute() is
# materialized first.
# NOTE(review): this writes to the same path as the concat-from-disk statement
# just before it and therefore overwrites that output - confirm which is wanted.
profile_frames = [p0, p1, p2, p3, p4, p5, p6, p7, p8,
                  p9, p10, p11, p12, p13, p14, p15, p16, p17, p18,
                  p19, p20, p21, p22, p23, p24, p25, p26, p27, p28,
                  p29, p30, p31, p32, p33, p34]
profile_frames = [frame.compute() if hasattr(frame, 'compute') else frame
                  for frame in profile_frames]
pd.concat(profile_frames).to_csv('/scratch/public/catherinebui/sf_alameda_twitterprofile.csv')