In [1]:
import csv
from scipy import spatial
import numpy as np
import sys
# Raise the csv per-field size cap as far as the platform allows.
# sys.maxsize does not fit in a C long on some platforms (e.g. 64-bit
# Windows), where csv.field_size_limit raises OverflowError, so back off
# by powers of ten until a value is accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 10

131072

In [2]:
# Tweet CSV file; passed to pennTwitterData as the file whose rows it yields.
inputFile = "pennData/sample.csv"
# Location list; passed to pennTwitterData, which reads one '_'-separated
# entry per line from it.
locFile = "pennData/locationlist.txt"

In [3]:
#twitter data format
# id
# message_id
# term
# category
# message
# created_time
# coordinates
# coordinates_state
# coordinates_address
# from_id
# in_reply_to_message_id
# in_reply_to_from_id
# retweet_message_id
# location
# friend_count
# followers_count
# time_zone
# lang
# has_hashtag
class pennTwitterData(object):
    """Iterable over the rows of a tweet CSV file, plus a location lookup.

    Parameters
    ----------
    fname : str
        Path to the CSV file. Iterating the instance yields each row as a
        list of strings (the header row included, if the file has one).
    flocations : str
        Path to a text file with one entry per line. Only the text before
        the first '.' of a line is used; it is split on '_' and must give
        at least three fields. Blank lines are ignored.
        NOTE(review): the fields look like county name, state key and a
        rate value (judging by the attribute names) - confirm with the
        data owner.

    Attributes
    ----------
    fname : str
        The CSV path given to the constructor.
    fcities : dict
        Maps the second field of each entry to the list of first fields
        seen with it, in file order.
    frates : dict
        Maps "<first field>,<second field>" to the entry's third field.

    Raises
    ------
    IndexError
        If a non-blank line yields fewer than three '_'-separated fields.
    """

    def __init__(self, fname, flocations):
        self.fname = fname
        self.fcities = {}
        self.frates = {}
        with open(flocations, 'r') as f:
            for line in f:
                if not line.strip():
                    # A blank (typically trailing) line carries no entry.
                    continue
                # Drop the line terminator first so that an entry without
                # a '.' does not keep "\n" glued to its last field.
                s_line = line.rstrip('\r\n').split('.')[0].split('_')
                # Read all three fields before touching the dictionaries so
                # a malformed line cannot leave them half-updated.
                first, second, third = s_line[0], s_line[1], s_line[2]

                self.fcities.setdefault(second, []).append(first)
                self.frates[",".join(s_line[:2])] = third

    def __iter__(self):
        # newline='' lets the csv module handle line endings itself, which
        # keeps line breaks embedded in quoted fields intact; the with-block
        # closes the file once iteration ends.
        with open(self.fname, 'r', newline='') as f:
            for row in csv.reader(f):
                yield row

In [8]:
# DATA FORMAT - http://download.geonames.org/export/dump/
        #0  geonameid         : integer id of record in geonames database
        #1  name              : name of geographical point (utf8) varchar(200)
        #2  asciiname         : name of geographical point in plain ascii characters, varchar(200)
        #3  alternatenames    : alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)
        #4  latitude          : latitude in decimal degrees (wgs84)
        #5  longitude         : longitude in decimal degrees (wgs84)
        #6  feature class     : see http://www.geonames.org/export/codes.html, char(1)
        #7  feature code      : see http://www.geonames.org/export/codes.html, varchar(10)
        #8  country code      : ISO-3166 2-letter country code, 2 characters
        #9  cc2               : alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters
        #10 admin1 code       : fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)
        #11 admin2 code       : code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) 
        #12 admin3 code       : code for third level administrative division, varchar(20)
        #13 admin4 code       : code for fourth level administrative division, varchar(20)
        #14 population        : bigint (8 byte int) 
        #15 elevation         : in meters, integer
        #16 dem               : digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.
        #17 timezone          : the timezone id (see file timeZone.txt) varchar(40)
        #18 modification date : date of last modification in yyyy-MM-dd format
class geoLocFinder(object):
    """Nearest-neighbour lookup from a coordinate pair to a county record.

    Parameters
    ----------
    countyCodes : str
        Path to a tab-delimited file whose first column is a county code
        (e.g. 'US.NY.061'); the remaining columns are kept as the record.
    flocations : str
        Path to a tab-delimited location dump in the column layout listed
        above this class: latitude in column 4, longitude in column 5,
        admin1 code in column 10 and admin2 code in column 11.

    Attributes
    ----------
    code2county : dict
        County code -> list of the remaining columns of its row.
    cord2loc : dict
        (latitude, longitude) floats -> 'US.<admin1>.<admin2>' code. Rows
        sharing identical coordinates overwrite each other (last wins).
    tree : scipy.spatial.KDTree
        KD-tree built over the keys of ``cord2loc``.
    """

    def __init__(self, countyCodes, flocations):

        # from county code to the rest of the county row
        self.code2county = {}
        with open(countyCodes, 'r', newline='') as f:
            for row in csv.reader(f, delimiter='\t'):
                if not row:
                    # blank line
                    continue
                self.code2county[row[0]] = row[1:]

        # from coordinates to county code
        self.cord2loc = {}
        with open(flocations, 'r', newline='') as f:
            # The dump is plain tab-delimited text with no quoting, so a
            # literal '"' in a place name must not be read as a quote
            # character: with default quoting it merges columns or rows.
            for row in csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE):
                # Skip blank/short rows and places without both admin codes.
                if len(row) < 12 or row[10] == '' or row[11] == '':
                    continue

                countyCode = 'US.' + row[10] + '.' + row[11]
                self.cord2loc[(float(row[4]), float(row[5]))] = countyCode

        # nearest-neighbour index over the known coordinates
        self.tree = spatial.KDTree(list(self.cord2loc))

    def findLoc(self, latitude, longitude):
        """Return the county record of the known place nearest to a point.

        The result is a new list: the county's row from ``code2county``
        followed by the distance to the matched place and its county code.
        The distance is the KD-tree's Euclidean distance computed directly
        on (latitude, longitude) values, i.e. in degree units, not a
        geodesic distance.

        Raises
        ------
        KeyError
            If the matched place's county code is missing from the
            county-code file.
        """
        distance, index = self.tree.query(np.array([latitude, longitude]))

        nearest = self.tree.data[index]
        # Convert back to plain floats so the key has the same form as the
        # one stored in __init__.
        locCode = self.cord2loc[(float(nearest[0]), float(nearest[1]))]
        return self.code2county[locCode] + [distance, locCode]

In [5]:
# Parse the location list now; the tweet CSV itself is only read when ptd is iterated.
ptd = pennTwitterData(inputFile, locFile)

In [9]:
# Load the county-code table and the location dump, and build the KD-tree
# used for nearest-place lookups.
glf = geoLocFinder("geoname/US/us_counties.txt","geoname/US/US.txt" )

In [11]:
# Show the names the location list recorded under the 'NY' key.
ptd.fcities['NY']

['Albany',
 'Allegany',
 'Bronx',
 'Broome',
 'Cattaraugus',
 'Chenango',
 'Columbia',
 'Cortland',
 'Delaware',
 'Erie',
 'Fulton',
 'Herkimer',
 'Kings',
 'Lewis',
 'Madison',
 'Monroe',
 'Montgomery',
 'Niagara',
 'Onondaga',
 'Ontario',
 'Oswego',
 'Otsego',
 'Putnam',
 'Queens',
 'Rensselaer',
 'Richmond',
 'Rockland',
 'Saratoga',
 'Schenectady',
 'Schoharie',
 'Steuben',
 'Tioga',
 'Tompkins',
 'Warren',
 'Wayne',
 'Yates']

In [10]:
def _parse_coordinates(cell):
    """Return (lat, lon) floats from a cell such as "['40.85', '-73.93']".

    The two numbers are taken from between the single quotes. Returns None
    when the cell does not contain two quoted numbers (e.g. it is empty).
    """
    quoted = cell.split("'")
    try:
        return float(quoted[1]), float(quoted[3])
    except (IndexError, ValueError):
        return None


# Columns echoed as-is for every tweet, next to the geocoded coordinates.
_ECHO_COLUMNS = ("coordinates_address", "coordinates_state", "location")

headings = []
for i, line in enumerate(ptd):
    print (i)
    if i == 0:
        # First row is the header: remember the column names.
        headings = line
        continue

    # zip pairs each cell with its column name and stops at the shorter of
    # the two, so a row with more cells than the header cannot index past it.
    for heading, item in zip(headings, line):
        if heading == "coordinates":
            parsed = _parse_coordinates(item)
            if parsed is None:
                # No usable coordinates: show the raw cell, skip the lookup.
                print (heading, ":", item)
                continue
            lat, lon = parsed

            print (glf.findLoc(lat, lon))
            print (heading, ":", item)
            print (lat, lon)
        elif heading in _ECHO_COLUMNS:
            print (heading, ":", item)
    

0
1
['New York County', 'New York County', '5128594', 0.00093562439044885448, 'US.NY.061']
coordinates : ['40.856228', '-73.935783']
40.856228 -73.935783
coordinates_state : NY
coordinates_address : NY
location : iPhone: 40.856228,-73.935783
2
['Santa Cruz County', 'Santa Cruz County', '5393068', 0.00059478567568957842, 'US.CA.087']
coordinates : ['36.963333', '-121.990479']
36.963333 -121.990479
coordinates_state : CA
coordinates_address : CA
location : iPhone: 36.963333,-121.990479
3
['New Haven County', 'New Haven County', '4839373', 0.0030578714819281955, 'US.CT.009']
coordinates : ['41.489857', '-73.032043']
41.489857 -73.032043
coordinates_state : CT
coordinates_address : CT
location : iPhone: 41.489857,-73.032043
4
['Queens County', 'Queens County', '5133268', 0.0019673357618870718, 'US.NY.081']
coordinates : ['40.774963', '-73.930779']
40.774963 -73.930779
coordinates_state : NY
coordinates_address : NY
location : iPhone: 40.774963,-73.930779
5
['Queens County', 'Queens County'

In [142]:
#

In [140]:



# Scratch cell: rebuilds a KD-tree by hand and compares a manual lookup with
# glf.findLoc for the same point.
# NOTE(review): `cord2loc` is not defined in any visible cell - presumably a
# leftover kernel variable equivalent to glf.cord2loc; confirm, otherwise this
# cell fails on a fresh kernel.
data = []
for (lat, long) in cord2loc:
    data.append((float(lat),float(long)))
tree = spatial.KDTree(data)
# Not the last expression of the cell, so this value is never displayed.
tree.data[40]

pts = np.array([52.39, 173.60])
# query returns (distance, index); result[1] is the index of the nearest point.
result = tree.query(pts)
print (result[1])
cords = tree.data[result[1]]
cord2loc[(cords[0], cords[1])]


glf.findLoc(52.39, 173.60)
# NOTE(review): `dd` is not defined in any visible cell either; these two
# lines look like an unrelated experiment - confirm before relying on them.
dd[0.1] = 1
dd

['Aleutians West Census Area', 'Aleutians West Census Area', '5879164']