In [1]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [2]:
lines = sc.textFile("../Data/filtered_registered_business_sf.csv")

In [3]:
lines.take(5)

['94123,Tournahu George L,3301 Broderick St,San Francisco,CA',
 '94124,Stephens Institute Inc,2225 Jerrold Ave,San Francisco,CA',
 '94105,Stephens Institute Inc,180 New Montgomery St,San Francisco,CA',
 '94108,Stephens Institute Inc,540 Powell St,San Francisco,CA',
 '94107,Stephens Institute Inc,460 Townsend St,San Francisco,CA']

## Create pairs of (zip, (Business Name, City))

In [4]:
def create_key_value_pair(x):
    comma_separated_x = x.split(",")
    return(comma_separated_x[0],(comma_separated_x[1], comma_separated_x[3]))

In [5]:
pairs = lines.map(create_key_value_pair)

In [6]:
pairs.take(5)

[('94123', ('Tournahu George L', 'San Francisco')),
 ('94124', ('Stephens Institute Inc', 'San Francisco')),
 ('94105', ('Stephens Institute Inc', 'San Francisco')),
 ('94108', ('Stephens Institute Inc', 'San Francisco')),
 ('94107', ('Stephens Institute Inc', 'San Francisco'))]

## Count by zip code

In [7]:
pairs.countByKey()

defaultdict(int,
            {'94123': 5750,
             '94124': 6226,
             '94105': 6176,
             '94108': 5558,
             '94107': 9394,
             '94109': 9623,
             '94102': 7962,
             '94133': 6358,
             '94111': 6630,
             '94103': 10919,
             '94158': 558,
             '94127': 2588,
             '94132': 2558,
             '94110': 12459,
             '94545': 410,
             '94117': 6539,
             '94118': 7022,
             '94114': 6493,
             '94062': 93,
             '94112': 6634,
             '94014': 1147,
             '94104': 5438,
             '94116': 4213,
             '94121': 5313,
             '0': 377,
             '94122': 7066,
             '94115': 6268,
             '94005': 202,
             '90058': 4,
             '94577': 431,
             '94601': 382,
             '94587': 276,
             '94134': 2933,
             '94404': 215,
             '94080': 1525,
             '9172

## Count pairs which do not have a zip (empty zip) as a key.

In [8]:
pairs.lookup('')

[('Hartmann Studios Incorporated', ''),
 ('Cardno Entrix', ''),
 ('Bond Blacktop Inc', ''),
 ('Moonka Nishi', ''),
 ('Cooper Jim B', ''),
 ('Ascencion Flores Ismael O', ''),
 ('Opower Inc', ''),
 ('Htut Chris', ''),
 ('Red Oxygen Inc', ''),
 ('Miniclip America Inc', '"London'),
 ('Vip Plumbing And Drain Cleanin', ''),
 ('Intelex Technologies Inc', '"Toronto'),
 ('East & West Alum Craft Inc', 'Burnaby'),
 ('Act Fuels Inc', '"Amsterdam'),
 ('Allstream Inc', 'Mp20d+canada+mb+r3v3v6+winnipeg'),
 ('Pointclickcare', '"Mississauga'),
 ('Sara Gulyas', '"Budapest'),
 ('Malik Alia', 'San Francisco'),
 ('Cantrell Harris & Assoc Inc', 'San Francisco'),
 ('Odotech Inc', '"Montreal'),
 ('Ortiz Jose E', 'San Francisco'),
 ('Law Office Of Scott A Sommer', ''),
 ('Lexa Mary C', 'Oakland'),
 ('Vieira Reynaldo', 'San Francisco'),
 ('Margaret Apartments Lp', 'San Francisco'),
 ('Torres Alvaro', 'Daly+city'),
 ('Magdaluyo Melecio', '94124'),
 ('Barth Roofing Company Inc', 'Tracy'),
 ('Ultra Electronics For

In [9]:
len(pairs.lookup(''))

92

## Filter pairs that do not include “San Francisco” in the city value.

In [10]:
pairs.filter(lambda x: "San Francisco" not in x[1][1]).collect()

[('94108', ('"Fugazi Travel Agency', '170 Grant Ave 4th Fl')),
 ('94111', ('"Fugazi Travel Agency', '400 Sansome St')),
 ('94111', ('"Fugazi Travel Agency', '400 Sansome St')),
 ('94111', ('"Fugazi Travel Agency', '400 Sansome St')),
 ('94111', ('"Fugazi Travel Agency', '400 Sansome St')),
 ('94133', ('Alioto Fish Co Ltd', 'San+francisco')),
 ('94545', ('Allan Auto Sprinkler Corp', 'Hayward')),
 ('94062', ('Antonchuk Richard A', 'Redwood+city')),
 ('94132', ('Atlantic Richfield Co', 'San+francisco')),
 ('94014', ('Aunt Anns Corp Headquarters', 'Daly+city')),
 ('0', ('Balliet Brothers Const Corp', 'Mexico')),
 ('94102', ('Baumell Pearl Co Inc', 'San+francisco')),
 ('94108', ('Berlitz School Of Languages', 'San+francisco')),
 ('94005', ('Birite Rest Supply Co Inc', 'Brisbane')),
 ('90058', ('Bradley E B Co Inc', 'Vernon')),
 ('94545', ('Bradley E B Co Inc', 'Hayward')),
 ('94577', ('Brayer Electric Co', 'San+leandro')),
 ('94107', ('Bressie & Company', 'San+francisco')),
 ('94601', ('Bri