In [1]:
from pyspark import SparkContext
# Reuse the already-running SparkContext if there is one; otherwise start a new one.
sc = SparkContext.getOrCreate()

## Load business and supervisor data

In [2]:
# Read the business CSV as lines and reduce it to distinct
# (zip code, business name) pairs keyed by zip code.
# NOTE(review): a plain split(",") mis-parses quoted fields that contain
# commas -- confirm the filtered file has none, else parse with the csv module.
business = sc.textFile("../Data/filtered_registered_business_sf.csv")
business_zip_name_pair = (
    business.map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[1]))
    .distinct()
    # This RDD feeds take() and every join in the notebook; cache it so the
    # read/parse/distinct lineage is not recomputed for each action.
    .cache()
)

In [3]:
# Read the supervisor CSV as lines and reduce it to distinct
# (zip code, supervisor id) pairs keyed by zip code.
# NOTE(review): split(",") assumes no quoted/embedded commas -- confirm.
supervisor = sc.textFile("../Data/supervisor_sf.csv")
supervisor_zip_id_pair = (
    supervisor.map(lambda line: line.split(","))
    .map(lambda fields: (fields[0], fields[1]))
    .distinct()
    # This RDD feeds take() and every join in the notebook; cache it so the
    # read/parse/distinct lineage is not recomputed for each action.
    .cache()
)

In [4]:
# Preview five parsed (zip code, business name) pairs.
business_zip_name_pair.take(5)

[('94124', 'Stephens Institute Inc'),
 ('94108', 'Stephens Institute Inc'),
 ('94102', 'Stephens Institute Inc'),
 ('94133', 'Stephens Institute Inc'),
 ('94111', 'Stephens Institute Inc')]

In [5]:
# Preview five parsed (zip code, supervisor id) pairs.
supervisor_zip_id_pair.take(5)

[('94102', '8'),
 ('94103', '6'),
 ('94103', '3'),
 ('94103', '5'),
 ('94105', '6')]

## Create pairs of business and supervisor id where both exist (inner join).

In [6]:
# Inner join on the zip-code key. Joins are lazy transformations, so this cell
# only displays the resulting RDD handle; nothing is computed yet.
business_zip_name_pair.join(supervisor_zip_id_pair)

PythonRDD[21] at RDD at PythonRDD.scala:53

In [7]:
# Inner join on zip code: only zips present in both RDDs survive. Drop the
# zip key and de-duplicate the remaining (business name, supervisor id) pairs.
business_supervisor = (
    business_zip_name_pair
    .join(supervisor_zip_id_pair)
    .values()
    .distinct()
)

In [8]:
# Count the distinct (business name, supervisor id) pairs currently bound to business_supervisor.
business_supervisor.count() 

377612

In [9]:
# Preview five (business name, supervisor id) pairs.
business_supervisor.take(5)

[('Stephens Institute Inc', '8'),
 ('Cal Parlor Car Tours Inc', '8'),
 ('D & R Plumbing & Heating Inc', '8'),
 ('Jones Schiller & Company Llp', '8'),
 ('Dudum Basim & Adib', '8')]

## Create pairs of business and supervisor id for all businesses, including those without a district supervisor (left outer join)

In [10]:
# Left outer join on the zip-code key: every business pair is kept, matched or not.
# Lazy transformation, so this cell only displays the resulting RDD handle.
business_zip_name_pair.leftOuterJoin(supervisor_zip_id_pair)

PythonRDD[42] at RDD at PythonRDD.scala:53

In [11]:
# Left outer join on zip code: every business is kept, and a business whose zip
# has no supervisor is paired with None. Drop the zip key and de-duplicate.
business_supervisor = (
    business_zip_name_pair
    .leftOuterJoin(supervisor_zip_id_pair)
    .values()
    .distinct()
)

In [12]:
# Count the distinct pairs currently bound to business_supervisor (triggers the computation).
business_supervisor.count()

417034

## Create pairs of business and supervisor id for every supervisor, including those with no business in their zip codes (right outer join).

In [13]:
# Right outer join on the zip-code key: every supervisor pair is kept, matched or not.
# Lazy transformation, so this cell only displays the resulting RDD handle.
business_zip_name_pair.rightOuterJoin(supervisor_zip_id_pair)

PythonRDD[62] at RDD at PythonRDD.scala:53

In [14]:
# Right outer join on zip code: every supervisor is kept, and a supervisor whose
# zip has no business is paired with None. Drop the zip key and de-duplicate.
business_supervisor = (
    business_zip_name_pair
    .rightOuterJoin(supervisor_zip_id_pair)
    .values()
    .distinct()
)

In [15]:
# Count the distinct pairs currently bound to business_supervisor (triggers the computation).
business_supervisor.count()

377612

In [16]:
# Shut down the SparkContext now that the analysis is finished.
sc.stop()