In [291]:
#Import required packages
from pyspark import SparkConf, SparkContext
from pyspark.sql import Row
from pyspark.sql import SQLContext
import re

In [292]:
#Build the Spark configuration: "local" master, application name "Project_BigData"
conf = SparkConf()
conf.setMaster("local")
conf.setAppName("Project_BigData")
#Get (or start) the SparkContext for that configuration
sc = SparkContext.getOrCreate(conf=conf)
#SQLContext tied to the SparkContext above; used later to build the DataFrame
sqlcon = SQLContext.getOrCreate(sc)

In [293]:
#Some global variables
# pat1 matches a double-quoted run of letters, digits, dots, spaces and commas
# that contains a ", " separator. Group 1 is greedy, so the two groups are the
# text before and after the LAST ", " inside the quotes.
# The lowercase range is a-z (whole alphabet) so that words containing "y" or
# "z" are matched as well.
pat1 = re.compile(r'"([a-zA-Z0-9. ,]+), ([a-zA-Z0-9. ,]+)"')
# pat2 matches a double-quoted field that consists only of commas.
pat2 = re.compile(r'"([,]+)"')

In [294]:
#Remove commas that sit inside quoted text; keep going until none are left
def replaceCommaWithinQuotes(line):
    """Return line with each ', ' that pat1 finds inside double quotes replaced by a space.

    One pass of pat1.sub rewrites a single ', ' per match, so the
    substitution is repeated until pat1 no longer matches anywhere in line.
    """
    while pat1.search(line) is not None:
        line = pat1.sub(r'"\1 \2"', line)
    return line

In [295]:
#Load the H-1B disclosure CSV from HDFS as an RDD of text lines
h1b_data = sc.textFile(
    "hdfs://quickstart.cloudera:8020/user/cloudera/Project/H-1B_Disclosure_Data_FY17.csv"
)

In [296]:
#Build a {header: value} dictionary for one line of the file
def createRow(line, headers):
    """Map each entry of headers to the matching comma-separated field of line.

    line    -- one raw text line of the CSV.
    headers -- list of column names; the empty-string header is stored under
               the key "S_NO" with its value converted by int().

    Returns a dict of header -> field text (int for "S_NO").

    Raises IndexError when the line runs out of fields before headers does,
    and ValueError when the "S_NO" field is not an integer.
    """
    # Commas inside quoted words are replaced by blanks first. A quoted field
    # holding only commas (",") can still be present after this step.
    cleaned = replaceCommaWithinQuotes(line)
    # A pat2.sub(r'""', ...) pass was tried here and left disabled in the
    # original because it produced unexpected results.

    fields = cleaned.split(",")
    row = {}
    pos = 0  # position in fields; can run ahead of the header position
    for header in headers:
        # A field that is exactly one double quote is skipped.
        if fields[pos] == '"':
            pos += 1
        value = fields[pos]
        if header == "":
            row["S_NO"] = int(value)
        else:
            row[header] = value
        pos += 1
    return row

In [297]:
#The first line of the file carries the column names.
#Run this before the cell that filters the header out of h1b_data: after that
#filter, take(1) would return a data row instead.
first_lines = h1b_data.take(1)
headers_string = first_lines[0]
headers = headers_string.split(",")

In [298]:
#Keep only the lines that differ from the header line
h1b_data = h1b_data.filter(lambda line: line != headers_string)

In [299]:
#Turn every text line into a Row built from the dictionary createRow returns
h1_data_map = h1b_data.map(lambda line: Row(**createRow(line, headers)))

In [300]:
#Build the DataFrame from the Rows through SQLContext, then mark it for caching
h1b_data_frame = sqlcon.createDataFrame(h1_data_map)
h1b_data_frame = h1b_data_frame.cache()

In [301]:
#Preview only: rows whose S_NO is below 20, converted to pandas for display
h1b_data_frame_1 = h1b_data_frame.where(h1b_data_frame["S_NO"] < 20)
h1b_data_frame_1.toPandas()

Unnamed: 0,AGENT_ATTORNEY_CITY,AGENT_ATTORNEY_NAME,AGENT_ATTORNEY_STATE,AGENT_REPRESENTING_EMPLOYER,AMENDED_PETITION,CASE_NUMBER,CASE_STATUS,CASE_SUBMITTED,CHANGE_EMPLOYER,CHANGE_PREVIOUS_EMPLOYMENT,...,TOTAL_WORKERS,VISA_CLASS,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_UNIT_OF_PAY,WILLFUL_VIOLATOR,WORKSITE_CITY,WORKSITE_COUNTY,WORKSITE_POSTAL_CODE,WORKSITE_STATE
0,NEW YORK,"""ELLSWORTH CHAD""",NY,Y,0,I-200-16055-173457,CERTIFIED-WITHDRAWN,2016-02-24,0,0,...,1,H-1B,65811.0,67320.0,Year,N,RIVERWOODS,LAKE,60015,IL
1,NEW YORK,"""ELLSWORTH CHAD""",NY,Y,0,I-200-16064-557834,CERTIFIED-WITHDRAWN,2016-03-04,0,0,...,1,H-1B,53000.0,57200.0,Year,N,RIVERWOODS,LAKE,60015,IL
2,WASHINGTON,"""BURKE KAREN""",DC,Y,0,I-200-16063-996093,CERTIFIED-WITHDRAWN,2016-03-10,0,0,...,2,H-1B,77000.0,0.0,Year,N,WASHINGTON,,20007,DC
3,,"""",,N,0,I-200-16272-196340,WITHDRAWN,2016-09-28,0,0,...,1,H-1B,102000.0,0.0,Year,N,JERSEY CITY,HUDSON,7302,NJ
4,ATLANTA,"""SCOFIELD EILEEN""",GA,Y,0,I-200-15053-636744,CERTIFIED-WITHDRAWN,2015-02-22,1,0,...,1,H-1B,132500.0,0.0,Year,N,NEW YORK,NEW YORK,10036,NY
5,ATLANTA,"""SCOFIELD EILEEN""",GA,Y,0,I-200-15071-336195,CERTIFIED-WITHDRAWN,2015-03-12,0,0,...,1,H-1B,71750.0,0.0,Year,N,ATLANTA,FULTON,30303,GA
6,,"""",,N,0,I-200-16056-842817,CERTIFIED-WITHDRAWN,2016-02-25,0,0,...,1,H-1B,61000.0,0.0,Year,N,EDISON,MIDDLESEX,8837,NJ
7,,"""",,N,0,I-200-16056-757335,CERTIFIED-WITHDRAWN,2016-02-25,0,0,...,1,H-1B,60500.0,0.0,Year,N,EDISON,MIDDLESEX,8837,NJ
8,,"""",,N,0,I-200-16058-469533,CERTIFIED-WITHDRAWN,2016-02-27,0,0,...,1,H-1B,60450.0,0.0,Year,N,NEW YORK,NEW YORK,10005,NY
9,,"""",,N,0,I-200-16059-084066,CERTIFIED-WITHDRAWN,2016-02-28,0,0,...,1,H-1B,50000.0,0.0,Year,N,ISELIN,MIDDLESEX,8830,NJ
