In [123]:
import requests,time,pandas as pd

In [124]:
# Input data and output locations shared by the extract/merge functions.
# NOTE(review): these are absolute paths on a single user's machine; the
# notebook will not run elsewhere until they are pointed at local files.
# NOTE: despite its "_file_" name, company_ids_file_import is the DataFrame
# returned by pd.read_csv (loaded when this cell runs), not a path string.
company_ids_file_import = pd.read_csv(r'C:\Users\amorrow\Documents\builtinNYCjobs\builtin_nyc_companyids.csv') # Company IDs, read eagerly into a DataFrame
company_profile_file_export = r'C:\Users\amorrow\Documents\builtinNYCjobs\company_extract.csv' # Path the company profiles CSV is written to
job_details_file_export = r'C:\Users\amorrow\Documents\builtinNYCjobs\job_extract.csv' # Path the job details CSV is written to

In [125]:
# Empty company-profile frame: it holds no rows, only the expected column
# names, grouped below by theme.
company_df = pd.DataFrame(
    columns=[
        # Overview
        'id', 'title', 'year_founded', 'total_employees',
        # Perks / culture
        'mission', 'why_work_with_us', 'perks_overview',
        # Location
        'street_address_1', 'street_address_2', 'city', 'state', 'zipcode',
        # Contact information
        'facebook', 'instagram', 'twitter', 'url',
        # Funding, if present
        'funding',
    ]
)

In [126]:
# Empty job-details frame: it holds no rows, only the expected column names,
# grouped below by theme.
job_df = pd.DataFrame(
    columns=[
        # Overview
        'company_id', 'job_id', 'title', 'experience_level',
        'category_id', 'category_type',
        # Posting date / location(s)
        'created_at', 'location', 'original_location',
        # Application link
        'how_to_apply',
    ]
)

In [127]:
# Job category mapping: numeric category ID -> readable label.
# The IDs are contiguous, so the labels are listed in ID order and numbered
# from 146; keep the order intact when adding a label.
job_category_id = dict(enumerate(
    (
        "Finance",                  # 146
        "Data + Analytics",         # 147
        "Design + UX",              # 148
        "Developer + Engineering",  # 149
        "HR + Recruiting",          # 150
        "Internships",              # 151
        "Legal",                    # 152
        "Marketing",                # 153
        "Operations",               # 154
        "Product",                  # 155
        "Project Management",       # 156
        "Sales",                    # 157
        "Content",                  # 158
    ),
    start=146,
))

In [128]:
def extract_csv(csv_file):
    """Return the ``company_id`` column of ``csv_file`` as a list of strings.

    ``csv_file`` is indexed with ``'company_id'`` and the result is iterated,
    so a DataFrame with that column works. Every value is passed through
    ``str()`` unchanged otherwise (no stripping, no de-duplication).
    """
    return [str(company_id) for company_id in csv_file['company_id']]
# print(extract_csv(company_ids_file_import))
# print(len(extract_csv(company_ids_file_import)))

In [129]:
def extract_company_profile(company_id, timeout=30):
    """Fetch one company's profile from api.builtin.com as a one-row DataFrame.

    Parameters
    ----------
    company_id : str or int
        Identifier interpolated into
        ``https://api.builtin.com/companies/{company_id}``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole extract indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row with the columns listed in ``fields``. A field that is absent
        from the JSON response is stored as ``None``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    ValueError
        If the response body cannot be decoded as JSON.

    Notes
    -----
    The HTTP status code is not checked: an error response whose body is a
    JSON object simply yields a row of ``None`` values.
    """
    # The order of this tuple is the column order of the returned frame.
    fields = (
        'id', 'title', 'year_founded', 'total_employees', 'funding',
        'mission', 'why_work_with_us', 'perks_overview',
        'street_address_1', 'street_address_2', 'city', 'state', 'zipcode',
        'facebook', 'instagram', 'twitter', 'url',
    )

    company_profile_url = f"https://api.builtin.com/companies/{company_id}"
    response = requests.get(company_profile_url, timeout=timeout)
    data = response.json()

    company_profile = {field: data.get(field) for field in fields}
    return pd.DataFrame([company_profile])

In [130]:
def load_csv(df, file_path):
    """Write ``df`` to ``file_path`` as CSV and print a confirmation.

    The header row is written; the DataFrame index is not.
    """
    df.to_csv(file_path, header=True, index=False)
    print('CSV - Created')

In [131]:
def company_profile_main():
    """Download every company profile and write them all to one CSV.

    Company IDs come from ``extract_csv(company_ids_file_import)``; each ID is
    stripped of surrounding whitespace and passed to
    ``extract_company_profile``. The collected rows are concatenated and
    written to ``company_profile_file_export`` through ``load_csv``.

    A company whose request fails, or whose response is not a JSON object, is
    reported and skipped so that one bad ID does not discard the profiles
    already fetched. The running count is printed after every 50th company.
    """
    results = []
    company_ids = extract_csv(company_ids_file_import)

    for progress, raw_id in enumerate(company_ids, start=1):
        company_id = raw_id.strip()
        try:
            results.append(extract_company_profile(company_id))
        except (requests.RequestException, ValueError, AttributeError) as err:
            # Best effort, in line with job_profile_main: report and move on.
            # AttributeError covers a JSON body that is not an object.
            print(err)
            print(f"Company Profile Error - {company_id}")

        if progress % 50 == 0:
            print(progress)

        # Throttle after every request, not only after each 50th one.
        time.sleep(1)

    # The empty template frame guarantees a header row even when nothing
    # could be fetched.
    results.append(company_df)
    final = pd.concat(results, ignore_index=True)
    load_csv(final, company_profile_file_export)

    print('Done')

In [133]:
def extract_job_information(company_id, timeout=30):
    """Fetch the job postings of one company as a DataFrame (one row per job).

    Parameters
    ----------
    company_id : str or int
        Identifier interpolated into
        ``https://api.builtin.com/companies/{company_id}/jobs``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block the whole extract indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per entry of the response's ``'jobs'`` list; empty (no
        columns) when that list is empty. ``category_type`` is the label from
        ``job_category_id`` or ``None`` when the category ID is not mapped.

    Raises
    ------
    KeyError
        If the JSON object has no ``'jobs'`` key.
    TypeError
        If the JSON body cannot be indexed by ``'jobs'`` or the value under
        it is not iterable.
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    ValueError
        If the response body cannot be decoded as JSON.
    """
    jobs_url = f"https://api.builtin.com/companies/{company_id}/jobs"
    response = requests.get(jobs_url, timeout=timeout)
    data = response.json()

    results = []
    for job in data['jobs']:
        category_id = job.get('category_id')
        results.append({
            'company_id': job.get('company_id'),
            'job_id': job.get('id'),
            'title': job.get('title'),
            'experience_level': f"[{job.get('experience_level')} - years]",
            'category_id': category_id,
            # .get() rather than [...]: an unmapped category must not raise
            # KeyError and cost the caller every job of this company.
            'category_type': job_category_id.get(category_id),
            'created_at': job.get('created_at'),
            'location': job.get('location'),
            'original_location': job.get('original_location'),
            'how_to_apply': job.get('how_to_apply'),
        })
    return pd.DataFrame(results)
# print(extract_job_information(3418))

In [137]:
def job_profile_main():
    """Download the job postings of every company and write them to one CSV.

    Company IDs come from ``extract_csv(company_ids_file_import)``; each ID is
    stripped of surrounding whitespace and passed to
    ``extract_job_information``. The per-company frames are concatenated and
    written to ``job_details_file_export`` through ``load_csv``.

    The loop is best effort: a company that raises ``KeyError``,
    ``TypeError``, a requests exception or a JSON decoding ``ValueError`` is
    reported and skipped, so one bad response does not discard the results
    already collected. ``progress`` counts successful companies only; the
    count and completed fraction are printed at every 50th success.
    """
    results = []
    progress = 0
    company_ids = extract_csv(company_ids_file_import)
    company_id_len = len(company_ids)
    for j in company_ids:
        # Stripped outside the try so the handlers can always report the ID.
        new_str = j.strip()
        try:
            results.append(extract_job_information(new_str))
            progress += 1
            if progress % 50 == 0:
                print(f"Count:{progress}\nCompletion:{round(progress/company_id_len,2)}")
        except KeyError as ke:
            print(ke)
            print(f"Key Error - {new_str}")
            print(f'Progress:{progress}')
        except TypeError as te:
            print(te)
            print(f"Type Error - {new_str}")
            print(f'Progress:{progress}')
        except (requests.RequestException, ValueError) as err:
            # Network failure, timeout or a body that is not valid JSON.
            print(err)
            print(f"Request Error - {new_str}")
            print(f'Progress:{progress}')

        # Throttle after every attempt, including failed ones.
        time.sleep(1)

    # The empty template frame guarantees a header row even when nothing
    # could be fetched.
    results.append(job_df)
    final = pd.concat(results, ignore_index=True)
    load_csv(final, job_details_file_export)

    print('Done')

In [144]:
def merge_dataframe(df1,df2,filepath):
    """Attach the ``title`` column of ``df2`` to ``df1`` and write the result.

    ``df2``'s ``id`` column is renamed to ``company_id`` and used as the key
    of a left join, so every row of ``df1`` is kept. The last column of the
    merged frame is then moved to the front and the frame is written to
    ``filepath`` through ``load_csv``.

    Note: when ``df1`` has its own ``title`` column, pandas disambiguates the
    two with its default suffixes (``title_x`` from ``df1``, ``title_y`` from
    ``df2``).
    """
    company_titles = df2[['id','title']].rename(columns={'id':'company_id'})
    joined = pd.merge(df1, company_titles, how='left', on='company_id')

    # Rotate the column order right by one: the last column becomes the first.
    names = list(joined.columns)
    rotated = [*names[-1:], *names[:-1]]

    load_csv(joined[rotated], filepath)

In [145]:
def main():
    """Run the full pipeline and print how long it took.

    Steps, in order:
      1. ``company_profile_main()`` writes the company profiles CSV.
      2. ``job_profile_main()`` writes the job details CSV.
      3. Both CSVs are read back from disk and handed to ``merge_dataframe``,
         which writes the combined file ``job_extract_w_company.csv``.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # a system clock adjustment during the run.
    start_time = time.perf_counter()

    print('Start Company Extract')

    company_profile_main()

    print('End Company Extract\nStart Job Profile Extract')

    job_profile_main()

    print('End Job Profile Extract\nStart File Merge')

    company_df_import = pd.read_csv(company_profile_file_export)
    job_df_import = pd.read_csv(job_details_file_export)
    job_df_export_w_company = r'C:\Users\amorrow\Documents\builtinNYCjobs\job_extract_w_company.csv'

    merge_dataframe(job_df_import,company_df_import,job_df_export_w_company)

    # Convert to minutes first and round last, so the printed value really
    # has two decimals; the unit is spelled out in the message.
    duration_minutes = round((time.perf_counter() - start_time) / 60, 2)

    print(f'Runtime: {duration_minutes} minutes')

In [147]:
# Run the whole pipeline: company extract, job extract, then the file merge.
# This performs network requests and writes the CSV files configured above.
main()