In [1]:
import os
import re
from dotenv import load_dotenv
import pandas as pd

pd.options.display.max_rows = 999

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.ext.declarative import declarative_base
import json

%load_ext lab_black

# get JSON data

In [15]:
def get_analytics_engine():
    """Create a SQLAlchemy engine for the analytics database.

    Environment variables are first loaded from a local ``.env`` file via
    ``load_dotenv``; the connection URL is then read from the
    ``ANALYTICS_DB_AUTH`` environment variable.

    Returns:
        The engine produced by ``sqlalchemy.create_engine`` (``echo=False``).

    Raises:
        RuntimeError: if ``ANALYTICS_DB_AUTH`` is unset or empty.
    """
    # take environment variables from .env
    load_dotenv()

    db_url = os.environ.get("ANALYTICS_DB_AUTH")
    if not db_url:
        # fail with an actionable message instead of an opaque TypeError from re.sub
        raise RuntimeError(
            "ANALYTICS_DB_AUTH is not set; define it in the environment or in .env"
        )

    # Jupyter fix: the run of digits that directly follows "2019" in the URL is
    # replaced by a literal "$$" ("$" is not special in a Python re replacement).
    # NOTE(review): presumably this restores characters lost to "$" interpolation
    # when the variable is loaded - confirm against the real connection string.
    fixed_url = re.sub(r"(?<=2019)\d+", "$$", db_url)

    # `encoding` is intentionally not passed: it is a legacy Python 2 option whose
    # default is already utf-8, and it is rejected by SQLAlchemy 2.0.
    return create_engine(fixed_url, echo=False)


def get_linkedin_data(scraped_job_id=3):
    """Load raw scraped rows for one scraping job from the analytics database.

    Args:
        scraped_job_id: value matched against ``raw_data.scraped_job_id``.
            Defaults to 3, the job id this notebook originally hard-coded.
            It is coerced with ``int()`` before being placed in the query.

    Returns:
        pandas.DataFrame with the columns ``id``, ``value``,
        ``attribution_id`` and ``migrated``.

    Raises:
        TypeError, ValueError: if ``scraped_job_id`` cannot be converted to int.
    """
    # int() guarantees that only a number is ever interpolated into the SQL text
    query = (
        "SELECT id, value, attribution_id, migrated "
        f"FROM raw_data WHERE scraped_job_id = {int(scraped_job_id)}"
    )

    # get analytics engine
    engine = get_analytics_engine()
    try:
        # return selected fields in a dataframe
        return pd.read_sql_query(query, engine)
    finally:
        # a new engine is built on every call, so release its connection pool
        engine.dispose()

In [16]:
def extract_profile(raw_json):
    """Extract selected fields from a single LinkedIn profile.

    Args:
        raw_json: JSON text (str or bytes) describing one profile.

    Returns:
        A dict values view holding, in this order, the profile's
        ``flagshipProfileUrl``, ``fullName``, ``headline``, ``summary``,
        ``location`` and ``industry``. A field missing from the profile is
        ``None``. If ``raw_json`` cannot be parsed, or does not decode to a
        JSON object, every field is ``None``.
    """

    # information to extract from given profile
    keys = [
        "flagshipProfileUrl",
        "fullName",
        "headline",
        "summary",
        "location",
        "industry",
    ]

    try:
        raw_dict = json.loads(raw_json)
    except (ValueError, TypeError):
        # ValueError: malformed JSON text.
        # TypeError: input is not str/bytes at all (e.g. None).
        raw_dict = None

    # valid JSON is not necessarily an object ("null", a number, a list, ...);
    # treat anything that is not a dict as a profile with no usable fields
    if not isinstance(raw_dict, dict):
        raw_dict = {}

    # store info in new dictionary (value is None if key is not found)
    profile_dict = {key: raw_dict.get(key) for key in keys}

    # return values only
    return profile_dict.values()

def extract_all_profiles(linkedin_data):
    """Build a dataframe of profile fields from the raw scraped rows.

    Each entry of ``linkedin_data["value"]`` is passed to ``extract_profile``;
    the per-profile results become the rows of the returned dataframe, whose
    columns are ``url``, ``name``, ``headline``, ``summary``, ``location`` and
    ``industry``.
    """
    column_names = ["url", "name", "headline", "summary", "location", "industry"]

    # one sequence of extracted fields per scraped profile
    extracted_rows = linkedin_data["value"].apply(extract_profile).tolist()

    return pd.DataFrame(extracted_rows, columns=column_names)

# NOTE: the helpers defined above are invoked in the cells that follow (one
# loads the raw data, the next extracts the profiles), so they are not called
# again here; this avoids a second database query on Restart & Run All.

In [None]:
# TODO: create a class for profiles in scraper_models

In [18]:
# load the raw rows for all scraped LinkedIn profiles from the analytics
# database into a dataframe (this cell runs a database query)
linkedin_data = get_linkedin_data()

In [19]:
# parse the JSON in each raw row's `value` column into a dataframe with one
# row per profile (url, name, headline, summary, location, industry)
profiles = extract_all_profiles(linkedin_data)

In [20]:
# preview up to the first 100 extracted profiles as the cell output
profiles.head(100)

Unnamed: 0,url,name,headline,summary,location,industry
0,https://www.linkedin.com/in/louis-cialdella,Louis Cialdella,Senior Data Scientist @ Facebook,I recently joined as a Data Scientist at Faceb...,"New York, New York, United States",Computer Software
1,https://www.linkedin.com/in/suleman-farooqi-2a...,Suleman Farooqi,Data Scientist @ Facebook,Experienced Data analyst with a demonstrated h...,"London Area, United Kingdom",Computer Software
2,https://www.linkedin.com/in/skanajan,Sri K.,Senior Data Scientist/ML Engineer,NOTE: I much prefer a direct email vs. a linke...,"San Francisco, California, United States",Logistics and Supply Chain
3,https://www.linkedin.com/in/stephanie-zhang-a2...,Stephanie Zhang,Data Scientist at Instagram,I am extremely competitive in framing the righ...,"Menlo Park, California, United States",Information Technology and Services
4,https://www.linkedin.com/in/michaelgsuttles,Michael Suttles,Data Scientist and Founder,Strategic thinker. Data scientist that thrives...,"Washington, District of Columbia, United States",Information Technology and Services
5,https://www.linkedin.com/in/tonypaek,Tony Paek,Data Scientist at Instagram,My interests lie in extracting insights from l...,"New York, New York, United States",Computer Software
6,https://www.linkedin.com/in/haisu-ma-4761b5a7,Haisu Ma,Research Data Scientist at Facebook,Seasoned data scientist with a Ph.D. degree an...,Greater Seattle Area,Internet
7,https://www.linkedin.com/in/annabethke,Anna Bethke,Senior Data Scientist,I am a senior data scientist with experience i...,"San Francisco, California, United States",Research
8,https://www.linkedin.com/in/mostafamajidpour,Mostafa Majidpour,Senior Research Data Scientist at Facebook,- 2+ years of Management experience in Data Sc...,"Los Angeles, California, United States",Internet
9,https://www.linkedin.com/in/qilu75,Lu Qi,Data Scientist at Facebook,,"London, England, United Kingdom",Internet
