In [2]:
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import re

# Load and Clean the Data
# @st.cache  # This function will be cached
def load_and_clean_data(csv_path='glassdoor_jobs.csv'):
    """Load the Glassdoor job listings CSV and add the derived analysis columns.

    Parameters
    ----------
    csv_path : str or path-like, default 'glassdoor_jobs.csv'
        Location of the raw CSV. It must contain the columns
        'Salary Estimate', 'Company Name' and 'Job Title'.

    Returns
    -------
    pandas.DataFrame
        The raw frame with 'Job Title' overwritten by its cleaned form and
        these columns added:

        * 'Min Salary Estimate' / 'Max Salary Estimate' -- the "$<n>K" figures
          found in 'Salary Estimate' (thousands of dollars); -1 when the text
          holds no such figure (or more than two), or is not a string.
        * 'Company Name Clean' / 'Company Rating' -- the two lines of
          'Company Name'; the rating is -1.0 when it cannot be parsed.
        * 'Job Category' -- coarse bucket derived from the title, 'Other'
          when no keyword matches.
        * 'Mean Salary Estimate' -- row mean of the min/max columns.
        * 'Mean Salary Estimate MUSD' -- the mean divided by 1000.
    """
    # Keyword -> category, matched case-insensitively as a substring of the
    # cleaned title. Order matters: every matching keyword is applied in turn,
    # so a later entry overrides an earlier one (e.g. "Senior Data Scientist"
    # first matches "Data Scientist" and is then re-labelled by "Senior").
    category_mapping = {
        "Data Scientist": "Data Scientist",
        "Senior": "Senior Data Scientist",
        "Sr": "Senior Data Scientist",
        "Junior": "Junior Data Scientist",
        "Jr": "Junior Data Scientist",
        "Entry Level": "Junior Data Scientist",
        "Principal": "Principal/Lead Data Scientist",
        "Lead": "Principal/Lead Data Scientist",
        "Data Engineer": "Data Engineer",
        "Machine Learning": "Machine Learning Specialist",
        "Manager": "Manager/Director",
        "Director": "Manager/Director",
        "Analyst": "Analyst",
    }

    def extract_salary_min_max(salary_text):
        """Return [min, max] in $K parsed from a salary string, or [-1, -1]."""
        # Missing cells arrive as float NaN; re.findall would raise on them.
        if not isinstance(salary_text, str):
            return [-1, -1]
        salary_match = re.findall(r'\$(\d+)[Kk]', salary_text)
        if len(salary_match) == 1:
            # A single figure (e.g. "$145K") is both the lower and upper bound.
            return [int(salary_match[0]), int(salary_match[0])]
        if len(salary_match) == 2:
            return [int(salary_match[0]), int(salary_match[1])]
        # No "$<n>K" figure (hourly rates, the "-1" placeholder) or an
        # unexpected number of figures.
        return [-1, -1]

    def extract_company_info(company_text):
        """Return [name, rating] from a "<name>\\n<rating>" string."""
        if isinstance(company_text, str):
            split_text = company_text.split('\n')
            if len(split_text) == 2:
                try:
                    return [split_text[0], float(split_text[1])]
                except ValueError:
                    # Second line is not a number: treat the rating as unknown
                    # rather than aborting the whole load.
                    pass
        return [company_text, -1.0]

    # Load
    data = pd.read_csv(csv_path)

    # Clean: Extract Min and Max Salary Estimate
    salary_bounds = [extract_salary_min_max(text) for text in data['Salary Estimate']]
    data['Min Salary Estimate'] = [bounds[0] for bounds in salary_bounds]
    data['Max Salary Estimate'] = [bounds[1] for bounds in salary_bounds]

    # Clean: Extract Company Name and Rating
    company_info = [extract_company_info(text) for text in data['Company Name']]
    data['Company Name Clean'] = [info[0] for info in company_info]
    data['Company Rating'] = [info[1] for info in company_info]

    # Clean: Extract Job Title -- keep the text before the first "(" and trim
    # it. The vectorised .str accessor leaves missing titles as NaN.
    data['Job Title'] = data['Job Title'].str.split('(').str[0].str.strip()

    # Clean: Summarize Similar Job Titles
    data['Job Category'] = 'Other'
    for keyword, category in category_mapping.items():
        # na=False keeps the mask strictly boolean when a title is missing;
        # .loc rejects a mask that contains NaN.
        matches = data['Job Title'].str.contains(keyword, flags=re.IGNORECASE, na=False)
        data.loc[matches, 'Job Category'] = category

    # Rows without a parsable salary keep the -1 sentinel in the mean as well.
    data['Mean Salary Estimate'] = data[['Min Salary Estimate', 'Max Salary Estimate']].mean(axis=1)
    # The estimates are in thousands of dollars, so /1000 yields millions.
    data['Mean Salary Estimate MUSD'] = data['Mean Salary Estimate'] / 1000

    return data

# Load the cleaned job listings; the cells below read this module-level `data` frame.
data = load_and_clean_data()
# Bare last expression so the notebook renders the frame as the cell output.
data

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Min Salary Estimate,Max Salary Estimate,Company Name Clean,Company Rating,Job Category,Mean Salary Estimate,Mean Salary Estimate MUSD
0,Data Scientist,-1,Job Overview\nA Data Scientist at ExploreLearn...,4.2,Cambium Learning Group\n4.3,Remote,1001 to 5000 Employees,2004,Company - Private,Primary & Secondary Schools,Education,$500 million to $1 billion (USD),-1,-1,Cambium Learning Group,4.3,Data Scientist,-1.0,-0.0010
1,2024 University Graduate - Data Scientist,Employer Provided Salary:$83K - $153K,Our Company\n\nChanging the world through digi...,4.4,Adobe\n4.4,"San Jose, CA",10000+ Employees,1982,Company - Public,Computer Hardware Development,Information Technology,$5 to $10 billion (USD),83,153,Adobe,4.4,Data Scientist,118.0,0.1180
2,Data Scientist – Entry Level 2024,Employer Provided Salary:$71K - $133K,Introduction\nRanked by Forbes as one of the w...,3.9,IBM\n3.9,"Atlanta, GA",10000+ Employees,1911,Company - Public,Information Technology Support Services,Information Technology,$10+ billion (USD),71,133,IBM,3.9,Junior Data Scientist,102.0,0.1020
3,Data Scientist 2,Employer Provided Salary:$94K - $183K,The Microsoft 365 team is looking for a Data S...,4.3,Microsoft\n4.3,"Redmond, WA",10000+ Employees,1975,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),94,183,Microsoft,4.3,Data Scientist,138.5,0.1385
4,Entry Level Data Scientist 2023/2024,$48K - $78K (Glassdoor est.),"You may not realize it, but you’ve likely used...",3.9,CPChem\n3.9,"The Woodlands, TX",1001 to 5000 Employees,2000,Company - Private,Chemical Manufacturing,Manufacturing,$10+ billion (USD),48,78,CPChem,3.9,Junior Data Scientist,63.0,0.0630
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,Global Technology Transformation Analytics Vic...,Employer Provided Salary:$128K - $190K,JOB DESCRIPTION\n\nThe Global Technology Trans...,4.0,JPMorgan Chase & Co\n4.0,"New York, NY",10000+ Employees,1799,Company - Public,Banking & Lending,Financial Services,$10+ billion (USD),128,190,JPMorgan Chase & Co,4.0,Other,159.0,0.1590
1496,Senior Data Scientist,Employer Provided Salary:$145K,Job Summary:\nWe are seeking a talented and ex...,4.3,Cydcor\n4.3,"Agoura Hills, CA",51 to 200 Employees,1994,Company - Private,Advertising & Public Relations,Media & Communication,$5 to $25 million (USD),145,145,Cydcor,4.3,Senior Data Scientist,145.0,0.1450
1497,Remote - Data Scientist,Employer Provided Salary:$79K - $120K,"We’re looking for talented professionals, anyw...",3.0,Green Dot Corporation\n3.1,Remote,1001 to 5000 Employees,1999,Company - Public,Financial Transaction Processing,Financial Services,$1 to $5 billion (USD),79,120,Green Dot Corporation,3.1,Data Scientist,99.5,0.0995
1498,Market Data Analyst,$53K - $77K (Glassdoor est.),Summary:\n**PLEASE NOTE - This is NOT a techni...,-1.0,S4 Market Data,"Trenton, NJ",Unknown,-1,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,53,77,S4 Market Data,-1.0,Analyst,65.0,0.0650


In [4]:

# Job-category selection (notebook stand-in for the Streamlit dropdown)

# Count listings per category, most frequent first (value_counts sorts
# descending), and build the "<category> (<count>)" labels shown in the dropdown.
title_counts = (
    data["Job Category"]
    .value_counts()
    .rename_axis('Job Category')
    .reset_index(name='Count')
)
title_counts['Dropdown'] = title_counts['Job Category'] + " (" + title_counts['Count'].astype(str) + ")"

# Dropdown: Select Job Category with Counts.
# In the Streamlit app this value comes from the widget:
#   selected_title_with_count = st.selectbox("Select Job Category", options=title_counts['Dropdown'])
# With the widget disabled the name was never assigned, so this cell failed with
# a NameError on a fresh kernel. Default to the first (most frequent) label.
selected_title_with_count = title_counts['Dropdown'].iloc[0]

# Extract the category from the selection by dropping the trailing " (<count>)".
selected_title = selected_title_with_count.rsplit(" (", 1)[0]

# Filter data based on the selected job category
filtered_data = data[data["Job Category"] == selected_title]

# The scatter plot, pie chart and histogram are built in the next cell.
filtered_data

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Min Salary Estimate,Max Salary Estimate,Company Name Clean,Company Rating,Job Category,Mean Salary Estimate,Mean Salary Estimate MUSD
0,Data Scientist,-1,Job Overview\nA Data Scientist at ExploreLearn...,4.2,Cambium Learning Group\n4.3,Remote,1001 to 5000 Employees,2004,Company - Private,Primary & Secondary Schools,Education,$500 million to $1 billion (USD),-1,-1,Cambium Learning Group,4.3,Data Scientist,-1.0,-0.0010
1,2024 University Graduate - Data Scientist,Employer Provided Salary:$83K - $153K,Our Company\n\nChanging the world through digi...,4.4,Adobe\n4.4,"San Jose, CA",10000+ Employees,1982,Company - Public,Computer Hardware Development,Information Technology,$5 to $10 billion (USD),83,153,Adobe,4.4,Data Scientist,118.0,0.1180
3,Data Scientist 2,Employer Provided Salary:$94K - $183K,The Microsoft 365 team is looking for a Data S...,4.3,Microsoft\n4.3,"Redmond, WA",10000+ Employees,1975,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),94,183,Microsoft,4.3,Data Scientist,138.5,0.1385
5,Data Scientist,Employer Provided Salary:$65.00 - $75.00 Per Hour,The Team:\nAt Client’s Artificial Intelligence...,-1.0,IVID TEK INC,Remote,1 to 50 Employees,-1,Company - Public,-1,-1,Unknown / Non-Applicable,-1,-1,IVID TEK INC,-1.0,Data Scientist,-1.0,-0.0010
7,Data Scientist,Employer Provided Salary:$85K,Details\nDepartment: Workforce Planning and An...,3.4,Ascension\n3.4,Remote,10000+ Employees,1902,Nonprofit Organization,Health Care Services & Hospitals,Healthcare,Unknown / Non-Applicable,85,85,Ascension,3.4,Data Scientist,85.0,0.0850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1489,Data Scientist Advanced Development Program,$85K - $115K (Glassdoor est.),Why join Vanguard’s Data Science Advanced Deve...,3.7,Vanguard\n3.7,"Charlotte, NC",10000+ Employees,1975,Company - Private,Investment & Asset Management,Financial Services,Unknown / Non-Applicable,85,115,Vanguard,3.7,Data Scientist,100.0,0.1000
1491,Data Scientist,Employer Provided Salary:$70.00 - $73.00 Per Hour,Data Scientist.\nTwice a week in Jersey City. ...,-1.0,Verdant Infotech Solutions,"Jersey City, NJ",Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable,-1,-1,Verdant Infotech Solutions,-1.0,Data Scientist,-1.0,-0.0010
1492,Data Scientist,$110K - $151K (Glassdoor est.),Overview:\nThe Data Scientists will take a lea...,3.0,Plymouth Rock Assurance\n3.0,"Boston, MA",1001 to 5000 Employees,1982,Company - Private,Insurance Carriers,Insurance,$5 to $25 million (USD),110,151,Plymouth Rock Assurance,3.0,Data Scientist,130.5,0.1305
1497,Remote - Data Scientist,Employer Provided Salary:$79K - $120K,"We’re looking for talented professionals, anyw...",3.0,Green Dot Corporation\n3.1,Remote,1001 to 5000 Employees,1999,Company - Public,Financial Transaction Processing,Financial Services,$1 to $5 billion (USD),79,120,Green Dot Corporation,3.1,Data Scientist,99.5,0.0995


DeltaGenerator()

In [None]:

# Layout: 3 Columns
# The column containers must exist before the `with` blocks below; leaving this
# line commented out made the cell fail with a NameError on `col1`.
col1, col2, col3 = st.columns([4, 3, 4])

# Listings per category over the full dataset (not the filtered subset);
# this feeds the pie chart.
job_distribution = data['Job Category'].value_counts()

# Column 1: Scatter Plot of min/max salary against company rating for the
# selected category.
with col1:
    fig = px.scatter(filtered_data,
                     x='Company Rating',
                     y=['Min Salary Estimate', 'Max Salary Estimate'])
    fig.update_layout(title='Salary vs. Rating',
                      width=400,
                      height=300)  # Adjusted title and height
    st.plotly_chart(fig)
    st.caption(f'Salary Estimates vs. Company Rating for {selected_title}')  # Additional info as caption

# Column 2: Pie Chart of the share of listings per job category.
with col2:
    fig_jobs = px.pie(job_distribution, values=job_distribution.values, names=job_distribution.index)
    fig_jobs.update_layout(margin=dict(t=20, b=20, l=20, r=20),
                           title="Job Distribution",
                           width=200,
                           height=300)  # Adjusted height

    # Update traces to put text info inside the pie slices and remove the legend
    fig_jobs.update_traces(textposition='inside', textinfo='label+percent')
    fig_jobs.update_layout(showlegend=False)

    st.plotly_chart(fig_jobs)

# Column 3: Overlaid histograms of min and max salary for the selected category.
with col3:
    fig_histogram = go.Figure()
    fig_histogram.add_trace(go.Histogram(x=filtered_data['Min Salary Estimate'], name='Min Salary', opacity=0.7, marker_color='blue'))
    fig_histogram.add_trace(go.Histogram(x=filtered_data['Max Salary Estimate'], name='Max Salary', opacity=0.7, marker_color='orange'))
    fig_histogram.update_layout(barmode='overlay',
                                title="Salary Distribution",
                                width=400,
                                height=300,
                                # The salary columns hold the "$<n>K" figures, i.e. thousands of dollars.
                                xaxis_title="Salary ($K)",
                                yaxis_title="Count"
                                )  # Adjusted title and height
    st.plotly_chart(fig_histogram)
    # filtered_data is selected by `selected_title`; `selected_option` is never
    # assigned anywhere in the live code.
    st.caption(f'Distribution of Min and Max Salary Estimates for {selected_title} Category')  # Additional info as caption


