In [None]:
pip install plotly

In [None]:
pip install seaborn

In [None]:
pip install 'numpy<2.0'

In [None]:
pip install --upgrade plotly

In [None]:
!pip install numpy==1.24.4 --quiet

In [None]:
pip install wordcloud

In [None]:
# Import necessary libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns


In [None]:
# 📦 Import libraries
import pandas as pd
import plotly.express as px

# 📁 Load data
# Read every source table into its own DataFrame, in the order listed.
(
    companies,
    company_industries,
    industries,
    job_skills,
    skills,
    salaries,
    employee_counts,
) = map(
    pd.read_csv,
    (
        "companies.csv",
        "company_industries.csv",
        "industries.csv",
        "job_skills.csv",
        "skills.csv",
        "salaries.csv",
        "employee_counts.csv",
    ),
)

# 🔍 Show the column index of both industry tables before relying on them
for label, frame in (
    ("company_industries columns:", company_industries),
    ("industries columns:", industries),
):
    print(label, frame.columns)

# 1️⃣ Top 10 Industries by Number of Companies
# Counting is done on the 'industry' column of company_industries (a name, not an id).
industry_frequency = company_industries['industry'].value_counts()
top_industries = industry_frequency.head(10).reset_index()
top_industries.columns = ['Industry', 'Company Count']

# Bar height, bar colour and bar text all come from the same count column.
fig1 = px.bar(
    data_frame=top_industries,
    x='Industry',
    y='Company Count',
    color='Company Count',
    text='Company Count',
    title="Top 10 Industries by Number of Companies",
)
fig1.update_layout(xaxis_tickangle=-45).show()


In [None]:

# 2️⃣ Top 10 In-Demand Skills
# Attach the skill lookup table to every job/skill row (left join on the abbreviation),
# then keep the ten most frequent skill names.
skills_df = pd.merge(job_skills, skills, on="skill_abr", how="left")
skill_name_frequency = skills_df['skill_name'].value_counts()
top_skills = skill_name_frequency.head(10).reset_index()
top_skills.columns = ['Skill', 'Job Count']

# One bar per skill; the count drives height, colour and the bar text.
fig2 = px.bar(
    data_frame=top_skills,
    x='Skill',
    y='Job Count',
    color='Job Count',
    text='Job Count',
    title="Top 10 In-Demand Skills in Job Postings",
)
fig2.update_layout(xaxis_tickangle=-45).show()


In [None]:

# 3️⃣ Median Salary by Pay Period
# Keep only salary rows that carry a med_salary value.
has_median = salaries["med_salary"].notna()
salaries_cleaned = salaries[has_median]

# One row per pay period holding the median of med_salary within that period.
median_salary = salaries_cleaned.groupby("pay_period", as_index=False)["med_salary"].median()

fig3 = px.bar(
    data_frame=median_salary,
    x="pay_period",
    y="med_salary",
    color="med_salary",
    text="med_salary",
    title="Median Salary by Pay Period",
)
fig3.update_layout(
    xaxis_title="Pay Period",
    yaxis_title="Median Salary (USD)",
).show()



In [None]:
# 4️⃣ Employee Count vs Follower Count
# Human-readable axis/legend names for the two plotted columns.
axis_labels = {
    "employee_count": "Employee Count",
    "follower_count": "Follower Count",
}

# Marker size follows employee_count, marker colour follows follower_count.
fig4 = px.scatter(
    data_frame=employee_counts,
    x="employee_count",
    y="follower_count",
    size="employee_count",
    color="follower_count",
    labels=axis_labels,
    title="Employee Count vs Follower Count",
)
fig4.show()


In [None]:
# Salary distribution by company size, shown separately for each pay period.
#
# postings.csv links job_id to company_id, `salaries` holds med_salary and
# pay_period per job, and `companies` holds company_size.
postings = pd.read_csv("postings.csv")

# Carry pay_period along with the salary: figures quoted per hour and per year
# are on different scales, so pooling them into one box distorts the distribution.
salary_rows = (
    salaries[['job_id', 'med_salary', 'pay_period']]
    .dropna(subset=['med_salary'])
    .fillna({'pay_period': 'Unspecified'})
)

# job -> company (inner join: postings without a salary row are not needed)
salaries_with_company = postings[['job_id', 'company_id']].merge(
    salary_rows, on='job_id', how='inner'
)

# company -> size (left join, then discard rows that still lack a size or salary)
company_salary_df = salaries_with_company.merge(
    companies[['company_id', 'company_size']],
    on='company_id', how='left'
).dropna(subset=['company_size', 'med_salary'])

# One facet per pay period
fig5 = px.box(company_salary_df, x='company_size', y='med_salary',
              facet_col='pay_period', facet_col_wrap=3,
              title="Salary Distribution by Company Size",
              labels={"company_size": "Company Size",
                      "med_salary": "Median Salary",
                      "pay_period": "Pay Period"})
# Give every facet its own y-range; a shared axis would flatten the small-valued periods.
fig5.update_yaxes(matches=None, showticklabels=True)
fig5.show()


In [None]:
# Count job postings per US state and draw them as a choropleth.
postings_df = pd.read_csv("postings.csv")

location = postings_df['location']

# Rows that either name the United States or end a segment with a two-letter
# upper-case code ("Princeton, NJ"). Dots are escaped so "U.S." is matched
# literally, and na=False already excludes missing locations.
mentions_us = location.str.contains(r'United States|USA|\bUS\b|U\.S\.', na=False)
has_state_code = location.str.contains(r',\s*[A-Z]{2}\b', na=False)

# .copy() so the new column is written to an independent frame, not to a slice
# of postings_df (avoids SettingWithCopyWarning).
us_jobs = postings_df.loc[mentions_us | has_state_code].copy()

# Extract the two-letter code that follows a comma. The trailing \b prevents
# taking the first two letters of a longer token, e.g. "US" out of ", USA".
us_jobs['state'] = us_jobs['location'].str.extract(r',\s*([A-Z]{2})\b', expand=False)

# value_counts() skips rows where no code was found.
state_counts = us_jobs['state'].value_counts().reset_index()
state_counts.columns = ['state', 'job_count']

# Plot US State Choropleth
fig1 = px.choropleth(
    state_counts,
    locations='state',
    locationmode="USA-states",
    color='job_count',
    scope="usa",
    color_continuous_scale="Viridis",
    title="📍 Jobs Distribution by State in the United States"
)
fig1.show()


In [None]:
# Sunburst of job counts by state and city, parsed from the free-text location.
# Relies on `postings_df` loaded by an earlier cell.

# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
geo_df = postings_df.dropna(subset=['location']).copy()

# Parse "City, ST" from the start of the string. The trailing \b keeps the
# pattern from capturing the first two letters of a longer token (", USA" -> "US").
geo_df[['city', 'state']] = geo_df['location'].str.extract(r'^([^,]+),\s*([A-Z]{2})\b')
geo_df['city'] = geo_df['city'].str.strip()

# Only rows that matched the "City, ST" shape can be placed in the hierarchy.
geo_df = geo_df.dropna(subset=['city', 'state'])

# The country is not parsed from the data; every matched row is labelled as US.
# NOTE(review): any non-US location that happens to look like "City, XX" is
# therefore also counted here - confirm against the data.
geo_df['country'] = 'United States'

# Group by location hierarchy
grouped_geo = geo_df.groupby(['country', 'state', 'city']).size().reset_index(name='job_count')

# Sunburst Chart (title no longer claims global coverage: the country is fixed above)
fig2 = px.sunburst(
    grouped_geo,
    path=['country', 'state', 'city'],
    values='job_count',
    title="🌐 US Job Opportunities Sunburst: From Country to City",
    height=600
)
fig2.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Bar chart of the ten skills that appear most often across job postings.
job_skills = pd.read_csv("job_skills.csv")
skills = pd.read_csv("skills.csv")

# Attach the skill lookup table (joined on 'skill_abr').
merged = job_skills.merge(skills, on="skill_abr", how="left")

# Count by readable skill name, which is what the merge is for; fall back to the
# abbreviation for any row the lookup table could not resolve.
skill_label = merged['skill_name'].fillna(merged['skill_abr'])
skill_counts = skill_label.value_counts()
top_skills = skill_counts.head(10)

# Plot the bar chart
plt.figure(figsize=(10, 5))
top_skills.plot(kind="bar", color='skyblue', edgecolor='black')
plt.title("Top 10 In-Demand Skills")
plt.xlabel("Skill")
plt.ylabel("Count")
# Right-align rotated labels so each one ends under its own bar.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Word cloud in which each skill is sized by how many job/skill rows mention it.
job_skills = pd.read_csv("job_skills.csv")
skills = pd.read_csv("skills.csv")

# Merge on skill abbreviation to obtain the readable skill name
merged_skills = job_skills.merge(skills, on='skill_abr', how='left')

# Use the skill name where available, otherwise the abbreviation.
skill_label = merged_skills['skill_name'].fillna(merged_skills['skill_abr'])

# Feed explicit frequencies instead of one long joined string:
#  - value_counts() ignores missing values, whereas ' '.join() raises on NaN;
#  - generate() tokenises free text, drops English stopwords and counts
#    adjacent-word pairs, none of which is meaningful for a list of skill labels;
#  - multi-word names stay intact as a single entry.
skill_frequencies = skill_label.value_counts().to_dict()

# random_state fixes the layout so re-running the cell gives the same picture.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    random_state=42
).generate_from_frequencies(skill_frequencies)

# Plot the Word Cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud of Skills")
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Top 10 skills requested in one chosen state.
job_skills = pd.read_csv("job_skills.csv")
skills = pd.read_csv("skills.csv")
postings = pd.read_csv("postings.csv")

# Two-letter code at the very end of the location string ("City, ST").
# expand=False returns a Series, which is what a single-column assignment needs.
postings['state'] = postings['location'].str.extract(r',\s*([A-Z]{2})$', expand=False)

# posting -> its skills -> skill lookup table
job_post_skills = postings[['job_id', 'state']].merge(job_skills, on='job_id', how='left')
full_data = job_post_skills.merge(skills, on='skill_abr', how='left')

# Label each row with the readable skill name (the reason for the second merge),
# falling back to the abbreviation when the lookup has no entry.
full_data['skill_label'] = full_data['skill_name'].fillna(full_data['skill_abr'])

# Rows without a state or without any skill are skipped by groupby.
skill_counts_by_state = full_data.groupby(['state', 'skill_label']).size()

# nlargest within each state prepends the group key as an extra index level;
# drop that duplicate before flattening to columns state / skill_label / count.
top_skills_by_state = (
    skill_counts_by_state
    .groupby(level=0)
    .nlargest(10)
    .reset_index(level=0, drop=True)
    .reset_index(name='count')
)

# ✅ Choose a specific state to plot
state = 'NC'

# Filter top skills for selected state
top_skills_in_state = top_skills_by_state[top_skills_by_state['state'] == state]

if top_skills_in_state.empty:
    # An unknown or misspelled code would otherwise produce a blank chart with no hint why.
    print(f"No skill data found for state {state!r}")
else:
    plt.figure(figsize=(10, 5))
    plt.bar(top_skills_in_state['skill_label'], top_skills_in_state['count'], color='cornflowerblue')
    plt.title(f"Top 10 Skills in Demand in {state}")
    plt.xlabel("Skills")
    plt.ylabel("Count")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd
import plotly.express as px

# Read the postings table and keep only rows that have a title.
postings = pd.read_csv("postings.csv")
postings = postings[postings["title"].notna()]

# value_counts() is already sorted by frequency, so the first ten entries
# are the ten most common titles.
title_frequency = postings['title'].value_counts()
top_titles = title_frequency.iloc[:10]

# A pie with a hole in the middle, i.e. a donut: one slice per title.
fig = px.pie(
    names=top_titles.index,
    values=top_titles.to_numpy(),
    hole=0.4,
    title="Distribution of Top 10 Job Titles",
)

fig.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Mean of med_salary for every (work type, remote flag) combination, as grouped bars.
postings = pd.read_csv("postings.csv")

# Translate the 0/1 flag into labels. Anything that is missing or not 0/1 is
# kept under its own label: left as NaN it would be silently dropped by the
# groupby below and vanish from the comparison.
postings['remote_allowed'] = (
    postings['remote_allowed']
    .map({0.0: 'Non-Remote', 1.0: 'Remote'})
    .fillna('Not specified')
)

# Rows = work type, columns = remote label, values = mean of med_salary.
# NOTE(review): the mean is taken over all rows regardless of pay period; if
# postings mixes hourly and yearly figures this average is not meaningful -
# confirm and filter to a single period if so.
grouped_salaries = postings.groupby(['formatted_work_type', 'remote_allowed'])['med_salary'].mean().unstack()

# Plot grouped bar chart
grouped_salaries.plot(kind='bar', figsize=(12, 6))
plt.title('Average Median Salary by Work Type and Remote Allowance', fontsize=16)
plt.xlabel('Work Type', fontsize=14)
plt.ylabel('Average Median Salary', fontsize=14)
plt.xticks(rotation=45)
plt.legend(title='Remote Allowed', bbox_to_anchor=(1.05, 1), loc='upper left')  # Legend outside
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Ten states with the highest mean med_salary, as a horizontal bar chart.
postings = pd.read_csv("postings.csv")

# Take the two-letter upper-case code that ends the location ("City, ST").
# A looser pattern such as (\w+)$ would also accept any trailing word - e.g. a
# country in "City, Country" - and rank it among the "states".
postings['state'] = postings['location'].str.extract(r',\s*([A-Z]{2})$', expand=False)

# Drop rows without a salary first so a state with no salary data cannot enter
# the ranking as NaN. The result stays a Series indexed by state.
# NOTE(review): the mean pools every pay period present in postings; confirm
# the figures are on a common basis.
avg_salary_by_state = (
    postings.dropna(subset=['med_salary'])
    .groupby('state')['med_salary']
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

# Plot (ascending order puts the largest bar at the top of a barh chart)
plt.figure(figsize=(10, 6))
avg_salary_by_state.sort_values().plot(kind='barh', cmap='viridis')
plt.title('Top 10 States with the Highest Average Salary')
plt.xlabel('Average Salary')
plt.ylabel('State')
plt.tight_layout()
plt.show()


In [None]:
import plotly.express as px

# Interactive version of the top-10 chart. Relies on `avg_salary_by_state`
# (a Series indexed by state) computed by an earlier cell.

# Sort once and turn the Series into a tidy two-column frame with explicit
# column names, so the figure is built from one table instead of from several
# separately sorted copies of the same Series.
avg_salary_sorted = (
    avg_salary_by_state
    .sort_values()
    .rename('avg_salary')
    .rename_axis('state')
    .reset_index()
)

fig = px.bar(
    avg_salary_sorted,
    x='avg_salary',
    y='state',
    orientation='h',
    color='avg_salary',
    title="Top 10 States with the Highest Average Salary",
    labels={'avg_salary': 'Average Salary', 'state': 'State'}
)
fig.show()


In [None]:
pip install dash