# Physics Teacher Data Analysis

Let's begin by exploring the physics teacher dataset and examining all available columns.

In [54]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("kanchana1990/linkedins-2023-24-us-math-teacher-jobs")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\adamd\.cache\kagglehub\datasets\kanchana1990\linkedins-2023-24-us-math-teacher-jobs\versions\1


In [55]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Display all columns without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [60]:
# Install plotly if not already installed
import subprocess
import sys

try:
    import plotly
    print("plotly is already installed!")
except ImportError:
    print("Installing plotly...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "plotly"])
    import plotly
    print("plotly installed successfully!")

plotly is already installed!


In [61]:
# Drop unwanted columns
print("Before dropping columns:")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# List of columns to drop
columns_to_drop = ['description', 'jobUrl', 'jobURL', 'url', 'URL', 'link']

# Drop columns that exist
dropped_columns = []
for col in columns_to_drop:
    if col in df.columns:
        df = df.drop(col, axis=1)
        dropped_columns.append(col)

if dropped_columns:
    print(f"\nDropped columns: {dropped_columns}")
else:
    print("\nNo target columns found to drop")

print(f"\nAfter dropping columns:")
print(f"Shape: {df.shape}")
print(f"Remaining columns: {list(df.columns)}")

Before dropping columns:
Shape: (769, 12)
Columns: ['title', 'location', 'postedTime', 'publishedAt', 'companyName', 'applicationsCount', 'contractType', 'experienceLevel', 'workType', 'sector', 'salary', 'state']

No target columns found to drop

After dropping columns:
Shape: (769, 12)
Remaining columns: ['title', 'location', 'postedTime', 'publishedAt', 'companyName', 'applicationsCount', 'contractType', 'experienceLevel', 'workType', 'sector', 'salary', 'state']


In [62]:
# Display the first 5 rows of the dataset in table format
print("FIRST 5 ROWS OF THE DATASET:")
print("="*50)

# Display as a nice table
df.head()

FIRST 5 ROWS OF THE DATASET:


Unnamed: 0,title,location,postedTime,publishedAt,companyName,applicationsCount,contractType,experienceLevel,workType,sector,salary,state
0,Math Tutors/Teachers (Grades 6-9) *Urgently Hiring*,"Thompson, CT",2 months ago,2023-11-30,Catapult Learning,Be among the first 25 applicants,Part-time,Entry level,Education and Training,Education Administration Programs,,CT
1,24/25 SY Teacher - Elementary 8th Grade Math,"Glendale, AZ",3 days ago,2024-02-06,Peoria Unified School District,Be among the first 25 applicants,Full-time,Entry level,Education and Training,Primary and Secondary Education,,AZ
2,Math Tutors/Teachers (Grades 6-9) *Urgently Hiring*,"Winchester, CT",1 month ago,2023-12-15,Catapult Learning,Be among the first 25 applicants,Part-time,Entry level,Education and Training,Education Administration Programs,,CT
3,Math Tutors/Teachers (Grades 6-9) *Urgently Hiring*,"Stafford, CT",2 months ago,2023-11-30,Catapult Learning,Be among the first 25 applicants,Part-time,Entry level,Education and Training,Education Administration Programs,,CT
4,5th/6th Grade Math Teacher- iSchool Virtual Academy TCPA (iSVA),"Lewisville, TX",2 months ago,2023-11-17,Responsive Education Solutions,Be among the first 25 applicants,Full-time,Entry level,Education and Training,Education Administration Programs,,TX


In [63]:
# Import plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go

# Create a map showing job counts by state
print("Creating interactive map of job counts by state...")

# First, let's see what location-related columns we have
location_columns = [col for col in df.columns if any(keyword in col.lower() 
                   for keyword in ['state', 'location', 'place', 'city', 'region'])]
print(f"Location-related columns: {location_columns}")

# Extract state from location column (assuming format: "City, ST")
if 'location' in df.columns:
    print("\nSample location values:")
    print(df['location'].head())
    
    # Extract state abbreviation (last 2 characters after comma)
    df['state'] = df['location'].str.split(',').str[-1].str.strip()
    
    # Count jobs by state
    state_counts = df['state'].value_counts().reset_index()
    state_counts.columns = ['state', 'job_count']
    
    print(f"\nFound {len(state_counts)} unique states/regions")
    print("Top 10 states:")
    print(state_counts.head(10))
    
    # Create choropleth map
    fig = px.choropleth(
        state_counts,
        locations='state',
        color='job_count',
        locationmode='USA-states',
        scope='usa',
        color_continuous_scale='Blues',
        title='Physics/Math Teacher Job Postings by State',
        labels={'job_count': 'Number of Jobs', 'state': 'State'}
    )
    
    fig.update_layout(
        title_x=0.5,
        geo=dict(showframe=False, showcoastlines=True),
        height=600
    )
    
    fig.show()
    
else:
    print("No 'location' column found. Available columns:")
    print(list(df.columns))

Creating interactive map of job counts by state...
Location-related columns: ['location', 'state']

Sample location values:
0      Thompson, CT
1      Glendale, AZ
2    Winchester, CT
3      Stafford, CT
4    Lewisville, TX
Name: location, dtype: object

Found 52 unique states/regions
Top 10 states:
           state  job_count
0             TX         81
1             CA         66
2             MA         47
3             NY         46
4             IL         40
5             FL         36
6             VA         33
7             NJ         31
8  United States         30
9             AZ         30

Location-related columns: ['location', 'state']

Sample location values:
0      Thompson, CT
1      Glendale, AZ
2    Winchester, CT
3      Stafford, CT
4    Lewisville, TX
Name: location, dtype: object

Found 52 unique states/regions
Top 10 states:
           state  job_count
0             TX         81
1             CA         66
2             MA         47
3             NY         46


In [65]:
# Create a map showing job counts by city
print("Creating interactive map of job counts by city...")

if 'location' in df.columns:
    # Extract city from location column (assuming format: "City, ST")
    df['city'] = df['location'].str.split(',').str[0].str.strip()
    
    # Count jobs by city
    city_counts = df['city'].value_counts().reset_index()
    city_counts.columns = ['city', 'job_count']
    
    print(f"\nFound {len(city_counts)} unique cities")
    print("Top 15 cities with most job postings:")
    print(city_counts.head(15))
    
    # For a proper city map, we need coordinates. Let's create a simple bar chart first
    # and a scatter map for top cities
    
    # Create bar chart for top 20 cities
    top_cities = city_counts.head(20)
    
    fig_bar = px.bar(
        top_cities,
        x='job_count',
        y='city',
        orientation='h',
        title='Top 20 Cities: Physics/Math Teacher Job Postings',
        labels={'job_count': 'Number of Jobs', 'city': 'City'},
        color='job_count',
        color_continuous_scale='Blues'
    )
    
    fig_bar.update_layout(
        height=600,
        yaxis={'categoryorder':'total ascending'}
    )
    
    fig_bar.show()
    
    # Create a bubble map for top cities (using approximate coordinates)
    # Note: For a real implementation, you'd want to geocode the cities
    top_10_cities = city_counts.head(10)
    
    # Sample coordinates for demonstration (you'd normally geocode these)
    city_coords = {
        'New York': {'lat': 40.7128, 'lon': -74.0060},
        'Los Angeles': {'lat': 34.0522, 'lon': -118.2437},
        'Chicago': {'lat': 41.8781, 'lon': -87.6298},
        'Houston': {'lat': 29.7604, 'lon': -95.3698},
        'Phoenix': {'lat': 33.4484, 'lon': -112.0740},
        'Philadelphia': {'lat': 39.9526, 'lon': -75.1652},
        'San Antonio': {'lat': 29.4241, 'lon': -98.4936},
        'San Diego': {'lat': 32.7157, 'lon': -117.1611},
        'Dallas': {'lat': 32.7767, 'lon': -96.7970},
        'San Jose': {'lat': 37.3382, 'lon': -121.8863},
        'Austin': {'lat': 30.2672, 'lon': -97.7431},
        'Jacksonville': {'lat': 30.3322, 'lon': -81.6557},
        'San Francisco': {'lat': 37.7749, 'lon': -122.4194},
        'Columbus': {'lat': 39.9612, 'lon': -82.9988},
        'Fort Worth': {'lat': 32.7555, 'lon': -97.3308}
    }
    
    # Add coordinates to top cities if available
    top_10_cities['lat'] = top_10_cities['city'].map(lambda x: city_coords.get(x, {}).get('lat'))
    top_10_cities['lon'] = top_10_cities['city'].map(lambda x: city_coords.get(x, {}).get('lon'))
    
    # Filter out cities without coordinates
    cities_with_coords = top_10_cities.dropna(subset=['lat', 'lon'])
    
    if not cities_with_coords.empty:
        fig_map = px.scatter_mapbox(
            cities_with_coords,
            lat='lat',
            lon='lon',
            size='job_count',
            hover_name='city',
            hover_data={'job_count': True, 'lat': False, 'lon': False},
            size_max=50,
            zoom=3,
            title='Physics/Math Teacher Jobs by City (Interactive Map)',
            mapbox_style='open-street-map'
        )
        
        fig_map.update_layout(height=600)
        fig_map.show()
    else:
        print("No coordinate data available for bubble map visualization")
        
else:
    print("No 'location' column found for city analysis")

Creating interactive map of job counts by city...

Found 395 unique cities
Top 15 cities with most job postings:
             city  job_count
0         Chicago         24
1        New York         21
2        Richmond         19
3    Philadelphia         15
4         Houston         15
5        Brooklyn         14
6   San Francisco         12
7      Washington         11
8     Springfield         11
9   United States          9
10        Lansing          7
11     Fort Worth          6
12         Boston          6
13      San Diego          6
14          Salem          6




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

