In [1]:
import pandas as pd
import numpy as np
import json
import os
import datetime

# Cleaning data

## First I scraped listings from target cities (jsons.zip file)

In [32]:
# Directory that holds the scraped listing JSON files read by the cells below.
# NOTE(review): the path is empty here, and os.listdir('') raises
# FileNotFoundError — presumably this should point at the unpacked jsons.zip
# folder; set it before running.
folder_path = ''


In [33]:
# Names of all *.json files located directly inside folder_path.
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(folder_path)))

# One dict per scraped job card is appended here by the parsing loop.
data = []

In [34]:
def _footer_applicants(element):
    """Return the text of a job card's second footer item, or None.

    The value is read from element['jobCardUnion']['jobPostingCard']
    ['footerItems'][1]['text']['text']. None is returned when fewer than two
    footer items exist or when any level of that path is missing or null.
    """
    card = element.get('jobCardUnion', {}).get('jobPostingCard', {})
    footer_items = card.get('footerItems', [])
    if not isinstance(footer_items, list) or len(footer_items) < 2:
        return None
    # 'text' can be absent or null; without this guard the nested lookup
    # would call .get on None and raise AttributeError.
    text_block = footer_items[1].get('text') or {}
    if not isinstance(text_block, dict):
        return None
    return text_block.get('text')


for file in json_files:
    # Parse the file and release the handle before processing its content.
    with open(os.path.join(folder_path, file), 'r', encoding='utf-8') as json_file:
        json_data = json.load(json_file)

    # The code expects file names shaped like "<City>, <State>_<suffix>.json".
    file_name = file.split('.json')[0]
    city = file_name.split(', ')[0]
    state = file_name.split(', ')[-1].split('_')[0]

    for element in json_data['elements']:
        job_id = element['jobCardUnion']['jobPostingCard']['preDashNormalizedJobPostingUrn']
        data.append({
            'City': city,
            'State': state,
            'job_id': job_id,
            'applicants': _footer_applicants(element)})
            


In [95]:
# One row per parsed job card, with columns City, State, job_id and applicants.
df_applicants = pd.DataFrame(data)

## Initially my df had 184710 rows, 48083 of them without applicants data

In [96]:
df_applicants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184710 entries, 0 to 184709
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   City        184710 non-null  object
 1   State       184710 non-null  object
 2   job_id      184710 non-null  object
 3   applicants  136627 non-null  object
dtypes: object(4)
memory usage: 5.6+ MB


## After that I dropped duplicates by job_id

In [97]:
# Keep the first row seen for every job_id and renumber the index from 0.
df_applicants = df_applicants.drop_duplicates(subset='job_id', ignore_index=True)

In [98]:
df_applicants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161535 entries, 0 to 161534
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   City        161535 non-null  object
 1   State       161535 non-null  object
 2   job_id      161535 non-null  object
 3   applicants  121144 non-null  object
dtypes: object(4)
memory usage: 4.9+ MB


## And noticed 'Easy Apply' in unique values in applicants column

In [80]:
df_applicants['applicants'].unique()

array(['0 applicants', '1 applicant', '3 applicants', '13 applicants',
       '4 applicants', None, '8 applicants', '10 applicants',
       '2 applicants', 'Easy Apply', '6 applicants', '17 applicants',
       '14 applicants', '16 applicants', '11 applicants', '5 applicants',
       '18 applicants', '9 applicants', '23 applicants', '7 applicants',
       '22 applicants', '19 applicants', '20 applicants', '12 applicants',
       '24 applicants', '21 applicants', '15 applicants'], dtype=object)

## So I replaced 'Easy Apply' with None values; after that, 58549 jobs had no applicants data

In [99]:
# 'Easy Apply' is not an applicant count, so it is stored as a missing value.
df_applicants['applicants'] = df_applicants['applicants'].map(
    lambda label: label if label != 'Easy Apply' else None
)

In [100]:
df_applicants.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161535 entries, 0 to 161534
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   City        161535 non-null  object
 1   State       161535 non-null  object
 2   job_id      161535 non-null  object
 3   applicants  102986 non-null  object
dtypes: object(4)
memory usage: 4.9+ MB


Let's turn the column to float format

In [101]:
# Labels look like "<count> applicant(s)": take the token before the first
# space and convert it to a number in one vectorized pass. Missing labels
# become NaN, and astype(float) guarantees a float64 column either way.
df_applicants['applicants'] = pd.to_numeric(
    df_applicants['applicants'].str.split(' ').str[0]
).astype(float)

In [84]:
df_applicants

Unnamed: 0,City,State,job_id,applicants
0,Laval,Canada,urn:li:fs_normalized_jobPosting:3956077399,0.0
1,Laval,Canada,urn:li:fs_normalized_jobPosting:3956564623,0.0
2,Laval,Canada,urn:li:fs_normalized_jobPosting:3956077398,0.0
3,Laval,Canada,urn:li:fs_normalized_jobPosting:3953305174,0.0
4,Laval,Canada,urn:li:fs_normalized_jobPosting:3956147414,1.0
...,...,...,...,...
161530,Alaska,US States,urn:li:fs_normalized_jobPosting:3955084066,0.0
161531,Alaska,US States,urn:li:fs_normalized_jobPosting:3956565876,0.0
161532,Alaska,US States,urn:li:fs_normalized_jobPosting:3953777633,0.0
161533,Alaska,US States,urn:li:fs_normalized_jobPosting:3948599619,1.0


## While scraping listings I made a mistake and scraped data for US cities twice

It happened because, in the previous 2023 project, on the Global Cities tab, the country written for US cities is 'United States'. However, on the US Cities tab, each city has its own state name instead of 'United States', and I wasn't careful enough to double check this when I sent the data to scrape.

In [85]:
# In this example you can see that there are two different 'states' for the city of Los Angeles
df_applicants[df_applicants.City == 'Los Angeles'].State.unique()

array(['California', 'United States'], dtype=object)

In [65]:
# One
df_applicants[(df_applicants.City == 'Los Angeles') & (df_applicants.State =='California') ]

Unnamed: 0,City,State,job_id,applicants
200,Los Angeles,California,urn:li:fs_normalized_jobPosting:3868378172,
201,Los Angeles,California,urn:li:fs_normalized_jobPosting:3953335231,
202,Los Angeles,California,urn:li:fs_normalized_jobPosting:3812109204,
203,Los Angeles,California,urn:li:fs_normalized_jobPosting:3918613701,
204,Los Angeles,California,urn:li:fs_normalized_jobPosting:3911369653,
...,...,...,...,...
158102,Los Angeles,California,urn:li:fs_normalized_jobPosting:3954440299,10.0
158103,Los Angeles,California,urn:li:fs_normalized_jobPosting:3868303988,15.0
158104,Los Angeles,California,urn:li:fs_normalized_jobPosting:3920501831,
158105,Los Angeles,California,urn:li:fs_normalized_jobPosting:3910044791,17.0


In [66]:
# Two 
df_applicants[(df_applicants.City == 'Los Angeles') & (df_applicants.State =='United States') ]

Unnamed: 0,City,State,job_id,applicants
2424,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3931155920,
2425,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3904950551,
2426,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3954763860,15.0
2427,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3932049072,
2428,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3891228287,
...,...,...,...,...
149608,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3951553987,
149609,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3951512362,
149610,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3954469197,20.0
149611,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3951891551,8.0


In the further analysis for US cities I only used data where State =='United States' with one exception for Boston

In [86]:
df_applicants.State.unique()

array(['Canada', 'Pennsylvania', 'US States', 'Australia', 'California',
       'United Kingdom', 'Germany', 'Turkey', 'Argentina', 'Washington',
       'Colorado', 'China', 'France', 'Massachusetts', 'Tennessee',
       'Texas', 'Ohio', 'Brazil', 'Belgium', 'Thailand', 'United States',
       'Arizona', 'Indiana', 'Denmark', 'Florida', 'Spain',
       'North Carolina', 'Illinois', 'South Africa', 'Austria', 'Poland',
       'Greece', 'Italy', 'United Arab Emirates', 'Hungary', 'Taiwan',
       'Malaysia', 'New York', 'Japan', 'Portugal', 'Ireland',
       'Singapore', 'Switzerland', 'Norway', 'South Korea',
       'Czech Republic', 'Qatar', 'Mexico', 'Luxemborg', 'Israel',
       'Sweden', 'Finland', 'Netherlands', 'New Zealand'], dtype=object)

In [102]:
# 'State' labels that mark the duplicate scrape of US cities (state name
# instead of 'United States'). Massachusetts is not listed here.
us_states_list = [
    'Texas',
    'North Carolina',
    'Illinois',
    'Ohio',
    'Colorado',
    'Florida',
    'California',
    'Tennessee',
    'New York',
    'Pennsylvania',
    'Arizona',
    'Washington',
    'Indiana',  # just added 'Indiana' now to this list, missed it in the working notebook
]

In [103]:
# Remove rows whose State is one of the US state names in us_states_list.
# .copy() makes the result an independent frame, so the later .loc assignment
# on df_applicants does not act on a slice of the unfiltered frame.
df_applicants = df_applicants[~df_applicants.State.isin(us_states_list)].copy()

In [89]:
df_applicants.State.unique()

array(['Canada', 'US States', 'Australia', 'United Kingdom', 'Germany',
       'Turkey', 'Argentina', 'China', 'France', 'Massachusetts',
       'Brazil', 'Belgium', 'Thailand', 'United States', 'Indiana',
       'Denmark', 'Spain', 'South Africa', 'Austria', 'Poland', 'Greece',
       'Italy', 'United Arab Emirates', 'Hungary', 'Taiwan', 'Malaysia',
       'Japan', 'Portugal', 'Ireland', 'Singapore', 'Switzerland',
       'Norway', 'South Korea', 'Czech Republic', 'Qatar', 'Mexico',
       'Luxemborg', 'Israel', 'Sweden', 'Finland', 'Netherlands',
       'New Zealand'], dtype=object)

In [104]:
df_applicants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147733 entries, 0 to 161534
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   City        147733 non-null  object 
 1   State       147733 non-null  object 
 2   job_id      147733 non-null  object 
 3   applicants  96032 non-null   float64
dtypes: float64(1), object(3)
memory usage: 5.6+ MB


## After that I enriched the data with additionally scraped applicant counts for individual job ids (json_jobs.zip file) and filtered the jobs which were posted more than a week ago

In [105]:
# Folder with the per-job JSON files. Set the JSONS_JOBS_DIR environment
# variable to use another location; without it the original local path is used.
folder_path = os.environ.get(
    'JSONS_JOBS_DIR',
    '/Users/anna.zemit/Desktop/Programming/2024/22. The Most Competitive Job Markets in The World in 2024/jsons jobs',
)


In [106]:
# Names of all *.json files located directly inside folder_path.
json_files = list(filter(lambda name: name.endswith('.json'), os.listdir(folder_path)))

In [107]:
# The reference date and the cut-off derived from it never change, so they
# are computed once instead of on every iteration.
reference_date = datetime.datetime(2024, 6, 25)
one_week_ago = reference_date - datetime.timedelta(days=7)

for file in json_files:
    # Parse the file and release the handle before touching the DataFrame.
    with open(os.path.join(folder_path, file), 'r') as json_file:
        json_data = json.load(json_file)

    # Presumably the file name is the job URN with ':' written as '-';
    # the replace turns it back into the form used in df_applicants['job_id'].
    job_id = file.split('.json')[0].replace('-', ':')
    applicants = json_data.get('applies', None)
    originalListedAt = json_data.get('originalListedAt', None)

    try:
        # originalListedAt is in milliseconds, fromtimestamp() takes seconds.
        # The result is a naive datetime in the machine's local time zone.
        date_time = datetime.datetime.fromtimestamp(originalListedAt / 1000)
    except (TypeError, ValueError, OverflowError, OSError):
        # Missing or unusable timestamp: report the applicants value (as the
        # original cell did) and move on, without hiding unrelated errors.
        print(applicants)
        continue

    # NOTE(review): this overwrites 'applicants' only for jobs listed MORE than
    # a week before the reference date. The surrounding notes read as if older
    # postings were meant to be excluded — confirm the comparison direction.
    if date_time < one_week_ago:
        df_applicants.loc[df_applicants['job_id'] == job_id, 'applicants'] = applicants

As you can see I added 9.5k jobs with applicants data

In [94]:
df_applicants.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148385 entries, 0 to 161534
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   City        148385 non-null  object 
 1   State       148385 non-null  object 
 2   job_id      148385 non-null  object 
 3   applicants  105543 non-null  float64
dtypes: float64(1), object(3)
memory usage: 5.7+ MB


But even after filtering jobs posted more than a week ago there are still plenty of jobs with thousands of applicants

In [108]:
df_applicants.sort_values(by='applicants', ascending=False).head(50)

Unnamed: 0,City,State,job_id,applicants
28645,Dubai,United Arab Emirates,urn:li:fs_normalized_jobPosting:3853085139,6186.0
20971,Luxemborg,Luxemborg,urn:li:fs_normalized_jobPosting:3899088702,5898.0
8706,Brussels,Belgium,urn:li:fs_normalized_jobPosting:3665717041,5887.0
61844,Kuala Lumpur,Malaysia,urn:li:fs_normalized_jobPosting:3444758544,3941.0
18199,Dubai,United Arab Emirates,urn:li:fs_normalized_jobPosting:3829884793,3806.0
49831,New York City,United States,urn:li:fs_normalized_jobPosting:3915915799,3096.0
5961,Colorado,US States,urn:li:fs_normalized_jobPosting:3820651634,2979.0
29261,Doha,Qatar,urn:li:fs_normalized_jobPosting:3773675725,2963.0
16458,Johannesburg,South Africa,urn:li:fs_normalized_jobPosting:3894851602,2928.0
2427,Los Angeles,United States,urn:li:fs_normalized_jobPosting:3932049072,2870.0


## After that I dropped rows with missing 'applicants' values and grouped data by city

In [111]:
# Per (City, State): number of jobs with a known applicant count and the
# total number of applicants across those jobs.
applicants_grouped = (
    df_applicants
    .dropna(subset=['applicants'])
    .groupby(['City', 'State'])
    .agg(
        job_id_count=('job_id', 'count'),
        applicants_sum=('applicants', 'sum'),
    )
    .reset_index()
)

In [112]:
applicants_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 172 entries, 0 to 171
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   City            172 non-null    object 
 1   State           172 non-null    object 
 2   job_id_count    172 non-null    int64  
 3   applicants_sum  172 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 5.5+ KB


## And did analysis

In [113]:
# Average = total applicants divided by the number of jobs in each group.
applicants_grouped['Average Applicants per Job 2024'] = (
    applicants_grouped['applicants_sum'].div(applicants_grouped['job_id_count'])
)

In [115]:
# Give the two aggregate columns their report names; City, State and the
# average column already carry their final names.
applicants_grouped = applicants_grouped.rename(columns={
    'job_id_count': 'Number of Jobs 2024',
    'applicants_sum': 'Number of Applicants 2024',
})

In [116]:
applicants_grouped

Unnamed: 0,City,State,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
0,Abu Dhabi,United Arab Emirates,271,22768.0,84.014760
1,Adelaide,Australia,767,3879.0,5.057366
2,Alabama,US States,866,11617.0,13.414550
3,Alaska,US States,869,3403.0,3.915995
4,Amsterdam,Netherlands,692,8432.0,12.184971
...,...,...,...,...,...
167,Winnipeg,Canada,684,2337.0,3.416667
168,Wisconsin,US States,746,10323.0,13.837802
169,Wollongong,Australia,288,375.0,1.302083
170,Wyoming,US States,731,2663.0,3.642955


# Finalise the data into tabs

## Global Comp

In [117]:
# Load last year's research workbook. With sheet_name=None read_excel returns
# a dict of DataFrames keyed by sheet name rather than a single frame.
research_2023 = pd.read_excel('Copy of Research_The Most Competitive Job Markets in the World.xlsx', 
                              sheet_name=None)

In [118]:
# Work on a copy: the following cells rename and add columns in place, which
# would otherwise modify the frame stored in research_2023 and make this
# section fail when it is re-run from here.
Global_Cities = research_2023['Global Cities'].copy()

In [121]:
# Assign working column names by position.
# NOTE(review): assumes the sheet has exactly these four columns in this
# order — a different layout raises a length mismatch or mislabels data.
Global_Cities.columns = ['location', 'Number of Jobs', 'Number of Applicants',
       'Average Applicants per Job 2023']

In [122]:
# Split 'location' on ', ' once: the first piece becomes City and the last
# piece becomes State (both are the same when there is no separator).
# The vectorized .str accessor leaves missing locations as NaN.
location_parts = Global_Cities['location'].str.split(', ')
Global_Cities['City'] = location_parts.str[0]
Global_Cities['State'] = location_parts.str[-1]

In [124]:
# Replace non-breaking spaces with regular spaces in both columns, and
# hyphens with spaces in City. regex=False keeps the patterns literal.
Global_Cities['State'] = Global_Cities['State'].str.replace('\xa0', ' ', regex=False)
Global_Cities['City'] = (
    Global_Cities['City']
    .str.replace('\xa0', ' ', regex=False)
    .str.replace('-', ' ', regex=False)
)

In [126]:
# Tag the job and applicant counts with their year; other names stay as is.
Global_Cities = Global_Cities.rename(columns={
    'Number of Jobs': 'Number of Jobs 2023',
    'Number of Applicants': 'Number of Applicants 2023',
})

In [128]:
# Drop 'location' and put the key columns first.
Global_Cities = Global_Cities.loc[:, [
    'City',
    'State',
    'Number of Jobs 2023',
    'Number of Applicants 2023',
    'Average Applicants per Job 2023',
]]

In [129]:
Global_Cities

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023
0,Doha,Qatar,263,104939,399.007605
1,Dubai,United Arab Emirates,344,97316,282.895349
2,Istanbul,Turkey,336,56551,168.306548
3,Johannesburg,South Africa,325,52065,160.200000
4,Abu Dhabi,United Arab Emirates,329,48942,148.759878
...,...,...,...,...,...
79,Liverpool,United Kingdom,205,2023,9.868293
80,Bristol,United Kingdom,189,1569,8.301587
81,Seattle,United States,198,1476,7.454545
82,Sheffield,United Kingdom,208,1534,7.375000


In [130]:
# Attach the 2024 figures to every 2023 city; cities without a 2024 match
# keep NaN in the 2024 columns (left join).
Global_Comp_top = pd.merge(
    Global_Cities,
    applicants_grouped,
    how='left',
    on=['City', 'State'],
)

In [131]:
# Ten cities with the highest 2024 average applicants per job.
Global_Comp_top = (
    Global_Comp_top
    .sort_values(by='Average Applicants per Job 2024', ascending=False)
    .iloc[:10]
)
Global_Comp_top

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
1,Dubai,United Arab Emirates,344,97316,282.895349,128,36507.0,285.210938
6,San Jose,United States,299,32163,107.568562,326,50129.0,153.769939
4,Abu Dhabi,United Arab Emirates,329,48942,148.759878,271,22768.0,84.01476
21,Dallas,United States,263,12832,48.790875,297,24594.0,82.808081
13,Toronto,Canada,344,22535,65.508721,305,24951.0,81.806557
34,Los Angeles,United States,167,6187,37.047904,329,25251.0,76.75076
8,San Francisco,United States,317,29141,91.927445,374,27134.0,72.550802
10,New York City,United States,277,22274,80.411552,308,20614.0,66.928571
48,Houston,United States,228,5307,23.276316,339,21184.0,62.489676
19,Chicago,United States,322,15964,49.57764,316,17559.0,55.566456


In [133]:
# Ten cities with the lowest 2024 average applicants per job, built from the
# same left join of the 2023 cities with the 2024 aggregates.
Global_Comp_bottom = (
    pd.merge(Global_Cities, applicants_grouped, how='left', on=['City', 'State'])
    .sort_values(by='Average Applicants per Job 2024', ascending=True)
    .iloc[:10]
)
Global_Comp_bottom

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
51,Beijing,China,136,2901,21.330882,531,428.0,0.806026
27,Shanghai,China,207,8540,41.256039,618,1946.0,3.148867
61,Leicester,United Kingdom,300,5599,18.663333,450,1637.0,3.637778
78,Cardiff,United Kingdom,212,2265,10.683962,550,2104.0,3.825455
80,Bristol,United Kingdom,189,1569,8.301587,894,3972.0,4.442953
83,Bern,Switzerland,157,595,3.789809,928,4405.0,4.746767
82,Sheffield,United Kingdom,208,1534,7.375,845,4626.0,5.474556
71,Hamburg,Germany,210,2898,13.8,881,5319.0,6.037457
55,Tokyo,Japan,194,3820,19.690722,749,4924.0,6.574099
56,London,United Kingdom,293,5768,19.686007,772,5669.0,7.343264


## Global Increase

In [135]:
# 2023 cities joined with their 2024 aggregates (left join on City and State).
Global_Increase = pd.merge(
    Global_Cities,
    applicants_grouped,
    how='left',
    on=['City', 'State'],
)

In [136]:
# Relative change of the average from 2023 to 2024, stored as a fraction
# (1.0 means +100%); it is not multiplied by 100.
Global_Increase['% Change in Average Applicants'] = (
    Global_Increase['Average Applicants per Job 2024']
    .sub(Global_Increase['Average Applicants per Job 2023'])
    .div(Global_Increase['Average Applicants per Job 2023'])
)

# Largest increase first.
Global_Increase = Global_Increase.sort_values(
    by='% Change in Average Applicants',
    ascending=False,
)

Global_Increase

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024,% Change in Average Applicants
81,Seattle,United States,198,1476,7.454545,348,14235.0,40.905172,4.487279
76,Luxemborg,Luxemborg,270,2987,11.062963,635,24544.0,38.651969,2.493817
48,Houston,United States,228,5307,23.276316,339,21184.0,62.489676,1.684689
75,San Diego,United States,181,2026,11.193370,415,10059.0,24.238554,1.165438
34,Los Angeles,United States,167,6187,37.047904,329,25251.0,76.750760,1.071663
...,...,...,...,...,...,...,...,...,...
12,Bangkok,Thailand,326,21839,66.990798,758,8021.0,10.581794,-0.842041
20,Sao Paulo,Brazil,336,16394,48.791667,598,4399.0,7.356187,-0.849233
27,Shanghai,China,207,8540,41.256039,618,1946.0,3.148867,-0.923675
0,Doha,Qatar,263,104939,399.007605,525,12603.0,24.005714,-0.939836


## US Comp

In [137]:
# Work on a copy: the following cells rename and add columns in place, which
# would otherwise modify the frame stored in research_2023 and make this
# section fail when it is re-run from here.
US_cities = research_2023['US cities'].copy()

In [138]:
# Assign working column names by position (the sheet is expected to have
# exactly these four columns).
US_cities.columns = ['location', 'Number of Jobs 2023', 'Number of Applicants 2023',
       'Average Applicants per Job 2023']

# Split 'location' on ', ' once: first piece -> City, last piece -> State.
# The vectorized .str accessor leaves missing locations as NaN.
location_parts = US_cities['location'].str.split(', ')
US_cities['City'] = location_parts.str[0]
US_cities['State'] = location_parts.str[-1]

In [140]:
# Overwrite State with the constant 'United States' for every row, so the
# merge keys line up with the 2024 rows that carry that label.
US_cities['State'] = 'United States'

In [142]:
# Keep the key and 2023 columns. .copy() makes the selection an independent
# frame, so the following .loc assignment on US_cities does not trigger
# SettingWithCopyWarning.
US_cities = US_cities[['City', 'State', 'Number of Jobs 2023', 
               'Number of Applicants 2023','Average Applicants per Job 2023',]].copy()

In [144]:
# Boston is the one exception: it is matched under State 'Massachusetts'
# instead of 'United States'.
US_cities.loc[US_cities['City'] == 'Boston', 'State'] =  'Massachusetts'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [145]:
US_cities

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023
0,San Jose,United States,299.0,32163.0,107.568562
1,San Francisco,United States,317.0,29141.0,91.927445
2,New York City,United States,277.0,22274.0,80.411552
3,Chicago,United States,322.0,15964.0,49.57764
4,Dallas,United States,263.0,12832.0,48.790875
5,Austin,United States,259.0,12199.0,47.100386
6,Los Angeles,United States,167.0,6187.0,37.047904
7,Fort Worth,United States,335.0,11400.0,34.029851
8,Phoenix,United States,160.0,5248.0,32.8
9,Philadelphia,United States,277.0,8028.0,28.981949


In [146]:
# 2023 US cities joined with their 2024 aggregates (left join on City and State).
US_Comp = pd.merge(
    US_cities,
    applicants_grouped,
    how='left',
    on=['City', 'State'],
)

In [147]:
# Ten US cities with the highest 2024 average applicants per job.
US_Comp_top = (
    US_Comp
    .sort_values(by='Average Applicants per Job 2024', ascending=False)
    .iloc[:10]
)

US_Comp_top

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
0,San Jose,United States,299.0,32163.0,107.568562,326,50129.0,153.769939
4,Dallas,United States,263.0,12832.0,48.790875,297,24594.0,82.808081
6,Los Angeles,United States,167.0,6187.0,37.047904,329,25251.0,76.75076
1,San Francisco,United States,317.0,29141.0,91.927445,374,27134.0,72.550802
2,New York City,United States,277.0,22274.0,80.411552,308,20614.0,66.928571
12,Houston,United States,228.0,5307.0,23.276316,339,21184.0,62.489676
3,Chicago,United States,322.0,15964.0,49.57764,316,17559.0,55.566456
7,Fort Worth,United States,335.0,11400.0,34.029851,349,15948.0,45.696275
20,Boston,Massachusetts,,,,332,13702.0,41.271084
5,Austin,United States,259.0,12199.0,47.100386,357,14676.0,41.109244


In [148]:
# Ten US cities with the lowest 2024 average applicants per job.
US_Comp_bottom = (
    US_Comp
    .sort_values(by='Average Applicants per Job 2024', ascending=True)
    .iloc[:10]
)
US_Comp_bottom

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
14,Jacksonville,United States,127.0,2366.0,18.629921,544,5877.0,10.803309
16,San Antonio,United States,193.0,3300.0,17.098446,491,6803.0,13.855397
13,Columbus,United States,177.0,3358.0,18.971751,481,6857.0,14.255717
15,Nashville,United States,225.0,3965.0,17.622222,422,6666.0,15.796209
17,Indianapolis,United States,208.0,3270.0,15.721154,503,8415.0,16.729622
18,San Diego,United States,181.0,2026.0,11.19337,415,10059.0,24.238554
11,Denver,United States,185.0,4502.0,24.335135,374,9577.0,25.606952
10,Charlotte,United States,261.0,6530.0,25.019157,414,11489.0,27.751208
8,Phoenix,United States,160.0,5248.0,32.8,371,10895.0,29.366577
9,Philadelphia,United States,277.0,8028.0,28.981949,370,13226.0,35.745946


## US Increase

In [149]:
# 2023 US cities joined with their 2024 aggregates.
US_Increase = pd.merge(US_cities, applicants_grouped, how='left', on=['City', 'State'])

# Relative change of the average from 2023 to 2024, stored as a fraction
# (1.0 means +100%).
US_Increase['% Change in Average Applicants'] = (
    US_Increase['Average Applicants per Job 2024']
    .sub(US_Increase['Average Applicants per Job 2023'])
    .div(US_Increase['Average Applicants per Job 2023'])
)

# Largest increase first.
US_Increase = US_Increase.sort_values(
    by='% Change in Average Applicants',
    ascending=False,
)

US_Increase

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024,% Change in Average Applicants
19,Seattle,United States,198.0,1476.0,7.454545,348,14235.0,40.905172,4.487279
12,Houston,United States,228.0,5307.0,23.276316,339,21184.0,62.489676,1.684689
18,San Diego,United States,181.0,2026.0,11.19337,415,10059.0,24.238554,1.165438
6,Los Angeles,United States,167.0,6187.0,37.047904,329,25251.0,76.75076,1.071663
4,Dallas,United States,263.0,12832.0,48.790875,297,24594.0,82.808081,0.697204
0,San Jose,United States,299.0,32163.0,107.568562,326,50129.0,153.769939,0.429506
7,Fort Worth,United States,335.0,11400.0,34.029851,349,15948.0,45.696275,0.342829
9,Philadelphia,United States,277.0,8028.0,28.981949,370,13226.0,35.745946,0.233387
3,Chicago,United States,322.0,15964.0,49.57764,316,17559.0,55.566456,0.120797
10,Charlotte,United States,261.0,6530.0,25.019157,414,11489.0,27.751208,0.109198


## States Comp

In [162]:
# Work on a copy so that adding columns below does not modify the frame
# stored in research_2023.
us_states = research_2023['US States'].copy()

# The text before the first ', ' in 'US State' becomes the merge key 'City'.
# The vectorized .str accessor leaves missing values as NaN.
us_states['City'] = us_states['US State'].str.split(', ').str[0]

In [163]:
# Every row gets State 'US States', the label the state-level 2024 rows
# carry in applicants_grouped.
us_states['State'] = 'US States'

In [165]:
# Keep the merge keys and the 2023 figures, keys first.
us_states = us_states.loc[:, [
    'City',
    'State',
    'Number of Jobs 2023',
    'Number of Applicants 2023',
    'Average Applicants per Job 2023',
]]

In [166]:
us_states

Unnamed: 0,City,State,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023
0,Illinois,US States,167,10118,60.586826
1,California,US States,275,13184,47.941818
2,Massachusetts,US States,106,5038,47.528302
3,Florida,US States,147,6312,42.938776
4,Connecticut,US States,173,4925,28.468208
5,Michigan,US States,127,3374,26.566929
6,Arizona,US States,191,5053,26.455497
7,Kentucky,US States,127,3170,24.96063
8,Arkansas,US States,126,3022,23.984127
9,Kansas,US States,114,2683,23.535088


In [167]:
# Join the 2023 state figures with the 2024 aggregates, relabel the key
# columns for the report (City -> State, State -> Category) and order the
# states by their 2024 average, highest first.
States_Comp = (
    us_states
    .merge(applicants_grouped, on=['City', 'State'], how='left')
    .rename(columns={'City': 'State', 'State': 'Category'})
    .sort_values(by='Average Applicants per Job 2024', ascending=False)
)

States_Comp

Unnamed: 0,State,Category,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024
26,Washington,US States,162,1286,7.938272,460,18859.0,40.997826
15,Colorado,US States,150,2468,16.453333,461,12961.0,28.114967
2,Massachusetts,US States,106,5038,47.528302,419,11653.0,27.811456
12,New York,US States,160,3244,20.275,355,9050.0,25.492958
33,New Jersey,US States,212,1179,5.561321,495,12560.0,25.373737
6,Arizona,US States,191,5053,26.455497,454,11214.0,24.700441
3,Florida,US States,147,6312,42.938776,484,11322.0,23.392562
10,Georgia,US States,172,3793,22.052326,549,12530.0,22.823315
0,Illinois,US States,167,10118,60.586826,482,10949.0,22.715768
38,Pennsylvania,US States,159,618,3.886792,495,10364.0,20.937374


## States Increase

In [168]:
# Start from an independent copy of the state comparison table.
States_Increase = States_Comp.copy()

# Relative change of the average from 2023 to 2024, stored as a fraction
# (1.0 means +100%).
States_Increase['% Change in Average Applicants'] = (
    States_Increase['Average Applicants per Job 2024']
    .sub(States_Increase['Average Applicants per Job 2023'])
    .div(States_Increase['Average Applicants per Job 2023'])
)

# Largest increase first.
States_Increase = States_Increase.sort_values(
    by='% Change in Average Applicants',
    ascending=False,
)

States_Increase

Unnamed: 0,State,Category,Number of Jobs 2023,Number of Applicants 2023,Average Applicants per Job 2023,Number of Jobs 2024,Number of Applicants 2024,Average Applicants per Job 2024,% Change in Average Applicants
49,Wisconsin,US States,258,438,1.697674,746,10323.0,13.837802,7.151034
38,Pennsylvania,US States,159,618,3.886792,495,10364.0,20.937374,4.3868
43,Missouri,US States,145,458,3.158621,783,13047.0,16.662835,4.275352
26,Washington,US States,162,1286,7.938272,460,18859.0,40.997826,4.164578
40,Ohio,US States,116,415,3.577586,603,10360.0,17.180763,3.802334
33,New Jersey,US States,212,1179,5.561321,495,12560.0,25.373737,3.562538
45,New Mexico,US States,180,474,2.633333,958,9667.0,10.090814,2.831955
34,Tennessee,US States,154,829,5.383117,539,11021.0,20.447124,2.79838
36,Virginia,US States,177,810,4.576271,757,12942.0,17.096433,2.735887
39,South Carolina,US States,122,456,3.737705,761,10374.0,13.632063,2.647175


# Create an Excel file

In [169]:
# Write every result table into one workbook. The context manager saves and
# closes the file when the block exits (also on errors). It replaces the
# explicit writer.save() call, which was removed in pandas 2.0.
with pd.ExcelWriter('The most competitive job markets in the world in 2024.xlsx') as writer:
    # 'Global Comp': top 10 on the left, bottom 10 to its right, two empty
    # columns apart.
    Global_Comp_top.to_excel(writer, sheet_name='Global Comp', startrow=0, startcol=0, index=False)
    Global_Comp_bottom.to_excel(writer, sheet_name='Global Comp', startrow=0,
                                startcol=len(Global_Comp_top.columns) + 2, index=False)

    Global_Increase.to_excel(writer, sheet_name='Global Increase', index=False)

    # 'US Comp' uses the same side-by-side layout.
    US_Comp_top.to_excel(writer, sheet_name='US Comp', startrow=0, startcol=0, index=False)
    US_Comp_bottom.to_excel(writer, sheet_name='US Comp', startrow=0,
                            startcol=len(US_Comp_top.columns) + 2, index=False)

    US_Increase.to_excel(writer, sheet_name='US Increase', index=False)
    States_Comp.to_excel(writer, sheet_name='States Comp', index=False)
    States_Increase.to_excel(writer, sheet_name='States Increase', index=False)

    # Per-location 2024 aggregates that all of the sheets above are built from.
    applicants_grouped.to_excel(writer, sheet_name='Raw Data', index=False)