# <ins>**Data Collection**</ins>

**Link to dataset** - https://www.kaggle.com/datasets/arshkon/linkedin-job-postings

In [None]:
# Required Imports
import pandas as pd
import json
from src.utils import print_dataframe_summary

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Data Type Conversion based on reading the contents of the .csv files

data_type_job_postings = {
    "job_id": "int64",
    "company_name": "string",
    "title": "string",
    "description": "string",
    "max_salary": "float64",
    "pay_period": "string",
    "location": "string",
    "company_id": "float64",  # need to convert to int64
    "views": "float64",  # need to convert to int64
    "med_salary": "float64",
    "min_salary": "float64",
    "formatted_work_type": "string",
    "applies": "float64",
    "original_listed_time": "float64",  # need to convert to date time
    "remote_allowed": "float64",
    "job_posting_url": "string",
    "application_url": "string",
    "application_type": "string",
    "expiry": "float64",  # need to convert to date time
    "closed_time": "float64",  # need to convert to date time
    "formatted_experience_level": "string",
    "skills_desc": "string",
    "listed_time": "float64",  # need to convert to date time
    "posting_domain": "string",
    "sponsored": "string",
    "work_type": "string",
    "currency": "string",
    "compensation_type": "string",
    "normalized_salary": "float64",
    "zip_code": "string",
    "fips": "string",
}

data_type_companies = {
    "company_id": "int64",
    "name": "string",
    "description": "string",
    "company_size": "Int64",
    "state": "string",
    "country": "string",
    "city": "string",
    "zip_code": "string",
    "address": "string",
    "url": "string",
}

data_type_company_industries = {
    "company_id": "int64",
    "industry": "string"
}

data_type_company_specialities = {
    "company_id": "int64",
    "speciality": "string"
}

data_type_employee_counts = {
    "company_id": "int64",
    "employee_count": "int64",
    "follower_count": "int64",
    "time_recorded": "float64",  # need to convert to date time
}

data_type_salaries = {
    "salary_id": "int64",
    "job_id": "int64",
    "max_salary": "float64",
    "med_salary": "float64",
    "min_salary": "float64",
    "pay_period": "string",
    "currency": "string",
    "compensation_type": "string",
}

data_type_job_industries = {
    "job_id": "int64",
    "industry_id": "int64"
}

data_type_job_skills = {
    "job_id": "int64",
    "skill_abr": "string"
}

data_type_job_benefits = {
    "job_id": "int64",
    "inferred": "int64",
    "type": "string"
}

data_type_skills = {
    "skill_abr": "string",
    "skill_name": "string"
}

data_type_industries = {
    "industry_id": "int64",
    "industry_name": "string"
}

In [3]:
# Load Raw Data from the csv files into Pandas dataframes with the correct data types

df_job_postings = pd.read_csv("../data/raw/postings.csv", dtype=data_type_job_postings)

df_companies = pd.read_csv("../data/raw/companies.csv", dtype=data_type_companies)
df_company_industries = pd.read_csv("../data/raw/company_industries.csv", dtype=data_type_company_industries)
df_company_specialities = pd.read_csv("../data/raw/company_specialities.csv", dtype=data_type_company_specialities)
df_employee_counts = pd.read_csv("../data/raw/employee_counts.csv", dtype=data_type_employee_counts)

df_salaries = pd.read_csv("../data/raw/salaries.csv", dtype=data_type_salaries)
df_job_industries = pd.read_csv("../data/raw/job_industries.csv", dtype=data_type_job_industries)
df_job_skills = pd.read_csv("../data/raw/job_skills.csv", dtype=data_type_job_skills)
df_job_benefits = pd.read_csv("../data/raw/benefits.csv", dtype=data_type_job_benefits)

df_skills = pd.read_csv("../data/raw/skills.csv", dtype=data_type_skills)
df_industries = pd.read_csv("../data/raw/industries.csv", dtype=data_type_industries)

In [4]:
# Manual correction of few data types
df_job_postings["original_listed_time"] = pd.to_datetime(df_job_postings["original_listed_time"], unit="ms")
df_job_postings["closed_time"] = pd.to_datetime(df_job_postings["closed_time"], unit="ms")
df_job_postings["listed_time"] = pd.to_datetime(df_job_postings["listed_time"], unit="ms")
df_job_postings["expiry"] = pd.to_datetime(df_job_postings["expiry"], unit="ms")
df_job_postings["company_id"] = df_job_postings["company_id"].astype("Int64")
df_job_postings["views"] = df_job_postings["views"].astype("Int64")

df_employee_counts["time_recorded"] = pd.to_datetime(df_employee_counts["time_recorded"], unit="ms")

------------

# <ins>**Data Pre-Processing**</ins>

### **1. Companies**

####  **1.1 Companies Data Schema (`companies.csv`)**

  | Column         | Description                   |
  |----------------|-------------------------------|
  | company_id     | Unique identifier for company |
  | name           | Company Name                  |
  | description    | Description of the company    |
  | company_size   | Size of the company           |
  | state          | State where the company is located |
  | country        | Country where the company is located |
  | city           | City where the company is located |
  | zip_code       | ZIP code of company"s location |
  | address        | Address of the company        |
  | url            | URL of the company website    |

  <br>

####  **1.2 Company Industries Data Schema (`company_industries.csv`)**

  | Column      | Description                          |
  |-------------|--------------------------------------|
  | company_id  | Unique identifier for company        |
  | industry    | Industry associated with the company |

  <br>

####  **1.3 Company Specialities Data Schema (`company_specialities.csv`)**

  | Column      | Description                          |
  |-------------|--------------------------------------|
  | company_id  | Unique identifier for company        |
  | speciality  | Speciality of the company            |

  <br>

####  **1.4 Employee Counts Data Schema (`employee_counts.csv`)**

  | Column         | Description                                  |
  |----------------|----------------------------------------------|
  | company_id     | Unique identifier for company                |
  | employee_count | Number of employees in the company           |
  | follower_count | Number of followers for the company          |
  | time_recorded  | Time when the count was recorded             |

In [5]:
print("Company Industries:\n")
print_dataframe_summary(df_company_industries)

Company Industries:

Shape: (24375, 2)
Number of Rows: 24375
Number of Columns: 2
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'company_id': 0, 'industry': 0}

Columns with Nulls:
None

Column Data Types:
{'company_id': dtype('int64'), 'industry': string[python]}

Unique Values per Column:
{'company_id': 24365, 'industry': 144}


In [6]:
print("Company Specialities:\n")
print_dataframe_summary(df_company_specialities)

Company Specialities:

Shape: (169387, 2)
Number of Rows: 169387
Number of Columns: 2
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'company_id': 0, 'speciality': 0}

Columns with Nulls:
None

Column Data Types:
{'company_id': dtype('int64'), 'speciality': string[python]}

Unique Values per Column:
{'company_id': 17780, 'speciality': 82960}


In [7]:
print("Employee Counts:\n")
print_dataframe_summary(df_employee_counts)

Employee Counts:

Shape: (35787, 4)
Number of Rows: 35787
Number of Columns: 4
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'company_id': 0, 'employee_count': 0, 'follower_count': 0, 'time_recorded': 0}

Columns with Nulls:
None

Column Data Types:
{'company_id': dtype('int64'), 'employee_count': dtype('int64'), 'follower_count': dtype('int64'), 'time_recorded': dtype('<M8[ns]')}

Unique Values per Column:
{'company_id': 24473, 'employee_count': 10033, 'follower_count': 25554, 'time_recorded': 3531}


In [8]:
# Cleaning Company Industries
df_company_industries["industry"] = (df_company_industries["industry"].str.strip().str.upper())
df_company_industries_agg = (df_company_industries.groupby("company_id")["industry"].apply(list).reset_index())

df_company_industries_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24365 entries, 0 to 24364
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   company_id  24365 non-null  int64 
 1   industry    24365 non-null  object
dtypes: int64(1), object(1)
memory usage: 380.8+ KB


In [9]:
# Cleaning Company Specialities
df_company_specialities["speciality"] = (df_company_specialities["speciality"].str.strip().str.upper())
df_company_specialities_agg = (df_company_specialities.groupby("company_id")["speciality"].apply(list).reset_index())

df_company_specialities_agg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17780 entries, 0 to 17779
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   company_id  17780 non-null  int64 
 1   speciality  17780 non-null  object
dtypes: int64(1), object(1)
memory usage: 277.9+ KB


In [10]:
# Selecting only the latest count of employees since there are multiple counts for different times
df_employee_counts_latest = df_employee_counts.loc[df_employee_counts.groupby("company_id")["time_recorded"].idxmax()]

print_dataframe_summary(df_employee_counts_latest)

Shape: (24473, 4)
Number of Rows: 24473
Number of Columns: 4
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'company_id': 0, 'employee_count': 0, 'follower_count': 0, 'time_recorded': 0}

Columns with Nulls:
None

Column Data Types:
{'company_id': dtype('int64'), 'employee_count': dtype('int64'), 'follower_count': dtype('int64'), 'time_recorded': dtype('<M8[ns]')}

Unique Values per Column:
{'company_id': 24473, 'employee_count': 5100, 'follower_count': 16128, 'time_recorded': 3374}


In [11]:
print("Company Dataframe:")
print_dataframe_summary(df_companies)

Company Dataframe:
Shape: (24473, 10)
Number of Rows: 24473
Number of Columns: 10
Total Duplicates: 0
Total Null Values: 3145

Null Values per Column:
{'company_id': 0, 'name': 1, 'description': 297, 'company_size': 2774, 'state': 22, 'country': 0, 'city': 1, 'zip_code': 28, 'address': 22, 'url': 0}

Columns with Nulls:
['name', 'description', 'company_size', 'state', 'city', 'zip_code', 'address']

Column Data Types:
{'company_id': dtype('int64'), 'name': string[python], 'description': string[python], 'company_size': Int64Dtype(), 'state': string[python], 'country': string[python], 'city': string[python], 'zip_code': string[python], 'address': string[python], 'url': string[python]}

Unique Values per Column:
{'company_id': 24473, 'name': 24428, 'description': 24164, 'company_size': 7, 'state': 788, 'country': 81, 'city': 4124, 'zip_code': 7779, 'address': 19476, 'url': 24473}


In [12]:
# Selecting only the required columns from Company dataframe
df_company_final = df_companies[["company_id", "name", "country", "state", "city", "url"]]

# Cleaning Company dataframe
df_company_final["name"] = df_company_final["name"].str.strip().str.upper()
df_company_final["country"] = df_company_final["country"].str.strip().str.upper()
df_company_final["state"] = df_company_final["state"].str.strip().str.upper()
df_company_final["city"] = df_company_final["city"].str.strip().str.upper()

# Drop columns with NULL values in any of the columns - since we will only work companies having a name, country, state, city and a url
df_company_final = df_company_final.dropna().reset_index(drop=True)

In [13]:
# Merge with industries
df_company_final = pd.merge(df_company_final, df_company_industries_agg, on="company_id", how="left")

# Merge with specialities
df_company_final = pd.merge(df_company_final, df_company_specialities_agg, on="company_id", how="left")

# Merge with employee counts
df_company_final = pd.merge(df_company_final, df_employee_counts_latest, on="company_id", how="left")

In [14]:
# Handling NULL values in "industry" and "speciality" columns
df_company_final["industry"] = df_company_final["industry"].fillna("[OTHER]")
df_company_final["speciality"] = df_company_final["speciality"].fillna("[OTHER]")

In [15]:
print("Details of Company Dataframe after cleaning and merging:")
df_company_final.info()

Details of Company Dataframe after cleaning and merging:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24450 entries, 0 to 24449
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   company_id      24450 non-null  int64         
 1   name            24450 non-null  string        
 2   country         24450 non-null  string        
 3   state           24450 non-null  string        
 4   city            24450 non-null  string        
 5   url             24450 non-null  string        
 6   industry        24450 non-null  object        
 7   speciality      24450 non-null  object        
 8   employee_count  24450 non-null  int64         
 9   follower_count  24450 non-null  int64         
 10  time_recorded   24450 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(3), object(2), string(5)
memory usage: 2.1+ MB


In [16]:
pd.set_option("display.max_rows", None)
df_company_final.head()

Unnamed: 0,company_id,name,country,state,city,url,industry,speciality,employee_count,follower_count,time_recorded
0,1009,IBM,US,NY,"ARMONK, NEW YORK",https://www.linkedin.com/company/ibm,[IT SERVICES AND IT CONSULTING],"[CLOUD, MOBILE, COGNITIVE, SECURITY, RESEARCH,...",311223,16314846,1970-01-20 19:58:21.255
1,1016,GE HEALTHCARE,US,0,CHICAGO,https://www.linkedin.com/company/gehealthcare,[HOSPITALS AND HEALTH CARE],"[HEALTHCARE, BIOTECHNOLOGY]",57001,2196350,1970-01-20 19:59:29.031
2,1025,HEWLETT PACKARD ENTERPRISE,US,TEXAS,HOUSTON,https://www.linkedin.com/company/hewlett-packa...,[IT SERVICES AND IT CONSULTING],[OTHER],79559,3588329,1970-01-20 19:57:33.258
3,1028,ORACLE,US,TEXAS,AUSTIN,https://www.linkedin.com/company/oracle,[IT SERVICES AND IT CONSULTING],"[ENTERPRISE, SOFTWARE, APPLICATIONS, DATABASE,...",191374,9497909,1970-01-20 19:59:00.073
4,1033,ACCENTURE,IE,0,DUBLIN 2,https://www.linkedin.com/company/accenture,[BUSINESS CONSULTING AND SERVICES],"[MANAGEMENT CONSULTING, SYSTEMS INTEGRATION AN...",565191,11890321,1970-01-20 19:59:29.451


In [17]:
df_company_final["industry"] = df_company_final["industry"].apply(json.dumps)
df_company_final["speciality"] = df_company_final["speciality"].apply(json.dumps)
df_company_final.to_csv("../data/transformed/company_final.csv", encoding="utf-8", index=False, header=True)

### <ins>**2. Job Postings**</ins>

#### **2.1 Job Postings Data Schema (`job_skills.csv`)**

| Column                     | Description                                                  |
| -------------------------- | ------------------------------------------------------------ |
| job_id                     | The job ID as defined by LinkedIn                            |
| company_name               | Company Name                                                 |
| title                      | Job title                                                    |
| description                | Job description                                              |
| max_salary                 | Maximum salary                                               |
| pay_period                 | Pay period for salary (Hourly, Monthly, Yearly)              |
| location                   | Job location                                                 |
| company_id                 | Identifier for the company associated with the job posting   |
| views                      | Number of times the job posting has been viewed              |
| med_salary                 | Median salary                                                |
| min_salary                 | Minimum salary                                               |
| formatted_work_type        | Type of work (Fulltime, Parttime, Contract)                  |
| applies                    | Number of applications that have been submitted              |
| original_listed_time       | Original time the job was listed                             |
| remote_allowed             | Whether job permits remote work                              |
| job_posting_url            | URL to the job posting on a platform                         |
| application_url            | URL where applications can be submitted                      |
| application_type           | Type of application process (offsite, complex/simple onsite) |
| expiry                     | Expiration date or time for the job listing                  |
| closed_time                | Time to close job listing                                    |
| formatted_experience_level | Job experience level (entry, associate, executive, etc)      |
| skills_desc                | Description detailing required skills for job                |
| listed_time                | Time when the job was listed                                 |
| posting_domain             | Domain of the website with application                       |
| sponsored                  | Whether the job listing is sponsored or promoted             |
| work_type                  | Type of work associated with the job                         |
| currency                   | Currency in which the salary is provided                     |
| compensation_type          | Type of compensation for the job                             |
| normalized_salary          | Normalized salary                                            |
| zip_code                   | ZIP code of company"s headquarters                           |
| fips                       | Federal Information Processing Standard code                 |

#### **2.2 Job Skills Data Schema (`job_skills.csv`)**

| Column    | Description                          |
| --------- | ------------------------------------ |
| job_id    | The job ID as defined by LinkedIn    |
| skill_abr | Abbreviation or identifier for skill |

  <br>

#### **2.3 Job Industries Data Schema (`job_industries.csv`)**

| Column      | Description                          |
| ----------- | ------------------------------------ |
| job_id      | The job ID as defined by LinkedIn    |
| industry_id | Identifier for the industry category |

  <br>

#### **2.4 Benefits Data Schema (`benefits.csv`)**

| Column   | Description                       |
| -------- | --------------------------------- |
| job_id   | The job ID as defined by LinkedIn |
| inferred | Whether the benefit is inferred   |
| type     | Type of benefit                   |

  <br>

#### **2.5 Salaries Data Schema (`salaries.csv`)**

| Column            | Description                                     |
| ----------------- | ----------------------------------------------- |
| salary_id         | Unique identifier for the salary entry          |
| job_id            | The job ID as defined by LinkedIn               |
| max_salary        | Maximum salary                                  |
| med_salary        | Median salary                                   |
| min_salary        | Minimum salary                                  |
| pay_period        | Pay period for salary (Hourly, Monthly, Yearly) |
| currency          | Currency in which the salary is provided        |
| compensation_type | Type of compensation for the job                |


In [18]:
print("Skills Dataframe:")
print_dataframe_summary(df_skills)

Skills Dataframe:
Shape: (35, 2)
Number of Rows: 35
Number of Columns: 2
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'skill_abr': 0, 'skill_name': 0}

Columns with Nulls:
None

Column Data Types:
{'skill_abr': string[python], 'skill_name': string[python]}

Unique Values per Column:
{'skill_abr': 35, 'skill_name': 35}


In [19]:
print("Job Skills Dataframe:")
print_dataframe_summary(df_job_skills)

Job Skills Dataframe:
Shape: (213768, 2)
Number of Rows: 213768
Number of Columns: 2
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'job_id': 0, 'skill_abr': 0}

Columns with Nulls:
None

Column Data Types:
{'job_id': dtype('int64'), 'skill_abr': string[python]}

Unique Values per Column:
{'job_id': 126807, 'skill_abr': 35}


In [20]:
# Clean df_skills
df_skills["skill_abr"] = df_skills["skill_abr"].str.strip().str.upper()
df_skills["skill_name"] = df_skills["skill_name"].str.strip().str.upper()

# Clean df_job_skills
df_job_skills["skill_abr"] = df_job_skills["skill_abr"].str.strip().str.upper()

# Provide the skill_name for the skill_abr
df_job_skills_final = pd.merge(df_job_skills, df_skills, on="skill_abr", how="left")

# Aggregate the skills since one Job can require multiple skills
df_job_skills_final = (df_job_skills_final.groupby("job_id")["skill_name"].apply(list).reset_index())

df_job_skills_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 126807 entries, 0 to 126806
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   job_id      126807 non-null  int64 
 1   skill_name  126807 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.9+ MB


In [21]:
print("Job Benefits Dataframe:")
print_dataframe_summary(df_job_benefits)

Job Benefits Dataframe:
Shape: (67943, 3)
Number of Rows: 67943
Number of Columns: 3
Total Duplicates: 0
Total Null Values: 0

Null Values per Column:
{'job_id': 0, 'inferred': 0, 'type': 0}

Columns with Nulls:
None

Column Data Types:
{'job_id': dtype('int64'), 'inferred': dtype('int64'), 'type': string[python]}

Unique Values per Column:
{'job_id': 30023, 'inferred': 2, 'type': 12}


In [22]:
# Clean df_job_benefits
df_job_benefits_final = df_job_benefits.drop("inferred", axis=1)
df_job_benefits_final["type"] = df_job_benefits_final["type"].str.strip().str.upper()
df_job_benefits_final = (df_job_benefits_final.groupby("job_id")["type"].apply(list).reset_index())

# rename type to benefits
df_job_benefits_final.rename(columns={"type": "benefits"}, inplace=True)

df_job_benefits_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30023 entries, 0 to 30022
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   job_id    30023 non-null  int64 
 1   benefits  30023 non-null  object
dtypes: int64(1), object(1)
memory usage: 469.2+ KB


In [23]:
df_job_benefits_final.head()

Unnamed: 0,job_id,benefits
0,23221523,[401(K)]
1,56482768,"[401(K), DENTAL INSURANCE, DISABILITY INSURANCE]"
2,69333422,"[MEDICAL INSURANCE, VISION INSURANCE, DENTAL I..."
3,95428182,"[MEDICAL INSURANCE, DENTAL INSURANCE, DISABILI..."
4,111513530,"[MEDICAL INSURANCE, PAID MATERNITY LEAVE, PENS..."


In [24]:
print("Job Postings Dataframe:")
print_dataframe_summary(df_job_postings)

Job Postings Dataframe:
Shape: (123849, 31)
Number of Rows: 123849
Number of Columns: 31
Total Duplicates: 0
Total Null Values: 1269564

Null Values per Column:
{'job_id': 0, 'company_name': 1719, 'title': 0, 'description': 7, 'max_salary': 94056, 'pay_period': 87776, 'location': 0, 'company_id': 1717, 'views': 1689, 'med_salary': 117569, 'min_salary': 94056, 'formatted_work_type': 0, 'applies': 100529, 'original_listed_time': 0, 'remote_allowed': 108603, 'job_posting_url': 0, 'application_url': 36665, 'application_type': 0, 'expiry': 0, 'closed_time': 122776, 'formatted_experience_level': 29409, 'skills_desc': 121410, 'listed_time': 0, 'posting_domain': 39968, 'sponsored': 0, 'work_type': 0, 'currency': 87776, 'compensation_type': 87776, 'normalized_salary': 87776, 'zip_code': 20872, 'fips': 27415}

Columns with Nulls:
['company_name', 'description', 'max_salary', 'pay_period', 'company_id', 'views', 'med_salary', 'min_salary', 'applies', 'remote_allowed', 'application_url', 'closed_t

In [25]:
df_job_postings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   job_id                      123849 non-null  int64         
 1   company_name                122130 non-null  string        
 2   title                       123849 non-null  string        
 3   description                 123842 non-null  string        
 4   max_salary                  29793 non-null   float64       
 5   pay_period                  36073 non-null   string        
 6   location                    123849 non-null  string        
 7   company_id                  122132 non-null  Int64         
 8   views                       122160 non-null  Int64         
 9   med_salary                  6280 non-null    float64       
 10  min_salary                  29793 non-null   float64       
 11  formatted_work_type         123849 non-

In [26]:
pd.set_option("display.max_columns", None)
df_job_postings.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,min_salary,formatted_work_type,applies,original_listed_time,remote_allowed,job_posting_url,application_url,application_type,expiry,closed_time,formatted_experience_level,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20,,17.0,Full-time,2.0,2024-04-17 23:45:08,,https://www.linkedin.com/jobs/view/921716/?trk...,,ComplexOnsiteApply,2024-05-17 23:45:08,NaT,,Requirements: We are seeking a College or Gr...,2024-04-17 23:45:08,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540,34021
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1,,30.0,Full-time,,2024-04-11 17:51:27,,https://www.linkedin.com/jobs/view/1829192/?tr...,,ComplexOnsiteApply,2024-05-11 17:51:27,NaT,,,2024-04-11 17:51:27,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521,8069
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8,,45000.0,Full-time,,2024-04-16 14:26:54,,https://www.linkedin.com/jobs/view/10998357/?t...,,ComplexOnsiteApply,2024-05-16 14:26:54,NaT,,We are currently accepting resumes for FOH - A...,2024-04-16 14:26:54,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202,39061
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16,,140000.0,Full-time,,2024-04-12 04:23:32,,https://www.linkedin.com/jobs/view/23221523/?t...,,ComplexOnsiteApply,2024-05-12 04:23:32,NaT,,This position requires a baseline understandin...,2024-04-12 04:23:32,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040,36059
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3,,60000.0,Full-time,,2024-04-18 14:52:23,,https://www.linkedin.com/jobs/view/35982263/?t...,,ComplexOnsiteApply,2024-05-18 14:52:23,NaT,,,2024-04-18 14:52:23,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601,19057


In [27]:
# Select only the required columns
df_job_postings_active = df_job_postings[
    [
        "job_id",
        "job_posting_url",
        "company_id",
        "title",
        "description",
        "formatted_experience_level",
        "work_type",
        "remote_allowed",
        "min_salary",
        "max_salary",
        "normalized_salary",
        "location",
        "currency",
        "pay_period",
        "views",
        "applies",
        "listed_time",
        "original_listed_time",
        "expiry",
    ]
]

# Get the jobs which are only active
df_job_postings_active = df_job_postings_active[df_job_postings["closed_time"].isnull()]

# Clean and prepare all "string" columns
df_job_postings_active["title"] = df_job_postings_active["title"].str.strip().str.upper()
df_job_postings_active["description"] = df_job_postings_active["description"].str.strip().str.replace("<.*?>", "").str.replace("[^a-z\s]", "")
df_job_postings_active.dropna(subset=["description"], inplace=True)

df_job_postings_active["formatted_experience_level"] = df_job_postings_active["formatted_experience_level"].str.strip().str.upper()
df_job_postings_active["formatted_experience_level"] = df_job_postings_active["formatted_experience_level"].fillna("OTHER")


df_job_postings_active["work_type"] = df_job_postings_active["work_type"].str.strip().str.upper()

# Clean remote_allowed
df_job_postings_active["remote_allowed"] = df_job_postings_active["remote_allowed"].replace({"<NA>": 0, "1.0": 1}).fillna(0).astype(int)

# Clean Salary
df_job_postings_active["min_salary"] = df_job_postings_active["min_salary"].fillna(0)
df_job_postings_active["max_salary"] = df_job_postings_active["max_salary"].fillna(0)
df_job_postings_active["normalized_salary"] = df_job_postings_active["normalized_salary"].fillna(0)

# Clean currency
df_job_postings_active["currency"] = df_job_postings_active["currency"].str.strip().str.upper()
df_job_postings_active["currency"] = df_job_postings_active["currency"].fillna("USD")

# Clean Pay Period
df_job_postings_active["pay_period"] = df_job_postings_active["pay_period"].str.strip().str.upper()
df_job_postings_active["pay_period"] = df_job_postings_active["pay_period"].fillna("YEARLY")

# Clean views, applies
df_job_postings_active["views"] = df_job_postings_active["views"].fillna(0)
df_job_postings_active["applies"] = df_job_postings_active["applies"].fillna(0)

In [28]:
# Merge job postings and job skills
df_job_postings_active = pd.merge(df_job_postings_active, df_job_skills_final, on="job_id", how="left")

# Merge job postings and benefits
df_job_postings_active = pd.merge(df_job_postings_active, df_job_benefits_final, on="job_id", how="left")

# handle null values
df_job_postings_active["benefits"] = df_job_postings_active["benefits"].fillna("[]")
df_job_postings_active["skill_name"] = df_job_postings_active["skill_name"].fillna("[]")

In [29]:
pd.set_option("display.max_columns", None)
df_job_postings_active.head()

Unnamed: 0,job_id,job_posting_url,company_id,title,description,formatted_experience_level,work_type,remote_allowed,min_salary,max_salary,normalized_salary,location,currency,pay_period,views,applies,listed_time,original_listed_time,expiry,skill_name,benefits
0,921716,https://www.linkedin.com/jobs/view/921716/?trk...,2774458.0,MARKETING COORDINATOR,Job descriptionA leading real estate firm in N...,OTHER,FULL_TIME,0,17.0,20.0,38480.0,"Princeton, NJ",USD,HOURLY,20,2.0,2024-04-17 23:45:08,2024-04-17 23:45:08,2024-05-17 23:45:08,"[MARKETING, SALES]",[]
1,1829192,https://www.linkedin.com/jobs/view/1829192/?tr...,,MENTAL HEALTH THERAPIST/COUNSELOR,"At Aspen Therapy and Wellness , we are committ...",OTHER,FULL_TIME,0,30.0,50.0,83200.0,"Fort Collins, CO",USD,HOURLY,1,0.0,2024-04-11 17:51:27,2024-04-11 17:51:27,2024-05-11 17:51:27,[HEALTH CARE PROVIDER],[]
2,10998357,https://www.linkedin.com/jobs/view/10998357/?t...,64896719.0,ASSITANT RESTAURANT MANAGER,The National Exemplar is accepting application...,OTHER,FULL_TIME,0,45000.0,65000.0,55000.0,"Cincinnati, OH",USD,YEARLY,8,0.0,2024-04-16 14:26:54,2024-04-16 14:26:54,2024-05-16 14:26:54,"[MANAGEMENT, MANUFACTURING]",[]
3,23221523,https://www.linkedin.com/jobs/view/23221523/?t...,766262.0,SENIOR ELDER LAW / TRUSTS AND ESTATES ASSOCIAT...,Senior Associate Attorney - Elder Law / Trusts...,OTHER,FULL_TIME,0,140000.0,175000.0,157500.0,"New Hyde Park, NY",USD,YEARLY,16,0.0,2024-04-12 04:23:32,2024-04-12 04:23:32,2024-05-12 04:23:32,[OTHER],[401(K)]
4,35982263,https://www.linkedin.com/jobs/view/35982263/?t...,,SERVICE TECHNICIAN,Looking for HVAC service tech with experience ...,OTHER,FULL_TIME,0,60000.0,80000.0,70000.0,"Burlington, IA",USD,YEARLY,3,0.0,2024-04-18 14:52:23,2024-04-18 14:52:23,2024-05-18 14:52:23,[INFORMATION TECHNOLOGY],[]


### <ins>**3. Job Postings with Company Details**</ins>

In [30]:
df_job_postings_final = pd.merge(df_job_postings_active, df_company_final, on="company_id", how="inner")

df_job_postings_final = df_job_postings_final[
    [
        "job_id",
        "job_posting_url",
        "company_id",
        "name",
        "country",
        "state",
        "city",
        "title",
        "description",
        "formatted_experience_level",
        "work_type",
        "remote_allowed",
        "min_salary",
        "max_salary",
        "normalized_salary",
        "currency",
        "pay_period",
        "views",
        "applies",
        "listed_time",
        "original_listed_time",
        "expiry",
        "url",
        "industry",
        "speciality",
        "benefits",
        "employee_count",
        "follower_count",
        "time_recorded",
    ]
]

In [31]:
df_job_postings_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121032 entries, 0 to 121031
Data columns (total 29 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   job_id                      121032 non-null  int64         
 1   job_posting_url             121032 non-null  string        
 2   company_id                  121032 non-null  Int64         
 3   name                        121032 non-null  string        
 4   country                     121032 non-null  string        
 5   state                       121032 non-null  string        
 6   city                        121032 non-null  string        
 7   title                       121032 non-null  string        
 8   description                 121032 non-null  string        
 9   formatted_experience_level  121032 non-null  string        
 10  work_type                   121032 non-null  string        
 11  remote_allowed              121032 non-

In [32]:
pd.set_option("display.max_columns", None)
df_job_postings_final.head()

Unnamed: 0,job_id,job_posting_url,company_id,name,country,state,city,title,description,formatted_experience_level,work_type,remote_allowed,min_salary,max_salary,normalized_salary,currency,pay_period,views,applies,listed_time,original_listed_time,expiry,url,industry,speciality,benefits,employee_count,follower_count,time_recorded
0,921716,https://www.linkedin.com/jobs/view/921716/?trk...,2774458,CORCORAN SAWYER SMITH,US,NJ,JERSEY CITY,MARKETING COORDINATOR,Job descriptionA leading real estate firm in N...,OTHER,FULL_TIME,0,17.0,20.0,38480.0,USD,HOURLY,20,2.0,2024-04-17 23:45:08,2024-04-17 23:45:08,2024-05-17 23:45:08,https://www.linkedin.com/company/corcoran-sawy...,"[""REAL ESTATE""]","[""REAL ESTATE"", ""NEW DEVELOPMENT""]",[],402,2351,1970-01-20 19:56:49.275
1,10998357,https://www.linkedin.com/jobs/view/10998357/?t...,64896719,THE NATIONAL EXEMPLAR,US,OHIO,MARIEMONT,ASSITANT RESTAURANT MANAGER,The National Exemplar is accepting application...,OTHER,FULL_TIME,0,45000.0,65000.0,55000.0,USD,YEARLY,8,0.0,2024-04-16 14:26:54,2024-04-16 14:26:54,2024-05-16 14:26:54,https://www.linkedin.com/company/the-national-...,"[""RESTAURANTS""]","""[OTHER]""",[],15,40,1970-01-20 19:54:39.321
2,23221523,https://www.linkedin.com/jobs/view/23221523/?t...,766262,"ABRAMS FENSTERMAN, LLP",US,NEW YORK,LAKE SUCCESS,SENIOR ELDER LAW / TRUSTS AND ESTATES ASSOCIAT...,Senior Associate Attorney - Elder Law / Trusts...,OTHER,FULL_TIME,0,140000.0,175000.0,157500.0,USD,YEARLY,16,0.0,2024-04-12 04:23:32,2024-04-12 04:23:32,2024-05-12 04:23:32,https://www.linkedin.com/company/abrams-fenste...,"[""LAW PRACTICE""]","[""CIVIL LITIGATION"", ""CORPORATE & SECURITIES L...",[401(K)],222,2427,1970-01-20 19:48:17.299
3,91700727,https://www.linkedin.com/jobs/view/91700727/?t...,1481176,DOWNTOWN RALEIGH ALLIANCE,US,NORTH CAROLINA,RALEIGH,ECONOMIC DEVELOPMENT AND PLANNING INTERN,Job summary:The Economic Development & Plannin...,OTHER,INTERNSHIP,0,14.0,20.0,35360.0,USD,HOURLY,9,4.0,2024-04-18 16:01:39,2024-04-18 16:01:39,2024-05-18 16:01:39,https://www.linkedin.com/company/downtownralei...,"[""NON-PROFIT ORGANIZATIONS""]","[""ECONOMIC DEVELOPMENT"", ""CLEAN & SAFE AMBASSO...",[],22,7825,1970-01-20 19:57:39.851
4,103254301,https://www.linkedin.com/jobs/view/103254301/?...,81942316,RAW CEREAL,US,CA,LOS ANGELES,PRODUCER,Company DescriptionRaw Cereal is a creative de...,OTHER,CONTRACT,1,60000.0,300000.0,180000.0,USD,YEARLY,7,1.0,2024-04-11 18:43:39,2024-04-11 18:43:39,2024-05-11 18:43:39,https://www.linkedin.com/company/raw-cereal,"[""DESIGN SERVICES""]","""[OTHER]""",[],11,447,1970-01-20 19:47:41.202


In [33]:
# Convert lists to JSON strings and save to CSV
df_job_postings_final["industry"] = df_job_postings_final["industry"].apply(json.dumps)
df_job_postings_final["speciality"] = df_job_postings_final["speciality"].apply(json.dumps)
df_job_postings_final["benefits"] = df_job_postings_final["benefits"].apply(json.dumps)

df_job_postings_final.to_csv("../data/transformed/postings_final.csv", encoding="utf-8", index=False, header=True)