# Module 1: Import and data wrangling

- Import the libraries used in this project

In [1]:
import pandas as pd
import numpy as np


## Importing CSV file using Pandas

In [2]:
# Load the raw job-posting CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; the notebook will not run elsewhere until the path is adjusted.
df = pd.read_csv('/Users/aimeetienle/Desktop/aimee_learn_data/ProjectIndeedJobPosting/indeed_kaggle.csv')

- Configure the Pandas display options so that a displayed DataFrame shows all rows and the full text of every cell

In [3]:
# Global display settings: they affect every DataFrame rendered after this cell.
pd.set_option('display.max_rows', None) # no row limit: every row of a displayed frame is rendered
pd.set_option('display.max_colwidth', None) # no truncation: long text cells are shown in full

- Glance at the dataset, look for Null and duplicate values

In [4]:
df.shape  # (rows, columns): the data has a total of 29184 rows and 8 columns

(29184, 8)

In [5]:
df.info() # non-null counts per column: 5 columns (Company, Location, Salary, Description, Job URL) have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        29184 non-null  object
 1   Company      29181 non-null  object
 2   Location     29182 non-null  object
 3   Salary       16283 non-null  object
 4   Description  22755 non-null  object
 5   Job URL      29181 non-null  object
 6   Date         29184 non-null  object
 7   State        29184 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


In [6]:
df.dtypes # every column is loaded as object; Salary should be float and Date needs to be converted to the datetime data type.

Title          object
Company        object
Location       object
Salary         object
Description    object
Job URL        object
Date           object
State          object
dtype: object

In [7]:
# Summary of the (all-object) columns: count, number of unique values, most frequent value and its frequency.
df.describe()

Unnamed: 0,Title,Company,Location,Salary,Description,Job URL,Date,State
count,29184,29181,29182,16283,22755,29181,29184,29184
unique,8173,4085,3817,3527,12956,29181,36,52
top,Data Scientist,Amazon.com Services LLC,Remote,Full-time,"Note: Apple benefit, compensation and employee stock programs are subject to eligibility requirements and other terms of the applicable plan or program.",https://www.indeed.com/rc/clk?jk=b31b63cb0d9fdd0f&bb=iyky3AQomuxeNprvj0oz9HyPara7dK1M9INizUv8kosQ394A7WGaCTtBbYUOGCiGAIxUmt1U3X2Kt-adR663FfscJlBBLKLs6Q3kY2IhY5ZFWlikYNcXP91yqruJz9PL&xkcb=SoCe67M3_jNDenxlnp0LbzkdCdPP&fccid=0e6b3ef446f683d5&vjs=3,2024-06-23,California
freq,1407,948,1340,4560,139,1,4745,5400


## Data Wrangling

- Drop the unused column `Job URL`

In [8]:
# Drop the 'Job URL' column, which is not used in the analysis.
# errors='ignore' keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
df = df.drop(columns='Job URL', errors='ignore')

- Drop duplicate rows if any

In [9]:
# Actually remove exact duplicate rows. The de-duplicated frame used to be
# computed only for display and never stored, so duplicates would have stayed in df.
df = df.drop_duplicates()
df.shape  # an unchanged row count means the dataset contained no duplicates

(29184, 7)

- Replace blank values with `NaN`, if any

In [10]:
# Treat empty strings as missing values across the whole frame.
df = df.replace('', np.nan)

- Working with `missing values`

In [11]:
# Print the number of missing entries for each column that has gaps.
missing_report = [
    ('The Company column is missng:', 'Company'),
    ('The Location column is missing:', 'Location'),
    ('The Salary column is missing:', 'Salary'),
    ('The description column is missing:', 'Description'),
]
for report_label, report_column in missing_report:
    print(report_label, df[report_column].isnull().sum(), 'values')

The Company column is missng: 3 values
The Location column is missing: 2 values
The Salary column is missing: 12901 values
The description column is missing: 6429 values


## Tasks:

1. Identify missing values where data exists in other columns (e.g., salary info in the Description column but missing in Salary).
2. Create new columns for analysis: Working Type and Level.
3. Extract City and State from the Location column.
4. Extract work mode from the Location column and fill the values in Work_Type column.
5. Clean Salary column by removing strings, split it into Min and Max salary, and calculate the average salary.
6. Fill missing Salary data using information from the Description column.
7. Convert Salary to float and Posting Date to datetime format.


### Method: Working with each column for each related task

__Company column__

- Define function to capitalize the first letter of each word

In [12]:
def custom_capitalize(text):
    """Capitalize the first letter of every whitespace-separated word.

    Each word goes through ``str.capitalize``, so the remaining letters of
    the word are lower-cased, and runs of whitespace collapse to single
    spaces because the words are re-joined with ' '.

    Non-string input (for example ``NaN`` or ``None`` standing in for a
    missing value) is returned unchanged instead of raising
    ``AttributeError``.
    """
    if not isinstance(text, str):
        return text
    return ' '.join(word.capitalize() for word in text.split())

# Capitalize only the company names that are present. Casting the whole column
# with astype(str) first would turn missing values into the literal text 'nan'
# (then 'Nan'), which hides them from later isna() checks.
company_known = df['Company'].notna()
df.loc[company_known, 'Company'] = (
    df.loc[company_known, 'Company'].astype(str).apply(custom_capitalize)
)

__Title column__

- Apply the above capitalize function to the `Title` column

In [13]:
# Normalize the casing of every job title, word by word.
df['Title'] = df['Title'].map(custom_capitalize)

- Some `Title` values state the working mode; look for these values and extract them into the new column `Work_Type`

In [14]:
# Extract the first work-mode keyword mentioned in the title (NaN when none is found).
# Matching is case-insensitive ((?i)) and the result is passed through
# str.capitalize() so that e.g. 'remote' and 'Remote' become one category
# instead of two; this mirrors how the Level column is normalized.
df['Work_Type'] = (
    df['Title']
    .str.extract(r'(?i)(Remote|Full-time|Part-time|Casual|Hybrid)', expand=False)
    .str.capitalize()
)

- Extract the `Job Level` from `Title` column

In [15]:
# Extract the first seniority keyword found in the title, case-insensitively ((?i)); NaN when none matches.
# NOTE(review): the keywords have no word boundaries, so 'Lead' would also match inside 'Leadership' — confirm this is acceptable.
df['Level'] = df['Title'].str.extract(r'(?i)(Associate|Graduate|Senior|Junior|Entry-level|Manager|Lead)', expand=False)

- If the values are `Null`, then Level is supposed to be `Mid-level`

In [16]:
# Titles without a seniority keyword default to 'Mid-level'; str.capitalize() then
# gives every label the same casing (first letter upper-case, the rest lower-case).
df['Level']=df['Level'].fillna('Mid-level').str.capitalize()

__Location column__

- Use split method to extract the `City, Postcode` from Location column to new columns

In [17]:
# Split on the first comma only (n=1): the text before it goes to City,
# everything after it goes to Post_Code (the digits are extracted from it in the next step).
df[['City','Post_Code']] = df['Location'].str.split(',', n=1, expand=True)

- Use a regular expression to extract the `Postcode` from the string (digits only)

In [18]:
# Keep only the first run of digits found after the comma; NaN when there is none.
post_code_digits = df['Post_Code'].str.extract(r'(\d+)', expand=False)
df['Post_Code'] = post_code_digits

- Extract the Work type from `Location` column

In [19]:
# Extract the work mode mentioned in Location (NaN when neither word appears).
# The previous equality test compared each value against the literal string
# 'Remote|remote|Hybrid|hybrid', which never matches, so the column was always
# empty. A regex extraction implements the intended "Remote or Hybrid" lookup;
# str.capitalize() yields the canonical labels 'Remote' / 'Hybrid'.
df['Work_Type1'] = (
    df['Location']
    .str.extract(r'(?i)\b(remote|hybrid)\b', expand=False)
    .str.capitalize()
)

- Fill the missing value in `Work_Type` column by the new Work_Type1 that extract from `Location`

In [20]:
# Combine both sources: the Location-based value (Work_Type1) takes precedence,
# and where it is NaN the existing Title-based Work_Type value is kept.
df['Work_Type'] = df['Work_Type1'].fillna(df['Work_Type'])

__Salary column__

- Define function using loop to extract digit from text

In [21]:
# Helper that reduces a salary text to its numeric characters
def extract_number(text):
    """Return only the digits and the characters ',', '.' and '-' found in ``text``.

    The kept characters stay in their original order and everything else
    (letters, currency symbols, spaces, ...) is discarded. A missing value
    (anything ``pd.isna`` reports as missing) yields ``np.nan``.
    """
    if pd.isna(text):
        return np.nan

    kept_punctuation = ',.-'
    # Only digits and the three punctuation marks survive, so the result can
    # never contain whitespace that would need trimming.
    return ''.join(ch for ch in text if ch.isdigit() or ch in kept_punctuation)

- Apply the function to `Salary` column

In [22]:
# Reduce every Salary text to its digits plus ',', '.' and '-'; missing values stay NaN.
df['Salary']=df['Salary'].apply(extract_number)

- Values that contain no number are turned into `Null` values

In [23]:
# Remove leading and trailing hyphens (e.g. the lone '-' left over from a text like 'Full-time').
df['Salary'] = df['Salary'].str.strip('-')

# Any value that is now an empty string, in any column, becomes NaN.
df.replace('', np.nan, inplace=True)

- Look for the missing salary information in the `Description` column by using a regular expression to extract the amount that follows a `$` sign

In [24]:
# First dollar amount mentioned in the description (with its '$' sign); NaN when there is none.
dollar_amount_pattern = r'(\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?)'
df['Salary1'] = df['Description'].str.extract(dollar_amount_pattern, expand=False)

- Fill the missing values in the `Salary` column with the new `Salary1` column extracted from the description

In [25]:
# Existing Salary values are kept; only the NaN entries are filled from Salary1.
df['Salary']= df['Salary'].fillna(df['Salary1'])

- When the salary is a range, separate it into two columns, `Min salary` and `Max salary`

In [26]:
# Split on the first hyphen only (n=1): the left part is the minimum, the right part the maximum.
# Values without a hyphen leave Max_salary empty; it is filled in the next step.
df[['Min_salary','Max_salary']]=df['Salary'].str.split('-', n=1, expand=True)

- For salary that is not a range, Min salary = Max salary

In [27]:
# A single (non-range) salary is used as both the minimum and the maximum.
df['Max_salary']=df['Max_salary'].fillna(df['Min_salary'])

- Convert the data type to float

In [28]:
# Remove thousands separators, '$' signs and surrounding blank space from both salary columns.
# regex=False makes both patterns literal text: '$' is a regex anchor, and the
# default value of `regex` differs between pandas versions, so relying on the
# default is not portable.
for salary_col in ('Min_salary', 'Max_salary'):
    df[salary_col] = (
        df[salary_col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .str.strip()
    )

In [29]:
# Convert to float.
# pd.to_numeric(errors='coerce') turns any leftover non-numeric text (for
# example an empty string) into NaN instead of aborting the whole cell with
# ValueError; astype(float) keeps the dtype float even when every parsed value
# is a whole number.
for salary_col in ('Min_salary', 'Max_salary'):
    df[salary_col] = pd.to_numeric(df[salary_col], errors='coerce').astype(float)

In [30]:
# Format the number with commas as thousands separators and no decimal places.
# This is a global display option: it changes how floats are rendered, not the stored values.

pd.options.display.float_format = '{:,.0f}'.format

- Compute the `Average salary` column as the mean of the minimum and maximum salary

In [31]:
# Midpoint of the salary range; NaN when either bound is missing.
salary_bounds_total = df['Min_salary'].add(df['Max_salary'])
df['Average_salary'] = salary_bounds_total.div(2)

- Keep only the salaries offered per year, assuming an annual salary is `greater than or equal to 30,000 dollars`

In [32]:
# Keep only postings whose average salary is at least 30,000; rows with a NaN
# average fail the comparison and are dropped as well.
# .copy() makes the result an independent DataFrame, so the column assignments
# in the following cells do not trigger SettingWithCopyWarning.
df = df[df['Average_salary'] >= 30000].copy()

__State column__

- If the `State` column holds the value 'Remote', then the city and the work type are also Remote

In [33]:
# City1 is 'Remote' where State is 'Remote' and NaN everywhere else.
state_is_remote = df['State'] == 'Remote'
df['City1'] = df['State'].where(state_is_remote)

- Update column city if state is `Remote`

In [34]:
# City1 ('Remote') takes precedence; every other row keeps its existing City value.
df['City'] = df['City1'].fillna(df['City'])

__City column__

- If city is remote then work type is remote

In [35]:
# Work_Type1 is overwritten: 'Remote' where City is exactly 'Remote', NaN everywhere else.
city_is_remote = df['City'] == 'Remote'
df['Work_Type1'] = df['City'].where(city_is_remote)

- Fill the Remote values to main Work_Type column

In [36]:
# 'Remote' from Work_Type1 overrides the current Work_Type; all other rows keep their existing value.
df['Work_Type'] = df['Work_Type1'].fillna(df['Work_Type'])

- Extract the `City` after the word 'in'

In [37]:
# Keep only the place name that follows ' in ' (e.g. 'Remote in Austin' -> 'Austin');
# values without ' in ' are kept as they are.
# The vectorized .str methods leave missing values as NaN instead of raising
# TypeError, and every value is whitespace-stripped, not only those that happen
# to contain the letters 'in'.
df['City'] = df['City'].str.split(' in ', n=1).str[-1].str.strip()

- If `Work_Type` is null, then suppose the type is 'on-site'

In [38]:
# Postings with no work mode detected so far are assumed to be on-site.
df['Work_Type'] = df['Work_Type'].fillna('On-Site')

__Date column__

- Convert the data type to `Datetime`

In [39]:
# Parse the Date strings into datetime64 values.
df['Date'] = pd.to_datetime(df['Date'])

__Drop un-used columns from current datasets__

In [40]:
# Remove the raw source columns and the intermediate helper columns that are no longer needed.
helper_columns = [
    'Location', 'Description', 'Salary1', 'Min_salary', 'Max_salary',
    'Work_Type1', 'Salary', 'Post_Code', 'City1',
]
df = df.drop(columns=helper_columns)

__Double check the missing values__

In [41]:
# Number of missing values remaining in each column.
df.isna().sum()

Title             0
Company           0
Date              0
State             0
Work_Type         0
Level             0
City              0
Average_salary    0
dtype: int64

In [42]:
# Final size of the cleaned dataset as (rows, columns).
df.shape

(9391, 8)

__Download the dataset to CSV file without indexing__

In [43]:
# Write the cleaned data to the current working directory; index=False leaves the row index out of the file.
df.to_csv('completed_data.csv', index = False)

--> The dataset cleaning is complete and the data is ready for analysis