# Module 1: Import and data wrangling

### Import libraries

In [54]:
import pandas as pd
import numpy as np

- Download and read the file from Google drive

In [55]:
df = pd.read_csv('../indeed_kaggle.csv')

In [56]:
df.head(10)

Unnamed: 0,Title,Company,Location,Salary,Description,Job URL,Date,State
0,Data Scientist,"DESE Research, Inc.","Huntsville, AL 35806",,"Familiarity with advanced machine learning, da...",https://www.indeed.com/rc/clk?jk=b31b63cb0d9fd...,2024-06-23,Alabama
1,Senior Data Analyst,PCI Government Services,"Hybrid work in Huntsville, AL 35808","From $85,000 a year",Must have strong technical skills in areas suc...,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,2024-06-23,Alabama
2,Data Scientist,"Interclypse, Inc.","Huntsville, AL",Full-time,Excellent interpersonal skills and ability to ...,https://www.indeed.com/rc/clk?jk=c34e8a98b5f9a...,2024-06-23,Alabama
3,"Data Scientist, Mid",Booz Allen,"Huntsville, AL","$75,600 - $172,000 a year","As a data scientist, you’re excited at the pro...",https://www.indeed.com/rc/clk?jk=908b996e5ba98...,2024-06-23,Alabama
4,Senior Data Analyst (U.S. remote eligible),"Eternal Word Television Network, Inc.",Remote in Alabama,,You are experienced using web analytics and Go...,https://www.indeed.com/rc/clk?jk=400cd0aab0d76...,2024-06-23,Alabama
5,AIMSS - Data Scientist (All Levels),Intuitive Research and Technology Corporation,"Huntsville, AL 35805",Full-time,You will have opportunities to understand mach...,https://www.indeed.com/rc/clk?jk=bf7e7f21043de...,2024-06-23,Alabama
6,STATISTICIAN I,University of Alabama at Birmingham,"Birmingham, AL",Day shift,Collects and analyzes statistical data.\nColle...,https://www.indeed.com/rc/clk?jk=b78b78e795878...,2024-06-23,Alabama
7,Data Scientist (Top Secret),"Spry Methods, Inc","Huntsville, AL",Full-time,Stay updated with the latest advancements in d...,https://www.indeed.com/rc/clk?jk=4144324e302d6...,2024-06-23,Alabama
8,Senior Data Scientist,Hibbett | City Gear,"Birmingham, AL",Full-time,"Develop project plans, track progress, and com...",https://www.indeed.com/rc/clk?jk=66da143a6d04e...,2024-06-23,Alabama
9,"Data Scientist — ""Cancer Biology"" for Academic...",Tuskegee University,"Tuskegee, AL 36088",,Apply advanced statistical techniques and mach...,https://www.indeed.com/rc/clk?jk=04bb0ca0f0d27...,2024-06-23,Alabama


- Format the datafame in order to scroll all columns and rows

In [57]:
pd.set_option('display.max_rows', None) 
pd.set_option('display.max_columns', None) 

- Glance at the dataset, look for Null and duplicate values

In [58]:
df.shape  # the data has the total of 29184 rows and 8 columns

(29184, 8)

In [59]:
df.info() # 5 columns have missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        29184 non-null  object
 1   Company      29181 non-null  object
 2   Location     29182 non-null  object
 3   Salary       16283 non-null  object
 4   Description  22755 non-null  object
 5   Job URL      29181 non-null  object
 6   Date         29184 non-null  object
 7   State        29184 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


In [60]:
df.dtypes # Salary type should by float, date need to be converted to datetime data type.

Title          object
Company        object
Location       object
Salary         object
Description    object
Job URL        object
Date           object
State          object
dtype: object

In [61]:
df.describe()

Unnamed: 0,Title,Company,Location,Salary,Description,Job URL,Date,State
count,29184,29181,29182,16283,22755,29181,29184,29184
unique,8173,4085,3817,3527,12956,29181,36,52
top,Data Scientist,Amazon.com Services LLC,Remote,Full-time,"Note: Apple benefit, compensation and employee...",https://www.indeed.com/rc/clk?jk=b31b63cb0d9fd...,2024-06-23,California
freq,1407,948,1340,4560,139,1,4745,5400


## Data Wrangling

- Drop unused column `Job URL`

In [62]:
df = df.drop(columns ='Job URL') # drop un-used column

- Drop duplicate rows if any

In [63]:
df.drop_duplicates().shape # no duplicate values found in the dataset.

(29184, 7)

- Replace blank values to `NaN` if any

In [64]:
df.replace(' ', np.nan, inplace=True)

- Print the number of `missing values`

In [65]:
print('The Company column is missing:',df['Company'].isnull().sum() ,'values')
print('The Location column is missing:',df['Location'].isnull().sum(), 'values')
print('The Salary column is missing:', df['Salary'].isnull().sum(), 'values')
print('The Description column is missing:', df['Description'].isnull().sum(), 'values')

The Company column is missing: 3 values
The Location column is missing: 2 values
The Salary column is missing: 12901 values
The Description column is missing: 6429 values


## Tasks:

1. Identify missing values where data exists in other columns (e.g., salary info in the Description column but missing in Salary).
2. Create new columns for analysis: Working Type and Level.
3. Extract City and State from the Location column.
4. Extract work mode from the Location column and fill the values in Work_Type column.
5. Clean Salary column by removing strings, split it into Min and Max salary, and calculate the average salary.
6. Fill missing Salary data using information from the Description column.
7. Convert Salary to float and Posting Date to datetime format.

### Method: Working with each column for each related task

__Company column__

- Define function to capitalize the first letter of each word

In [66]:
def custom_capitalize(text):
    return ' '.join(word.capitalize() for word in text.split())

df['Company'] = df['Company'].astype(str)
df['Company'] = df['Company'].apply(custom_capitalize)

__Title column__

- Apply the above capitalize function to the `Title` column

In [67]:
df['Title'] = df['Title'].apply(custom_capitalize)

- The `Title` column that state working mode, look for these columns and extract to the new colum `Work_Type`

In [68]:
df['Work_Type']=df['Title'].str.extract(r'(Remote|remote|Hybrid|hybrid)', expand=False)

- Extract the `Job Level` from `Title` column

In [69]:
df['Level'] = df['Title'].str.extract(r'(?i)(Associate|Graduate|Senior|Junior|Entry-level|Manager|Lead)', expand=False)

- If the values are `Null`, then Level is supposed to be `Mid-level`

In [70]:
df['Level']=df['Level'].fillna('Mid-level').str.capitalize()

__Location column__

- Use split method to extract the `City, Postcode` from Location column to new columns

In [71]:
df[['City','Post_Code']] = df['Location'].str.split(',', n=1, expand=True)

- Use regular expression to extract the `Postcode` from the string (interger data types)

In [72]:
df['Post_Code']=df['Post_Code'].str.extract(r'(\d+)', expand=True)

- Extract the Work type from `Location` column

In [73]:
df['Work_Type1'] = df['Location'][df['Location'] == 'Remote|remote|Hybrid|hybrid']

- Fill the missing value in `Work_Type` column by the new Work_Type1 that extract from `Location`

In [74]:
df['Work_Type'] = df['Work_Type1'].fillna(df['Work_Type'])

__Salary column__

- Define function using loop to extract digit from text

In [75]:
# Function to extract digit from text
def extract_number(text):
    if pd.isna(text):
        return np.nan

    numbers = []
    for char in text:
        if char.isdigit() or char in ',.-':
            numbers.append(char)
    return ''.join(numbers).strip()

- Apply the function to `Salary` column

In [76]:
df['Salary']=df['Salary'].apply(extract_number)

- For the values is not the number, return to `Null` value 

In [77]:
df['Salary'] = df['Salary'].str.strip('-')

df.replace('', np.nan, inplace=True)

- Looking for the missing salary information in description column by using regular expression to extract `$` sign

In [78]:
df['Salary1'] = df['Description'].str.extract(r'(\$\d{1,3}(?:,\d{3})*(?:\.\d{2})?)')

- Fill the missing value in `Salary` column with the new `Salary1` column that extract from the decription

In [79]:
df['Salary']= df['Salary'].fillna(df['Salary1'])

- Salary within a range, seperate into to column `Min salary` and `Max salary`

In [80]:
df[['Min_salary','Max_salary']]=df['Salary'].str.split('-', n=1, expand=True)

- For salary that is not a range, Min salary = Max salary

In [81]:
df['Max_salary']=df['Max_salary'].fillna(df['Min_salary'])

- Convert `Salary` data type to float

In [82]:
# Strip any blank space and replace the '-' & ',' from salary colum

df['Min_salary'] = df['Min_salary'].str.replace(r'[$,]', '', regex=True)
df['Max_salary'] = df['Max_salary'].str.replace(r'[$,]', '', regex=True)

In [83]:
# Convert to float

df['Min_salary'] = df['Min_salary'].astype(float)
df['Max_salary'] = df['Max_salary'].astype(float)

In [84]:
# Format the number with commas as thousands separators and no decimal places

pd.options.display.float_format = '{:,.0f}'.format

- Use aggregate function to generate the `Average salary` column

In [85]:
df['Average_salary'] = (df['Min_salary']+ df['Max_salary'])/2

- Filter the salary offer by year, suppose the annual salary is over or equal 30,000 dollars

In [86]:
df = df[df['Average_salary'] >= 30000]
df.head()

Unnamed: 0,Title,Company,Location,Salary,Description,Date,State,Work_Type,Level,City,Post_Code,Work_Type1,Salary1,Min_salary,Max_salary,Average_salary
1,Senior Data Analyst,Pci Government Services,"Hybrid work in Huntsville, AL 35808",85000,Must have strong technical skills in areas suc...,2024-06-23,Alabama,,Senior,Hybrid work in Huntsville,35808.0,,,85000,85000,85000
3,"Data Scientist, Mid",Booz Allen,"Huntsville, AL","75,600-172,000","As a data scientist, you’re excited at the pro...",2024-06-23,Alabama,,Mid-level,Huntsville,,,,75600,172000,123800
15,"Data Scientist, Senior",Booz Allen,"Huntsville, AL","96,600-220,000",Your deep data science and complimentary techn...,2024-06-23,Alabama,,Senior,Huntsville,,,,96600,220000,158300
29,Ai/ml Software Engineer,Leidos,"Huntsville, AL 35806","81,250-146,875",You will also be responsible for working with ...,2024-06-23,Alabama,,Mid-level,Huntsville,35806.0,,,81250,146875,114062
30,"Product Lead, Ai",Recruiting From Scratch,"Huntsville, AL","140,000-180,000",Facilitate rapid experimentation and data-driv...,2024-06-23,Alabama,,Lead,Huntsville,,,,140000,180000,160000


__State column__

- If the `State` column stated the value 'Remote' as the state, then the city and working type is also Remote

In [87]:
df['City1'] = df['State'][df['State'] == 'Remote']

- Update column city if state is `Remote`

In [88]:
df['City'] = df['City1'].fillna(df['City'])

__City column__

- If `City` is remote then work type is remote

In [89]:
df['Work_Type1'] =df['City'][df['City'] == 'Remote']

- Fill the Remote values to main `Work_Type` column

In [90]:
df['Work_Type'] = df['Work_Type1'].fillna(df['Work_Type'])

- Extract the `City` after the word 'in'

In [91]:
df['City'] = df['City'].apply(lambda x: x.split(' in ', 1)[-1].strip() if 'in' in x else x)

- If `Work_Type` is null, then suppose the type is 'on-site'

In [92]:
df['Work_Type'] = df['Work_Type'].fillna('On-Site')

__Date column__

- Convert the data type to `Datetime`

In [93]:
df['Date'] = pd.to_datetime(df['Date'])

___Update and reorder the columns in the dataset, remove unused columns___

In [94]:
df = df[['Date','Company', 'State', 'City', 'Level', 'Work_Type', 'Average_salary']]

___Double check the missing values___

In [95]:
df.isna().sum()

Date              0
Company           0
State             0
City              0
Level             0
Work_Type         0
Average_salary    0
dtype: int64

In [96]:
df.shape

(9391, 7)

In [97]:
df.head(10)

Unnamed: 0,Date,Company,State,City,Level,Work_Type,Average_salary
1,2024-06-23,Pci Government Services,Alabama,Huntsville,Senior,On-Site,85000
3,2024-06-23,Booz Allen,Alabama,Huntsville,Mid-level,On-Site,123800
15,2024-06-23,Booz Allen,Alabama,Huntsville,Senior,On-Site,158300
29,2024-06-23,Leidos,Alabama,Huntsville,Mid-level,On-Site,114062
30,2024-06-23,Recruiting From Scratch,Alabama,Huntsville,Lead,On-Site,160000
33,2024-06-23,Recruiting From Scratch,Alabama,Huntsville,Senior,On-Site,165000
41,2024-06-23,Prosper,Arizona,Phoenix,Mid-level,On-Site,162500
42,2024-06-23,Us Dhs Headquarters,Arizona,Chandler,Lead,On-Site,162050
45,2024-06-23,Clarivate,Arizona,Chandler,Senior,On-Site,136000
48,2024-06-23,City National Bank,Arizona,Phoenix,Mid-level,On-Site,165625


___Download the dataset to CSV file without the index___

In [99]:
df.to_csv('completed_file.csv', index = False)