# Transformation of Data
----
For web-scraping data, clean up includes: 
1. removing duplicates 
2. adding country index numbers 
3. normalizing company locations to city and state

For University ranking data, clean up includes: 
1. removing all other country's university 
2. keeping latest data (removing all other year)
3. adding country index numbers

For mental health data, clean up includes: 
1. normalizing all data (Gender and blanks) 
2. adding country index
3. removing all other country's data

In [1]:
# Dependencies
import pandas as pd 
import os 
import numpy as np

## Job Market Data - CA, US, SG, AU

In [2]:
country = ["SG", "CA", "US", "AU"]
# Country Index = [1, 2, 3, 4]

In [3]:
index_num = 1 

for x in country: 
    #import loaded csv
    filepath = f"../Clean Data/{x}-JobMarket.csv"
    df = pd.read_csv(filepath, index_col=0)
    
    #Cleaning:
    
    #removing jobs that have more than one unique job title index 
    #lots of data with duplicates - will remove duplicates but will keep the last entry
    #i.e. if one job is labeled as both Machine Learning and Data Analyst, the Machine Learning label entry will be kept
    #ranking will be Machine Learning(index=4), Data Engineer(index=3), Data Scientist(index=2), and then Data Analyst(index=1) 
    #(ranking is based on how specific each name is)
    df = df.drop_duplicates(subset=['Job ID'], keep='last')
    
    #set job id as index (now unique)
    df = df.set_index("Job ID")
    
    #creating country index 
    df["Country"] = index_num
    index_num = index_num + 1
    
    #df to csv 
    df.to_csv(f"../Transformed Data/{x}-JobMarket-Transformed.csv")


In [4]:
#Combining all the data into one csv

filepathCA = "../Transformed Data/CA-JobMarket-Transformed.csv"
dfCA = pd.read_csv(filepathCA, index_col=0)

filepathAU = "../Transformed Data/AU-JobMarket-Transformed.csv"
dfAU = pd.read_csv(filepathAU, index_col=0)

filepathUS = "../Transformed Data/US-JobMarket-Transformed.csv"
dfUS = pd.read_csv(filepathUS, index_col=0)

filepathSG = "../Transformed Data/SG-JobMarket-Transformed.csv"
dfSG = pd.read_csv(filepathSG, index_col=0)

## Normalizing location into City and State 

In [5]:
# Spliting Canada company location into City and Province/State
dfCA["City"], dfCA["State"] = dfCA["Company Location"].str.split(", ", 1).str

# Spliting Australia company location into City and Province/State
dfAU["City"], dfAU["State"] = dfAU["Company Location"].str.rsplit(" ", 1).str

# Spliting USA company location into City, State and others (Like zip code)
dfUS["City"], dfUS["State"] = dfUS["Company Location"].str.split(", ", 1).str
dfUS["State"], dfUS["Zip Code"] = dfUS["State"].str.split(" ", 1).str

# Defining Singapore location info from company location and country 
dfSG["City"] = dfSG["Company Location"]
dfSG["State"] = "Singapore"


  dfCA["City"], dfCA["State"] = dfCA["Company Location"].str.split(", ", 1).str
  dfAU["City"], dfAU["State"] = dfAU["Company Location"].str.rsplit(" ", 1).str
  dfUS["City"], dfUS["State"] = dfUS["Company Location"].str.split(", ", 1).str
  dfUS["State"], dfUS["Zip Code"] = dfUS["State"].str.split(" ", 1).str


In [6]:
# Concatenate four df into one
df = dfCA.append(dfAU)
df = df.append(dfUS)
df = df.append(dfSG)

In [7]:
# Formatting column names 
df.columns = ['job_title_id','job_title','company_name','company_location','country_id','city','state','zip_code']

# Keep required columns only
df = df[['job_title_id','country_id','job_title','company_name', 'company_location','city','state']]

# Set df index name
df.index.names=["job_id"]

df

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,company_location,city,state
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
p_ecae2dcad8f17d8b,1,2,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",Regina,SK
pj_12dccdfbb8ef0da5,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",Vaudreuil-Dorion,QC
pj_7837ad55c28258ea,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",Calgary,AB
p_05719d87a0059bf7,1,2,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",Iqaluit,NU
p_bf4bd5f13d04a674,1,2,Specialist-Data Visualization,Canadian Red Cross,Canada,Canada,
...,...,...,...,...,...,...,...
p_c9a71595fbad1ebd,4,1,CRO - Digital Bank,Pure Hong Kong,Singapore,Singapore,Singapore
p_ae6a40daf0546a81,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,Jurong Island,Singapore
p_1693547ea84bdc32,4,1,Principal Data Scientist (NLP),Randstad,Singapore,Singapore,Singapore
p_81eb5501dcf512e5,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,Jurong Island,Singapore


In [8]:
# Export to csv
df.to_csv(f"../Transformed Data/AllJobMarket-Transformed.csv")

## Creation of Location Table for Location API import

In [18]:
# Preparing a dataframe with unique city, state information
location_df = df[["country_id", "city", "state"]]
location_df.drop_duplicates(inplace=True, ignore_index=True)

# Check the number of null values in State column
location_df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df.drop_duplicates(inplace=True, ignore_index=True)


country_id     0
city           0
state         38
dtype: int64

In [23]:
# Inversely select to get dataframe with no NaN rows. Reset index. Export to csv
location_summary = location_df[~location_df["state"].isnull()]
location_summary.reset_index(inplace=True, drop=True)
location_summary.to_csv("../Transformed Data/location-summary.csv", index=False)

## Mental Health Data Clean Up

In [None]:
#Pulling in Data 
filepath = f"../Clean Data/MentalHealthSurvey.csv"
df = pd.read_csv(filepath, index_col=0)
df

In [None]:
#Replace blanks with "N/A"
df = df.fillna("N/A")

df

In [None]:
# Getting only required four contry information
df = df.loc[(df["Country"] == "Canada") | (df["Country"] == "Australia") | (df["Country"] == "Singapore") | (df["Country"] == "United States")]

#checking that all four countries are still in df
df["Country"].unique()

In [None]:
country_list = ["Singapore", "Canada", "United States", "Australia"]
#Country Index = [1, 2, 3, 4]

for index_num, country in enumerate(country_list):   
    #replacing with country index number 
    df["Country"]= np.where((df.Country == country), (index_num + 1), df.Country)


In [None]:
#normalizing gender entry 
female = ["Female", "female", "F", "f", "Female ", "Femake", "Trans woman", "Cis Female", "Trans-female", "cis-female/femme", 
          "queer/she/they", "Trans-female" "Cis Female", "Woman", "woman", "Female (trans)", "Female (cis)", "femail"]
male = ["M", "Male", "male", "m", "Male-ish", "maile", "Cis Male", "Male (CIS)", "Make", "male leaning androgynous", "Male ", 
        "Man", "Mail", "msle", "cis male"]
other = ["Guy (-ish) ^_^", "p", "non-binary", "Nah", "Genderqueer", "Other"]

for gender in female:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Female", df.Gender)

for gender in male:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Male", df.Gender)

for gender in other:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Other", df.Gender)

df["Gender"].unique()

In [None]:
df.columns = ['timestamp', 'age', 'gender', 'country_id', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments']

df = df[['timestamp', 'age', 'gender', 'country_id', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical']]

# Change index name
df.index.names = ["sample_id"]

df

In [None]:
df.to_csv("../Transformed Data/MentalHealth-Transformed.csv")

## University Data 

In [None]:
# Importing in Data 
filepath = f"../Clean Data/UniversityData.csv"
df = pd.read_csv(filepath, index_col=0)
df.head()

In [None]:
#normalizing country with index numbers

country_list = ["Singapore", "Canada", "USA", "Australia"]
#Country Index = [1, 2, 3, 4]
index_num = 1

for country_name in country_list:   
    #replacing with country index number 
    df["country"]= np.where((df.country == country_name), index_num, df.country)
    index_num = index_num + 1

In [None]:
#keeping data on the four countries 

df = df.loc[(df["country"] == 1) | (df["country"] == 2) | (df["country"] == 3) | (df["country"] == 4)]

#checking that all four countries are still in df
df["country"].unique()

In [None]:
df.head()

In [None]:
# Keeping only the latest data (year 2014)
latest_df = df.loc[df["year"]==2014]
latest_df.columns

In [None]:
#formatting column names 

df.columns = ['world_rank', 'institution', 'country_id', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year']

df = df[['institution', 'world_rank', 'country_id', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year']]


df

In [None]:
df.to_csv("../Transformed Data/University-Transformed.csv")