# Transformation of Data 

In [1]:
import pandas as pd 
import os 
import numpy as np

## Job Market Data - CA, US, SG

In [21]:
country = ["SG", "CA", "US", "AU"]
#Country Index = [1, 2, 3, 4]

In [22]:
index_num = 1 

for x in country: 
    #import loaded csv
    filepath = f"../Clean Data/{x}-JobMarket.csv"
    df = pd.read_csv(filepath, index_col=0)
    
    #Cleaning:
    
    #removing jobs that have more than one unique job title index 
    #lots of data with duplicates - will remove duplicates but will keep the last entry
    #i.e. if one job is labeled as both Machine Learning and Data Analyst, the Machine Learning label entry will be kept
    #ranking will be Machine Learning(index=4), Data Engineer(index=3), Data Scientist(index=2), and then Data Analyst(index=1) 
    #(ranking is based on how specific each name is)
    df = df.drop_duplicates(subset=['Job ID'], keep='last')
    
    #set job id as index (now unique)
    df = df.set_index("Job ID")
    
    #creating country index 
    df["Country"] = index_num
    index_num = index_num + 1
    
    #df to csv 
    df.to_csv(f"../Transformed Data/{x}-JobMarket-Transformed.csv")


In [68]:
#Combining all the data into one csv

filepathCA = "../Transformed Data/CA-JobMarket-Transformed.csv"
dfCA = pd.read_csv(filepathCA, index_col=0)

filepathAU = "../Transformed Data/AU-JobMarket-Transformed.csv"
dfAU = pd.read_csv(filepathAU, index_col=0)

filepathUS = "../Transformed Data/US-JobMarket-Transformed.csv"
dfUS = pd.read_csv(filepathUS, index_col=0)

filepathSG = "../Transformed Data/SG-JobMarket-Transformed.csv"
dfSG = pd.read_csv(filepathSG, index_col=0)

## Location Clean up 

In [69]:
dfCA["City"], dfCA["State"] = dfCA["Company Location"].str.split(", ", 1).str


  dfCA["City"], dfCA["State"] = dfCA["Company Location"].str.split(", ", 1).str


In [70]:
dfAU["City"], dfAU["State"] = dfAU["Company Location"].str.rsplit(" ", 1).str


  dfAU["City"], dfAU["State"] = dfAU["Company Location"].str.rsplit(" ", 1).str


In [71]:
# Splite location information into City, State and others (Like zip code)
dfUS["City"], dfUS["State"] = dfUS["Company Location"].str.split(", ", 1).str
dfUS["State"], dfUS["Zip Code"] = dfUS["State"].str.split(" ", 1).str


  dfUS["City"], dfUS["State"] = dfUS["Company Location"].str.split(", ", 1).str
  dfUS["State"], dfUS["Zip Code"] = dfUS["State"].str.split(" ", 1).str


In [72]:
dfSG["City"] = dfSG["Company Location"]
dfSG["State"] = "Singapore"


In [73]:
df = dfCA.append(dfAU)
df = df.append(dfUS)
df = df.append(dfSG)

In [74]:
#formatting column names 

df.columns = ['job_title_id','job_title','company_name','company_location','country_id','city','state','zip_code']

df = df[['job_title_id','country_id','job_title','company_name', 'company_location','city','state']]

df.index.names=["job_id"]

df

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,company_location,city,state
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
p_ecae2dcad8f17d8b,1,2,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",Regina,SK
pj_12dccdfbb8ef0da5,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",Vaudreuil-Dorion,QC
pj_7837ad55c28258ea,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",Calgary,AB
p_05719d87a0059bf7,1,2,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",Iqaluit,NU
p_bf4bd5f13d04a674,1,2,Specialist-Data Visualization,Canadian Red Cross,Canada,Canada,
...,...,...,...,...,...,...,...
p_c9a71595fbad1ebd,4,1,CRO - Digital Bank,Pure Hong Kong,Singapore,Singapore,Singapore
p_ae6a40daf0546a81,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,Jurong Island,Singapore
p_1693547ea84bdc32,4,1,Principal Data Scientist (NLP),Randstad,Singapore,Singapore,Singapore
p_81eb5501dcf512e5,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,Jurong Island,Singapore


In [75]:
df.to_csv(f"../Transformed Data/AllJobMarket-Transformed.csv")

# Creation of Location Table 

In [77]:
# Preparing a dataframe with unique city, state information
location_df = df[["city", "state"]]
location_df.drop_duplicates(inplace=True, ignore_index=True)
location_df.isnull().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  location_df.drop_duplicates(inplace=True, ignore_index=True)


city      0
state    36
dtype: int64

In [80]:
# Inversely select to get dataframe with no NaN rows.
location_summary = location_df[~location_df["state"].isnull()]
location_summary.reset_index(inplace=True, drop=True)
location_summary.to_csv("../Transformed Data/location-summary.csv")

## Mental Health Data 

In [88]:
#Pulling in Data 
filepath = f"../Clean Data/MentalHealthSurvey.csv"
df = pd.read_csv(filepath, index_col=0)
df

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [89]:
#Replace blanks with "N/A"
df = df.fillna("N/A")

df

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [90]:
df = df.loc[(df["Country"] == "Canada") | (df["Country"] == "Australia") | (df["Country"] == "Singapore") | (df["Country"] == "United States")]

#checking that all four countries are still in df
df["Country"].unique()

array(['United States', 'Canada', 'Australia', 'Singapore'], dtype=object)

In [91]:
country_list = ["Singapore", "Canada", "United States", "Australia"]
#Country Index = [1, 2, 3, 4]
index_num = 1

for country in country_list:   
    #replacing with country index number 
    df["Country"]= np.where((df.Country == country), index_num, df.Country)
    index_num = index_num + 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Country"]= np.where((df.Country == country), index_num, df.Country)


In [92]:
#normalizing gender entry 

female = ["Female", "female", "F", "f", "Female ", "Femake", "Trans woman", "Cis Female", "Trans-female", "cis-female/femme", "queer/she/they", "Trans-female" "Cis Female", "Woman", "woman", "Female (trans)", "Female (cis)", "femail"]
male = ["M", "Male", "male", "m", "Male-ish", "maile", "Cis Male", "Male (CIS)", "Make", "male leaning androgynous", "Male ", "Man", "Mail", "msle", "cis male"]
other = ["Guy (-ish) ^_^", "p", "non-binary", "Nah", "Genderqueer", "Other"]


for gender in female:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Female", df.Gender)

for gender in male:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Male", df.Gender)

for gender in other:   
    #replacing with country index number 
    df["Gender"]= np.where((df.Gender == gender), "Other", df.Gender)

df["Gender"].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gender"]= np.where((df.Gender == gender), "Female", df.Gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gender"]= np.where((df.Gender == gender), "Male", df.Gender)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Gender"]= np.where((df.Gender == gender), "Other", df.Gender)


array(['Female', 'Male', 'Other'], dtype=object)

In [96]:
df.columns

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')

In [97]:
df.columns = ['timestamp', 'age', 'gender', 'country_id', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments']

df = df[['timestamp', 'age', 'gender', 'country_id', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical']]


df

Unnamed: 0,timestamp,age,gender,country_id,state,self_employed,family_history,treatment,work_interfere,no_employees,...,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical
0,2014-08-27 11:29:31,37,Female,3,IL,,No,Yes,Often,6-25,...,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes
1,2014-08-27 11:29:37,44,Male,3,IN,,No,No,Rarely,More than 1000,...,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know
2,2014-08-27 11:29:44,32,Male,2,,,No,No,Rarely,6-25,...,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No
4,2014-08-27 11:30:22,31,Male,3,TX,,No,No,Never,100-500,...,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know
5,2014-08-27 11:31:22,33,Male,3,TN,,Yes,No,Sometimes,6-25,...,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,2015-08-25 19:59:38,36,Male,3,UT,No,Yes,No,Rarely,More than 1000,...,Yes,Don't know,Somewhat easy,Maybe,Maybe,Some of them,Some of them,No,No,Don't know
1255,2015-09-26 01:07:35,32,Male,3,IL,No,Yes,Yes,Often,26-100,...,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes
1256,2015-11-07 12:36:58,34,Male,3,CA,No,Yes,Yes,Sometimes,More than 1000,...,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No
1257,2015-11-30 21:25:06,46,Female,3,NC,No,No,No,,100-500,...,No,Don't know,Don't know,Yes,No,No,No,No,No,No


In [98]:
df.to_csv("../Transformed Data/MentalHealth-Transformed.csv")

## University Data 

In [11]:
#Pulling in Data 
filepath = f"../Clean Data/UniversityData.csv"
df = pd.read_csv(filepath, index_col=0)
df.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [12]:
#normalizing country with index numbers

country_list = ["Singapore", "Canada", "USA", "Australia"]
#Country Index = [1, 2, 3, 4]
index_num = 1

for country_name in country_list:   
    #replacing with country index number 
    df["country"]= np.where((df.country == country_name), index_num, df.country)
    index_num = index_num + 1

In [13]:
#keeping data on the four countries 

df = df.loc[(df["country"] == 1) | (df["country"] == 2) | (df["country"] == 3) | (df["country"] == 4)]

#checking that all four countries are still in df
df["country"].unique()

array([3, 2, 4, 1], dtype=object)

In [16]:
df.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,3,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,3,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,3,3,17,11,5,4,2,2,,15,89.5,2012
4,5,California Institute of Technology,3,4,2,29,7,37,22,22,,18,85.21,2012
5,6,Princeton University,3,5,8,14,2,53,33,26,,101,82.5,2012


In [18]:
latest_df = df.loc[df["year"]==2014]
latest_df

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
200,1,Harvard University,3,1,1,1,1,1,1,1,1.0,2,100.00,2014
201,2,Stanford University,3,2,11,2,4,5,3,3,4.0,6,99.09,2014
202,3,Massachusetts Institute of Technology,3,3,3,11,2,15,2,2,2.0,1,98.69,2014
205,6,Columbia University,3,4,13,8,9,14,13,9,13.0,4,97.41,2014
206,7,"University of California, Berkeley",3,5,4,22,6,7,4,3,7.0,28,92.84,2014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112,913,University of Southern Mississippi,3,227,355,478,210,903,840,800,849.0,737,44.38,2014
1122,923,"University of California, Merced",3,228,355,478,210,957,576,800,923.0,338,44.37,2014
1138,939,University of New England (Australia),4,27,355,478,210,916,552,493,909.0,637,44.34,2014
1168,969,Western Michigan University,3,229,355,478,210,962,815,800,937.0,737,44.30,2014


In [13]:
df.to_csv("../Transformed Data/University-Transformed.csv")