# Transformation of Data
----
For web-scraping data, clean up includes: 
1. removing duplicates 
2. adding country index numbers 
3. normalizing company locations to city and state

For University ranking data, clean up includes: 
1. removing universities from all other countries
2. keeping only the latest data (removing all other years)
3. adding country index numbers

For mental health data, clean up includes: 
1. normalizing all data (Gender and blanks) 
2. adding country index
3. removing data from all other countries

In [1]:
# Dependencies
import pandas as pd 
import os 
import numpy as np

## Job Market Data - CA, US, SG, AU

In [2]:
# Two-letter country codes used to build the per-country job-market file names.
# The order matters: the cleaning loop below assigns each country the index (position + 1).
country = ["SG", "CA", "US", "AU"]
# Resulting country index: SG=1, CA=2, US=3, AU=4

In [3]:
# Clean each country's scraped job-market file and write a transformed copy.
for country_index, country_code in enumerate(country, start=1):
    # Load the CSV for this country (first CSV column becomes the index)
    filepath = f"../Clean Data/{country_code}-JobMarket.csv"
    listings = pd.read_csv(filepath, index_col=0)

    # The same posting can appear under several job-title indexes.
    # Only the LAST row per Job ID is kept, so which label survives depends on row order
    # in the file. Intended precedence (most specific title wins):
    #   Machine Learning (4) > Data Engineer (3) > Data Scientist (2) > Data Analyst (1)
    # NOTE(review): this presumes the file is ordered by ascending job title index - confirm.
    df = (
        listings
        .drop_duplicates(subset=["Job ID"], keep="last")
        # Job ID is unique after de-duplication, so it can serve as the index
        .set_index("Job ID")
        # Numeric country index: 1-based position in the `country` list
        .assign(Country=country_index)
    )

    # Persist the transformed frame for this country
    df.to_csv(f"../Transformed Data/{country_code}-JobMarket-Transformed.csv")


In [4]:
# Reload the four per-country transformed files so they can be combined below.
filepathCA = "../Transformed Data/CA-JobMarket-Transformed.csv"
filepathAU = "../Transformed Data/AU-JobMarket-Transformed.csv"
filepathUS = "../Transformed Data/US-JobMarket-Transformed.csv"
filepathSG = "../Transformed Data/SG-JobMarket-Transformed.csv"

# The first CSV column (Job ID, written as the index by the previous step) is used as the index.
dfCA, dfAU, dfUS, dfSG = (
    pd.read_csv(path, index_col=0)
    for path in (filepathCA, filepathAU, filepathUS, filepathSG)
)

## Normalizing location into City and State 

In [5]:
# Normalize "Company Location" into City / State columns for each country.
#
# The pieces are taken with `.str[i]` rather than by tuple-unpacking the `.str` accessor:
# iterating a `.str` accessor and passing `n` positionally are both unsupported in pandas 2.x.
# `.str[1]` yields NaN when a location has no separator (e.g. just "Canada"), so State is
# left missing in that case and filled in a later step.

# Canada: split on the first ", " into City and Province/State
ca_parts = dfCA["Company Location"].str.split(", ", n=1)
dfCA["City"] = ca_parts.str[0]
dfCA["State"] = ca_parts.str[1]

# Australia: split on the last space into City and State
au_parts = dfAU["Company Location"].str.rsplit(" ", n=1)
dfAU["City"] = au_parts.str[0]
dfAU["State"] = au_parts.str[1]

# USA: split on the first ", " into City and the remainder, then split the remainder
# on its first space into State and whatever follows (such as a zip code)
us_parts = dfUS["Company Location"].str.split(", ", n=1)
dfUS["City"] = us_parts.str[0]
us_state_parts = us_parts.str[1].str.split(" ", n=1)
dfUS["State"] = us_state_parts.str[0]
dfUS["Zip Code"] = us_state_parts.str[1]

# Singapore: the scraped location is used as the city; the state is always "Singapore"
dfSG["City"] = dfSG["Company Location"]
dfSG["State"] = "Singapore"


In [6]:
# Stack the four per-country frames into one frame.
# ignore_index=True discards each frame's existing index (the first CSV column read above)
# and renumbers the rows 0..n-1.
# Columns present in only some frames (e.g. "Zip Code", created only for dfUS) are NaN elsewhere.
df_all = pd.concat([dfCA, dfAU, dfUS, dfSG], ignore_index=True)
df_all

Unnamed: 0,Job Title Index,Job Title,Company Name,Company Location,Country,City,State,Zip Code
0,1,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",2,Regina,SK,
1,1,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",2,Vaudreuil-Dorion,QC,
2,1,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",2,Calgary,AB,
3,1,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",2,Iqaluit,NU,
4,1,Specialist-Data Visualization,Canadian Red Cross,Canada,2,Canada,,
...,...,...,...,...,...,...,...,...
9866,4,CRO - Digital Bank,Pure Hong Kong,Singapore,1,Singapore,Singapore,
9867,4,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,1,Jurong Island,Singapore,
9868,4,Principal Data Scientist (NLP),Randstad,Singapore,1,Singapore,Singapore,
9869,4,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,1,Jurong Island,Singapore,


In [7]:
# Rename columns to snake_case BY NAME (not by position), so the cell does not depend on
# column order and can be re-run safely: renaming already-renamed columns is a no-op.
column_names = {
    "Job Title Index": "job_title_id",
    "Job Title": "job_title",
    "Company Name": "company_name",
    "Company Location": "company_location",
    "Country": "country_id",
    "City": "city",
    "State": "state",
    "Zip Code": "zip_code",
}
df_all = df_all.rename(columns=column_names)

# Keep required columns only (zip_code is dropped here).
# .copy() makes df_all an independent frame, so the column assignments in later cells
# do not write into a slice of another frame.
df_all = df_all[['job_title_id','country_id','job_title','company_name', 'company_location','city','state']].copy()

# Name the 0..n-1 row index so it is exported as the job_id column
df_all.index.names = ["job_id"]

df_all

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,company_location,city,state
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",Regina,SK
1,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",Vaudreuil-Dorion,QC
2,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",Calgary,AB
3,1,2,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",Iqaluit,NU
4,1,2,Specialist-Data Visualization,Canadian Red Cross,Canada,Canada,
...,...,...,...,...,...,...,...
9866,4,1,CRO - Digital Bank,Pure Hong Kong,Singapore,Singapore,Singapore
9867,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,Jurong Island,Singapore
9868,4,1,Principal Data Scientist (NLP),Randstad,Singapore,Singapore,Singapore
9869,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,Jurong Island,Singapore


In [8]:
# For all remote job postings (city text contains "remote", case-insensitive), set state to "Remote".
# na=False treats a missing city as "not remote" rather than producing an NA in the boolean mask.
is_remote = df_all["city"].str.contains("remote", case=False, na=False)
df_all.loc[is_remote, "state"] = "Remote"

# Fill the remaining missing states with the row's own city (locations such as "Ontario",
# "California" or "Canada" that had no separator to split on).
# The result is assigned back instead of using fillna(inplace=True) on the selected column:
# an in-place call on `df_all["state"]` is a chained assignment and is not guaranteed to
# update df_all.
df_all["state"] = df_all["state"].fillna(df_all["city"])
df_all

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,company_location,city,state
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1,2,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",Regina,SK
1,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",Vaudreuil-Dorion,QC
2,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",Calgary,AB
3,1,2,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",Iqaluit,NU
4,1,2,Specialist-Data Visualization,Canadian Red Cross,Canada,Canada,Canada
...,...,...,...,...,...,...,...
9866,4,1,CRO - Digital Bank,Pure Hong Kong,Singapore,Singapore,Singapore
9867,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,Jurong Island,Singapore
9868,4,1,Principal Data Scientist (NLP),Randstad,Singapore,Singapore,Singapore
9869,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,Jurong Island,Singapore


In [9]:
# Sanity check: count missing values per column of the combined job-market frame.
# The output shown for this cell is 0 for every column.
df_all.isnull().sum()

job_title_id        0
country_id          0
job_title           0
company_name        0
company_location    0
city                0
state               0
dtype: int64

In [10]:
# Give every distinct (city, state) pair its own integer id.
# ngroup() numbers the groups starting at 0; the ids are shifted to start at 1 in the next cell.
# Note that country_id is not part of the grouping key.
df_all['location_id'] = df_all.groupby(['city', 'state']).ngroup()
df_all

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,company_location,city,state,location_id
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,2,Data & Systems Analyst,Protein Industries Canada,"Regina, SK",Regina,SK,800
1,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,"Vaudreuil-Dorion, QC",Vaudreuil-Dorion,QC,1017
2,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,"Calgary, AB",Calgary,AB,142
3,1,2,Data and Reporting Analyst,Nunavut Government,"Iqaluit, NU",Iqaluit,NU,436
4,1,2,Specialist-Data Visualization,Canadian Red Cross,Canada,Canada,Canada,154
...,...,...,...,...,...,...,...,...
9866,4,1,CRO - Digital Bank,Pure Hong Kong,Singapore,Singapore,Singapore,894
9867,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,Jurong Island,Jurong Island,Singapore,457
9868,4,1,Principal Data Scientist (NLP),Randstad,Singapore,Singapore,Singapore,894
9869,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,Jurong Island,Jurong Island,Singapore,457


In [11]:
# Make location_id 1-based.
# The id is recomputed from the (city, state) groups instead of incrementing the existing
# column, so re-running this cell cannot shift the ids a second time.
df_all["location_id"] = df_all.groupby(["city", "state"]).ngroup() + 1

# Columns exported for the job-market table; city/state are represented by location_id
df_all_export = df_all[["job_title_id", "country_id", "job_title", "company_name", "location_id"]]
df_all_export

Unnamed: 0_level_0,job_title_id,country_id,job_title,company_name,location_id
job_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,Data & Systems Analyst,Protein Industries Canada,801
1,1,2,Junior Data Analyst - LOCAL | MTL,BDP CALL CENTER,1018
2,1,2,Pipeline Inline-Inspection Data Analyst (ILI L...,Onstream Pipeline Inspection Services Inc.,143
3,1,2,Data and Reporting Analyst,Nunavut Government,437
4,1,2,Specialist-Data Visualization,Canadian Red Cross,155
...,...,...,...,...,...
9866,4,1,CRO - Digital Bank,Pure Hong Kong,895
9867,4,1,Senior Data Scientist - Image Processing & Com...,BIOFOURMIS SINGAPORE PTE. LTD.,458
9868,4,1,Principal Data Scientist (NLP),Randstad,895
9869,4,1,Materials Specialist #SGUnitedTraineeships #SGUP,BECTON DICKINSON MEDICAL (S) PTE LTD,458


In [12]:
# Write the combined job-market table to CSV (the job_id index is written as the first column)
df_all_export.to_csv(f"../Transformed Data/AllJobMarket-Transformed.csv")

## Creation of Location Table for Location API import

In [13]:
# Build the location lookup table: one row per distinct (location_id, country_id, city, state).
# drop_duplicates is chained (not inplace) so it returns a new, independent frame rather than
# modifying a column subset of df_all in place; ignore_index=True renumbers the rows 0..n-1.
# NOTE(review): location_id is derived from (city, state) only, so a city/state pair that
# occurs under two country_id values would appear twice here with the same location_id -
# confirm whether location_id is expected to be unique in this table.
location_df = (
    df_all[["location_id", "country_id", "city", "state"]]
    .drop_duplicates(ignore_index=True)
)

# Check the number of null values in each column
location_df.isnull().sum()

location_id    0
country_id     0
city           0
state          0
dtype: int64

In [14]:
# Renumber the rows from 0, then write the location table without the index column
location_df = location_df.reset_index(drop=True)
location_df.to_csv("../Transformed Data/location-summary.csv", index=False)

## Mental Health Data Clean Up

In [15]:
# Load the mental health survey; the first CSV column is used as the row index.
# Note: this rebinds `df`, which the job-market cleaning loop above also used.
filepath = f"../Clean Data/MentalHealthSurvey.csv"
df = pd.read_csv(filepath, index_col=0)
df

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [16]:
# Replace every missing value in the survey with the literal string "N/A".
# NOTE(review): "N/A" is one of the strings pandas.read_csv treats as missing by default,
# so a consumer that reloads the exported CSV with default settings will see NaN again -
# confirm this is acceptable for downstream use.
df = df.fillna("N/A")

df

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,2015-09-12 11:17:21,26,male,United Kingdom,,No,No,Yes,,26-100,...,Somewhat easy,No,No,Some of them,Some of them,No,No,Don't know,No,
1255,2015-09-26 01:07:35,32,Male,United States,IL,No,Yes,Yes,Often,26-100,...,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes,No,
1256,2015-11-07 12:36:58,34,male,United States,CA,No,Yes,Yes,Sometimes,More than 1000,...,Somewhat difficult,Yes,Yes,No,No,No,No,No,No,
1257,2015-11-30 21:25:06,46,f,United States,NC,No,No,No,,100-500,...,Don't know,Yes,No,No,No,No,No,No,No,


In [17]:
# Keep only respondents from the four countries of interest.
# .copy() gives an independent frame, so the column updates in the following cells
# modify `df` itself rather than a slice of the full survey.
required_countries = ["Canada", "Australia", "Singapore", "United States"]
df = df.loc[df["Country"].isin(required_countries)].copy()

# Check that all four countries are still present in df
df["Country"].unique()

array(['United States', 'Canada', 'Australia', 'Singapore'], dtype=object)

In [18]:
# Country name (as spelled in the survey) -> numeric country index
country_dict = {"Singapore": 1, "Canada": 2, "United States": 3, "Australia": 4}

# Overwrite each country name in the Country column with its numeric index
for country_key in country_dict:
    matches = df["Country"].eq(country_key)
    df.loc[matches, "Country"] = country_dict[country_key]

In [19]:
# Normalize the free-text Gender answers into three labels: "Female", "Male", "Other".
# Each list holds the raw spellings (including typos and trailing spaces) seen for that label.
# Every pair of entries is separated by a comma: adjacent string literals without one are
# silently concatenated by Python into a single, never-matching entry.
female = ["Female", "female", "F", "f", "Female ", "Femake", "Trans woman", "Cis Female", "Trans-female", "cis-female/femme", "queer/she/they", "Woman", "woman", "Female (trans)", "Female (cis)", "femail"]
male = ["M", "Male", "male", "m", "Male-ish", "maile", "Cis Male", "Male (CIS)", "Make", "male leaning androgynous", "Male ", "Man", "Mail", "msle", "cis male"]
other = ["Guy (-ish) ^_^", "p", "non-binary", "Nah", "Genderqueer", "Other"]

# Build one raw-spelling -> label lookup from the three lists
gender_lookup = {}
for label, variants in (("Female", female), ("Male", male), ("Other", other)):
    for variant in variants:
        gender_lookup[variant] = label

# Replace recognized spellings in a single pass; values not in any list are left unchanged
df["Gender"] = df["Gender"].map(gender_lookup).fillna(df["Gender"])

# Check which labels remain after normalization
df["Gender"].unique()

array(['Female', 'Male', 'Other'], dtype=object)

In [20]:
# New names for all 27 survey columns, in their existing order
# (positional rename: the 4th column becomes country_id, the rest are lower-cased names).
survey_columns = [
    'timestamp', 'age', 'gender', 'country_id', 'state', 'self_employed',
    'family_history', 'treatment', 'work_interfere', 'no_employees',
    'remote_work', 'tech_company', 'benefits', 'care_options',
    'wellness_program', 'seek_help', 'anonymity', 'leave',
    'mental_health_consequence', 'phys_health_consequence', 'coworkers',
    'supervisor', 'mental_health_interview', 'phys_health_interview',
    'mental_vs_physical', 'obs_consequence', 'comments',
]
df.columns = survey_columns

# Keep every column except the last two (obs_consequence and comments)
excluded_columns = ('obs_consequence', 'comments')
df = df[[name for name in survey_columns if name not in excluded_columns]]

# Name the row index so it is exported as sample_id
df.index.names = ["sample_id"]

df

Unnamed: 0_level_0,timestamp,age,gender,country_id,state,self_employed,family_history,treatment,work_interfere,no_employees,...,seek_help,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2014-08-27 11:29:31,37,Female,3,IL,,No,Yes,Often,6-25,...,Yes,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes
1,2014-08-27 11:29:37,44,Male,3,IN,,No,No,Rarely,More than 1000,...,Don't know,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know
2,2014-08-27 11:29:44,32,Male,2,,,No,No,Rarely,6-25,...,No,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No
4,2014-08-27 11:30:22,31,Male,3,TX,,No,No,Never,100-500,...,Don't know,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know
5,2014-08-27 11:31:22,33,Male,3,TN,,Yes,No,Sometimes,6-25,...,Don't know,Don't know,Don't know,No,No,Yes,Yes,No,Maybe,Don't know
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,2015-08-25 19:59:38,36,Male,3,UT,No,Yes,No,Rarely,More than 1000,...,Yes,Don't know,Somewhat easy,Maybe,Maybe,Some of them,Some of them,No,No,Don't know
1255,2015-09-26 01:07:35,32,Male,3,IL,No,Yes,Yes,Often,26-100,...,No,Yes,Somewhat difficult,No,No,Some of them,Yes,No,No,Yes
1256,2015-11-07 12:36:58,34,Male,3,CA,No,Yes,Yes,Sometimes,More than 1000,...,No,Don't know,Somewhat difficult,Yes,Yes,No,No,No,No,No
1257,2015-11-30 21:25:06,46,Female,3,NC,No,No,No,,100-500,...,No,Don't know,Don't know,Yes,No,No,No,No,No,No


In [21]:
# Write the transformed survey to CSV (the sample_id index is written as the first column)
df.to_csv("../Transformed Data/MentalHealth-Transformed.csv")

## University Data 

In [22]:
# Load the university ranking data; the first CSV column is used as the row index.
filepath = f"../Clean Data/UniversityData.csv"
df_uni = pd.read_csv(filepath, index_col=0)
# Preview the first rows only
df_uni.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,USA,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,USA,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,USA,3,17,11,5,4,2,2,,15,89.5,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,USA,4,2,29,7,37,22,22,,18,85.21,2012


In [23]:
# Country name (as spelled in the ranking data, e.g. "USA") -> numeric country index.
# This rebinds country_dict, replacing the survey mapping defined earlier.
country_dict = {"Singapore": 1, "Canada": 2, "USA": 3, "Australia": 4}

# Overwrite the country name with its index for the four countries of interest;
# all other countries keep their name and are filtered out in the next cell.
for country_key, country_index in country_dict.items():
    is_country = df_uni["country"] == country_key
    df_uni.loc[is_country, "country"] = country_index

df_uni

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,3,1,7,9,1,1,1,1,,5,100.00,2012
1,2,Massachusetts Institute of Technology,3,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,3,3,17,11,5,4,2,2,,15,89.50,2012
3,4,University of Cambridge,United Kingdom,1,10,24,4,16,16,11,,50,86.17,2012
4,5,California Institute of Technology,3,4,2,29,7,37,22,22,,18,85.21,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,996,University of the Algarve,Portugal,7,367,567,218,926,845,812,969.0,816,44.03,2015
2196,997,Alexandria University,Egypt,4,236,566,218,997,908,645,981.0,871,44.03,2015
2197,998,Federal University of Ceará,Brazil,18,367,549,218,830,823,812,975.0,824,44.03,2015
2198,999,University of A Coruña,Spain,40,367,567,218,886,974,812,975.0,651,44.02,2015


In [24]:
# Keep only rows whose country was replaced by one of the four numeric indexes.
# The filter and the index reset are chained so that df_uni_new is a new frame with a
# 0..n-1 index, instead of resetting the index in place on a filtered slice of df_uni.
df_uni_new = (
    df_uni[df_uni["country"].isin(list(country_dict.values()))]
    .reset_index(drop=True)
)

# Check that all four countries are still present in df
df_uni_new["country"].unique()

array([3, 2, 4, 1], dtype=object)

In [25]:
# Preview the filtered ranking data (four countries only, index renumbered from 0)
df_uni_new.head()

Unnamed: 0,world_rank,institution,country,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,1,Harvard University,3,1,7,9,1,1,1,1,,5,100.0,2012
1,2,Massachusetts Institute of Technology,3,2,9,17,3,12,4,4,,1,91.67,2012
2,3,Stanford University,3,3,17,11,5,4,2,2,,15,89.5,2012
3,5,California Institute of Technology,3,4,2,29,7,37,22,22,,18,85.21,2012
4,6,Princeton University,3,5,8,14,2,53,33,26,,101,82.5,2012


In [26]:
# Keep only the rows for the most recent ranking year.
# The year is taken from the data rather than hard-coded: the frame contains rows for 2015,
# so a fixed 2014 would not select the latest rankings.
latest_year = df_uni_new["year"].max()
latest_df = df_uni_new.loc[df_uni_new["year"] == latest_year]
# NOTE(review): latest_df is not used by the later cells shown in this notebook; the final
# export writes df_uni_new, which still contains every year. Confirm which frame should be
# exported.
latest_df.columns

Index(['world_rank', 'institution', 'country', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year'],
      dtype='object')

In [27]:
# Rename `country` to `country_id` BY NAME. Assigning a full positional list of names would
# mislabel columns if this cell were re-run after the reorder below (institution and
# world_rank would swap labels); renaming by name is a no-op on re-run.
df_uni_new = df_uni_new.rename(columns={"country": "country_id"})

# Reorder so that institution is the first column; all columns are kept
df_uni_new = df_uni_new[['institution', 'world_rank', 'country_id', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year']]


df_uni_new

Unnamed: 0,institution,world_rank,country_id,national_rank,quality_of_education,alumni_employment,quality_of_faculty,publications,influence,citations,broad_impact,patents,score,year
0,Harvard University,1,3,1,7,9,1,1,1,1,,5,100.00,2012
1,Massachusetts Institute of Technology,2,3,2,9,17,3,12,4,4,,1,91.67,2012
2,Stanford University,3,3,3,17,11,5,4,2,2,,15,89.50,2012
3,California Institute of Technology,5,3,4,2,29,7,37,22,22,,18,85.21,2012
4,Princeton University,6,3,5,8,14,2,53,33,26,,101,82.50,2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
703,University of Lethbridge,917,2,32,367,567,218,957,718,511,867.0,853,44.12,2015
704,University of Texas at El Paso,929,3,228,367,442,218,910,838,812,906.0,706,44.10,2015
705,"University of California, Merced",936,3,229,367,567,218,899,540,812,906.0,491,44.09,2015
706,Charles Darwin University,942,4,27,367,567,218,931,742,645,896.0,871,44.09,2015


In [28]:
# Write the transformed ranking data to CSV (the 0..n-1 row index is written as the first column).
# This exports df_uni_new, which contains every ranking year for the four countries.
df_uni_new.to_csv("../Transformed Data/University-Transformed.csv")