# Clustering Based on Holland Code

In [1]:
import pandas as pd

## Loading data

In [56]:
# Forward slash is accepted on every OS; a backslash here would form the
# invalid escape sequence '\i' and only resolve as a separator on Windows.
df = pd.read_excel('data/interests.xlsx')

In [57]:
# Preview the first six rows
df.head(6)

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,Date,Domain Source
0,11-1011.00,Chief Executives,1.B.1.a,Realistic,OI,Occupational Interests,1.33,06/2008,Analyst
1,11-1011.00,Chief Executives,1.B.1.b,Investigative,OI,Occupational Interests,2.0,06/2008,Analyst
2,11-1011.00,Chief Executives,1.B.1.c,Artistic,OI,Occupational Interests,2.67,06/2008,Analyst
3,11-1011.00,Chief Executives,1.B.1.d,Social,OI,Occupational Interests,3.67,06/2008,Analyst
4,11-1011.00,Chief Executives,1.B.1.e,Enterprising,OI,Occupational Interests,7.0,06/2008,Analyst
5,11-1011.00,Chief Executives,1.B.1.f,Conventional,OI,Occupational Interests,5.33,06/2008,Analyst


## Data Cleaning
Here we use only the six RIASEC interest dimensions (Realistic, Investigative, Artistic, Social, Enterprising, Conventional).

In [58]:
# Give the oddly formatted 'O*NET-SOC Code' header the plain name 'Code'
df.rename({'O*NET-SOC Code': 'Code'}, axis='columns', inplace=True)

def rename_columns(columns) -> dict:
    """
    Build a rename mapping that normalises column labels.

    Args  : columns (iterable of str, e.g. a list or DataFrame.columns)
    Return: dict keyed by each original label, whose value is that label
            with spaces replaced by underscores and letters lower-cased
    """
    return {label: label.replace(' ', '_').lower() for label in columns}
    

# Apply the mapping from rename_columns so every header is lower-case
# with underscores in place of spaces
df.rename(rename_columns(df.columns), axis='columns', inplace=True)
 

In [59]:
# Remove the columns that are not needed for our model
df = df.drop(columns=['date', 'domain_source'])

In [60]:
# Display the frame after the column drop
df

Unnamed: 0,code,title,element_id,element_name,scale_id,scale_name,data_value
0,11-1011.00,Chief Executives,1.B.1.a,Realistic,OI,Occupational Interests,1.33
1,11-1011.00,Chief Executives,1.B.1.b,Investigative,OI,Occupational Interests,2.00
2,11-1011.00,Chief Executives,1.B.1.c,Artistic,OI,Occupational Interests,2.67
3,11-1011.00,Chief Executives,1.B.1.d,Social,OI,Occupational Interests,3.67
4,11-1011.00,Chief Executives,1.B.1.e,Enterprising,OI,Occupational Interests,7.00
...,...,...,...,...,...,...,...
8761,53-7121.00,"Tank Car, Truck, and Ship Loaders",1.B.1.e,Enterprising,OI,Occupational Interests,2.00
8762,53-7121.00,"Tank Car, Truck, and Ship Loaders",1.B.1.f,Conventional,OI,Occupational Interests,5.00
8763,53-7121.00,"Tank Car, Truck, and Ship Loaders",1.B.1.g,First Interest High-Point,IH,Occupational Interest High-Point,1.00
8764,53-7121.00,"Tank Car, Truck, and Ship Loaders",1.B.1.h,Second Interest High-Point,IH,Occupational Interest High-Point,6.00


In [68]:
# Keep only the rows whose scale_id is 'OI'
df = df.loc[df['scale_id'].eq('OI')]

In [69]:
# Preview the first ten rows that remain after filtering
df.head(10)

Unnamed: 0,code,title,element_id,element_name,scale_id,scale_name,data_value
0,11-1011.00,Chief Executives,1.B.1.a,Realistic,OI,Occupational Interests,1.33
1,11-1011.00,Chief Executives,1.B.1.b,Investigative,OI,Occupational Interests,2.0
2,11-1011.00,Chief Executives,1.B.1.c,Artistic,OI,Occupational Interests,2.67
3,11-1011.00,Chief Executives,1.B.1.d,Social,OI,Occupational Interests,3.67
4,11-1011.00,Chief Executives,1.B.1.e,Enterprising,OI,Occupational Interests,7.0
5,11-1011.00,Chief Executives,1.B.1.f,Conventional,OI,Occupational Interests,5.33
9,11-1011.03,Chief Sustainability Officers,1.B.1.a,Realistic,OI,Occupational Interests,1.0
10,11-1011.03,Chief Sustainability Officers,1.B.1.b,Investigative,OI,Occupational Interests,4.33
11,11-1011.03,Chief Sustainability Officers,1.B.1.c,Artistic,OI,Occupational Interests,2.67
12,11-1011.03,Chief Sustainability Officers,1.B.1.d,Social,OI,Occupational Interests,2.33
