# Clustering Based on Holland Code

In [43]:
import pandas as pd

## Loading data

In [44]:
# Forward-slash path: resolves on both Windows and POSIX, and avoids a
# backslash directly before 'A', which Python treats as an invalid escape.
df = pd.read_excel('data/Abilities.xlsx')

In [45]:
df[:6]

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,Chief Executives,1.A.1.a.1,Oral Comprehension,IM,Importance,4.5,8.0,0.19,4.13,4.87,N,,07/2014,Analyst
1,11-1011.00,Chief Executives,1.A.1.a.1,Oral Comprehension,LV,Level,4.88,8.0,0.13,4.63,5.12,N,N,07/2014,Analyst
2,11-1011.00,Chief Executives,1.A.1.a.2,Written Comprehension,IM,Importance,4.25,8.0,0.16,3.93,4.57,N,,07/2014,Analyst
3,11-1011.00,Chief Executives,1.A.1.a.2,Written Comprehension,LV,Level,4.62,8.0,0.18,4.27,4.98,N,N,07/2014,Analyst
4,11-1011.00,Chief Executives,1.A.1.a.3,Oral Expression,IM,Importance,4.38,8.0,0.18,4.02,4.73,N,,07/2014,Analyst
5,11-1011.00,Chief Executives,1.A.1.a.3,Oral Expression,LV,Level,5.0,8.0,0.0,5.0,5.0,N,N,07/2014,Analyst


## Data Cleaning
Here we are using only RIASEC (Realistic, Investigative, Artistic, Social, Enterprising, Conventional).

In [46]:
# Give the awkwardly named 'O*NET-SOC Code' column the simpler name 'Code'
df = df.rename(columns={'O*NET-SOC Code': 'Code'})

def rename_columns(columns)->dict:
    """
    Build a rename mapping that normalises column names.

    Args  : columns (list or pandas DataFrame columns) of string names
    Return: dict whose keys are the original names and whose values are
            the same names with spaces replaced by underscores and all
            letters lower-cased
    """
    return {name: name.replace(' ', '_').lower() for name in columns}
    

# Normalise every column name (spaces -> underscores, lower-case)
df = df.rename(columns=rename_columns(df.columns))
 

In [47]:
df.head()

Unnamed: 0,code,title,element_id,element_name,scale_id,scale_name,data_value,n,standard_error,lower_ci_bound,upper_ci_bound,recommend_suppress,not_relevant,date,domain_source
0,11-1011.00,Chief Executives,1.A.1.a.1,Oral Comprehension,IM,Importance,4.5,8.0,0.19,4.13,4.87,N,,07/2014,Analyst
1,11-1011.00,Chief Executives,1.A.1.a.1,Oral Comprehension,LV,Level,4.88,8.0,0.13,4.63,5.12,N,N,07/2014,Analyst
2,11-1011.00,Chief Executives,1.A.1.a.2,Written Comprehension,IM,Importance,4.25,8.0,0.16,3.93,4.57,N,,07/2014,Analyst
3,11-1011.00,Chief Executives,1.A.1.a.2,Written Comprehension,LV,Level,4.62,8.0,0.18,4.27,4.98,N,N,07/2014,Analyst
4,11-1011.00,Chief Executives,1.A.1.a.3,Oral Expression,IM,Importance,4.38,8.0,0.18,4.02,4.73,N,,07/2014,Analyst


In [48]:
# Discard the columns the model does not use
df = df.drop(
    columns=[
        'code',
        'element_id',
        'n',
        'standard_error',
        'lower_ci_bound',
        'upper_ci_bound',
        'recommend_suppress',
        'not_relevant',
        'date',
        'domain_source',
        'scale_name',
    ]
)

In [49]:
df.head()

Unnamed: 0,title,element_name,scale_id,data_value
0,Chief Executives,Oral Comprehension,IM,4.5
1,Chief Executives,Oral Comprehension,LV,4.88
2,Chief Executives,Written Comprehension,IM,4.25
3,Chief Executives,Written Comprehension,LV,4.62
4,Chief Executives,Oral Expression,IM,4.38


In [50]:
# Keep only the rows measured on the 'IM' scale
df = df.loc[df['scale_id'] == 'IM']

In [51]:
# scale_id is constant after the filter above, so it carries no information
df = df.drop(columns=['scale_id'])

In [52]:
# Reshape from long to wide: one row per title, one column per element_name,
# with data_value as the cell contents.
df=df.pivot(index='title',columns='element_name',values='data_value')


In [53]:
# Move 'title' out of the index back into a regular column; rows get a
# default integer index.
df=df.reset_index()

In [54]:
# Remove the name attached to the columns axis so it is not shown as a
# header label when the frame is displayed.
df.columns.name = None

In [55]:
df

Unnamed: 0,title,Arm-Hand Steadiness,Auditory Attention,Category Flexibility,Control Precision,Deductive Reasoning,Depth Perception,Dynamic Flexibility,Dynamic Strength,Explosive Strength,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,Accountants,1.38,1.88,3.25,1.38,3.88,1.75,1.00,1.25,1.00,...,1.00,1.00,1.00,2.50,1.50,2.00,2.25,1.38,4.00,3.75
1,Actors,2.00,2.00,2.75,1.00,2.88,1.00,1.00,2.00,1.50,...,1.00,2.00,2.00,2.75,2.38,2.00,2.62,1.12,3.88,2.88
2,Actuaries,1.12,2.00,3.88,1.00,4.12,1.62,1.00,1.00,1.00,...,1.00,1.00,1.00,2.50,1.50,2.00,2.50,1.25,4.00,3.75
3,Acupuncturists,3.25,1.88,3.00,2.12,3.88,2.00,1.00,2.25,1.25,...,1.12,2.12,2.12,2.75,2.38,2.00,2.88,1.62,3.62,3.12
4,Acute Care Nurses,1.75,2.12,3.12,1.62,4.00,2.00,1.00,1.50,2.00,...,1.25,2.00,2.25,3.00,1.62,3.00,2.62,1.38,4.00,3.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,Wind Energy Project Managers,1.38,2.38,3.25,1.50,4.00,2.00,1.00,1.25,1.12,...,1.00,1.12,1.25,2.62,2.00,2.25,2.88,2.00,4.00,3.88
964,Wind Turbine Service Technicians,3.50,2.62,3.12,3.12,3.62,3.12,1.62,2.62,1.25,...,2.25,2.75,2.88,2.62,3.00,3.00,3.38,2.00,3.25,2.75
965,"Woodworking Machine Setters, Operators, and Te...",3.12,3.12,3.00,3.50,3.00,2.88,1.62,3.00,1.88,...,2.62,2.50,3.25,2.25,3.25,2.62,3.12,2.62,2.88,2.00
966,Word Processors and Typists,2.00,2.12,3.00,2.00,3.00,1.38,1.00,1.50,1.00,...,1.00,1.00,1.00,2.25,2.00,1.88,2.50,3.12,3.75,3.25


In [56]:
from sklearn.metrics.pairwise import cosine_similarity

In [57]:
# Numeric feature matrix: every column except the first one ('title')
interest_matrix = df.iloc[:, 1:].to_numpy()

In [58]:
# Pairwise cosine similarity between all rows of interest_matrix
# (with a single argument the matrix is compared against itself)
similarity = cosine_similarity(interest_matrix)

In [59]:
# Series of df's row labels with a default 0..n-1 index, so that the index of
# a matching entry is the row position used to address the similarity matrix.
indices = pd.Series(df.index)


def recommendations(title, cosine_sim = similarity):
    """Return the row labels of the 10 occupations most similar to a given one.

    Args:
        title: A row label of ``df`` (e.g. ``0``). If no row label matches
            and ``df`` has a ``'title'`` column, the value is looked up
            there instead, so an occupation name is accepted as well.
        cosine_sim: Square similarity matrix indexed by row position of
            ``df``. NOTE: the default is bound when this cell is executed;
            re-run the cell after recomputing ``similarity``.

    Returns:
        list: Up to 10 row labels of ``df``, most similar first, excluding
        the queried occupation itself. Usable directly with ``df.loc[...]``.

    Raises:
        IndexError: If ``title`` matches neither a row label nor a title.
    """
    # Row position of the queried occupation: first try the row labels ...
    matches = indices.index[(indices == title).to_numpy()]

    # ... then fall back to the occupation name column.
    if len(matches) == 0 and 'title' in df.columns:
        matches = indices.index[(df['title'] == title).to_numpy()]

    if len(matches) == 0:
        raise IndexError(f"no occupation found for {title!r}")
    idx = matches[0]

    # Similarity of the queried occupation to every occupation, keyed by
    # row position.
    score_series = pd.Series(cosine_sim[idx])

    # Drop the queried row explicitly instead of assuming it sorts first:
    # another row with an identical profile also scores 1.0.
    top_10_indexes = score_series.drop(idx).nlargest(10).index.to_numpy()

    # Translate row positions back to df's row labels in one step.
    return df.index[top_10_indexes].tolist()

In [60]:
 # Show the rows of the occupations recommended for row label 0
 df.loc[recommendations(0)]

Unnamed: 0,title,Arm-Hand Steadiness,Auditory Attention,Category Flexibility,Control Precision,Deductive Reasoning,Depth Perception,Dynamic Flexibility,Dynamic Strength,Explosive Strength,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
202,Credit Analysts,1.62,1.75,3.12,1.38,3.75,1.38,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.5,1.75,1.88,1.25,3.88,3.38
2,Actuaries,1.12,2.0,3.88,1.0,4.12,1.62,1.0,1.0,1.0,...,1.0,1.0,1.0,2.5,1.5,2.0,2.5,1.25,4.0,3.75
934,Treasurers and Controllers,1.0,1.75,3.5,1.0,4.0,1.62,1.0,1.25,1.0,...,1.0,1.0,1.0,2.38,1.5,1.88,2.38,1.25,4.12,4.0
872,Statistical Assistants,1.5,2.12,3.38,1.38,3.62,1.25,1.0,1.5,1.0,...,1.0,1.0,1.0,2.38,1.5,2.0,2.75,1.5,4.0,3.88
101,Budget Analysts,1.0,1.88,3.12,1.0,3.88,1.5,1.0,1.0,1.0,...,1.0,1.12,1.0,2.5,1.5,2.12,2.12,1.12,4.0,3.88
60,Auditors,1.12,2.0,3.25,1.0,4.0,1.62,1.0,1.0,1.0,...,1.0,1.0,1.0,2.25,1.25,2.0,2.25,1.25,3.88,4.0
887,Survey Researchers,1.38,1.75,3.12,1.5,4.0,1.62,1.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.5,1.75,2.12,1.25,4.12,4.12
322,Financial Analysts,1.0,1.75,3.25,1.0,4.0,1.62,1.0,1.25,1.0,...,1.0,1.0,1.0,2.12,1.75,1.75,2.38,1.5,4.0,4.0
92,Biostatisticians,1.88,1.88,3.62,1.75,4.12,1.62,1.0,1.25,1.0,...,1.12,1.25,1.25,2.38,1.38,1.75,3.0,1.38,4.12,3.88
518,Logistics Analysts,1.5,2.0,3.25,1.62,3.75,1.88,1.0,1.12,1.0,...,1.0,1.0,1.0,2.75,1.5,2.0,2.75,1.25,3.88,3.25


In [61]:
df


Unnamed: 0,title,Arm-Hand Steadiness,Auditory Attention,Category Flexibility,Control Precision,Deductive Reasoning,Depth Perception,Dynamic Flexibility,Dynamic Strength,Explosive Strength,...,Speed of Limb Movement,Stamina,Static Strength,Time Sharing,Trunk Strength,Visual Color Discrimination,Visualization,Wrist-Finger Speed,Written Comprehension,Written Expression
0,Accountants,1.38,1.88,3.25,1.38,3.88,1.75,1.00,1.25,1.00,...,1.00,1.00,1.00,2.50,1.50,2.00,2.25,1.38,4.00,3.75
1,Actors,2.00,2.00,2.75,1.00,2.88,1.00,1.00,2.00,1.50,...,1.00,2.00,2.00,2.75,2.38,2.00,2.62,1.12,3.88,2.88
2,Actuaries,1.12,2.00,3.88,1.00,4.12,1.62,1.00,1.00,1.00,...,1.00,1.00,1.00,2.50,1.50,2.00,2.50,1.25,4.00,3.75
3,Acupuncturists,3.25,1.88,3.00,2.12,3.88,2.00,1.00,2.25,1.25,...,1.12,2.12,2.12,2.75,2.38,2.00,2.88,1.62,3.62,3.12
4,Acute Care Nurses,1.75,2.12,3.12,1.62,4.00,2.00,1.00,1.50,2.00,...,1.25,2.00,2.25,3.00,1.62,3.00,2.62,1.38,4.00,3.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,Wind Energy Project Managers,1.38,2.38,3.25,1.50,4.00,2.00,1.00,1.25,1.12,...,1.00,1.12,1.25,2.62,2.00,2.25,2.88,2.00,4.00,3.88
964,Wind Turbine Service Technicians,3.50,2.62,3.12,3.12,3.62,3.12,1.62,2.62,1.25,...,2.25,2.75,2.88,2.62,3.00,3.00,3.38,2.00,3.25,2.75
965,"Woodworking Machine Setters, Operators, and Te...",3.12,3.12,3.00,3.50,3.00,2.88,1.62,3.00,1.88,...,2.62,2.50,3.25,2.25,3.25,2.62,3.12,2.62,2.88,2.00
966,Word Processors and Typists,2.00,2.12,3.00,2.00,3.00,1.38,1.00,1.50,1.00,...,1.00,1.00,1.00,2.25,2.00,1.88,2.50,3.12,3.75,3.25
