# Clustering Based on Holland Code

In [78]:
import pandas as pd

## Loading data

In [79]:
# Load the interests workbook. A forward slash is used because the original
# 'data\interests.xlsx' relied on the invalid escape sequence "\i" and only
# resolved on Windows; '/' works as a path separator on every platform.
df = pd.read_excel('data/interests.xlsx')

In [80]:
# Preview the first six rows of the freshly loaded table
df.head(6)

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,Date,Domain Source
0,11-1011.00,Chief Executives,1.B.1.a,Realistic,OI,Occupational Interests,1.33,06/2008,Analyst
1,11-1011.00,Chief Executives,1.B.1.b,Investigative,OI,Occupational Interests,2.0,06/2008,Analyst
2,11-1011.00,Chief Executives,1.B.1.c,Artistic,OI,Occupational Interests,2.67,06/2008,Analyst
3,11-1011.00,Chief Executives,1.B.1.d,Social,OI,Occupational Interests,3.67,06/2008,Analyst
4,11-1011.00,Chief Executives,1.B.1.e,Enterprising,OI,Occupational Interests,7.0,06/2008,Analyst
5,11-1011.00,Chief Executives,1.B.1.f,Conventional,OI,Occupational Interests,5.33,06/2008,Analyst


## Data Cleaning
Only the six RIASEC interest dimensions (Realistic, Investigative, Artistic, Social, Enterprising, Conventional) are used here.

In [81]:
# Give the oddly formatted O*NET code column a short, plain name
code_column_rename = {'O*NET-SOC Code': 'Code'}
df.rename(columns=code_column_rename, inplace=True)

def rename_columns(columns)->dict:
    """
    Build a mapping from each original column name to a normalised one.

    Args  : columns (list or pandas DataFrame columns) - names to convert
    Return: Dictionary keyed by the original name; each value is that name
            with spaces replaced by underscores and converted to lower case
    """
    return {name: name.replace(' ', '_').lower() for name in columns}
    

# Normalise every column label (spaces -> underscores, lower case)
column_mapping = rename_columns(df.columns)
df.rename(columns=column_mapping, inplace=True)
 

In [82]:
# Dropping columns that are not needed for the model
unused_columns = ['code', 'element_id', 'date', 'domain_source', 'scale_name']
df = df.drop(columns=unused_columns)

In [83]:
# Keep only the rows whose scale_id is 'OI'
is_oi_row = df['scale_id'] == 'OI'
df = df[is_oi_row]

In [84]:
# Remove the scale_id column from the frame
df = df.drop(columns=['scale_id'])

In [85]:
# Reshape long -> wide: one row per title, one column per element_name,
# with the cells taken from data_value
df = df.pivot(
    index='title',
    columns='element_name',
    values='data_value',
)


In [86]:
# Turn the index back into an ordinary column and renumber the rows
df = df.reset_index(drop=False)

In [87]:
# Clear the name attached to the columns axis so it is no longer carried
# along with the frame
df.columns.name = None

In [88]:
# Display the reshaped frame
df

Unnamed: 0,title,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,Accountants,1.00,7.00,4.00,3.00,1.33,1.67
1,Actors,6.67,1.67,4.67,1.33,2.67,3.00
2,Actuaries,1.67,6.00,3.33,4.33,2.00,1.00
3,Acupuncturists,2.33,1.67,2.00,5.00,5.33,6.00
4,Acute Care Nurses,2.00,1.67,2.33,5.33,4.33,7.00
...,...,...,...,...,...,...,...
969,Wind Energy Project Managers,2.33,4.67,7.00,4.33,2.33,2.00
970,Wind Turbine Service Technicians,1.00,4.00,2.33,2.33,6.67,1.33
971,"Woodworking Machine Setters, Operators, and Te...",1.67,4.67,1.33,3.67,7.00,1.00
972,Word Processors and Typists,1.00,7.00,2.67,1.00,1.33,1.33


In [89]:
from sklearn.metrics.pairwise import cosine_similarity

In [98]:
# Feature matrix: every column after the first one, as a plain array
score_columns = df.columns[1:]
interest_matrix = df[score_columns].values

In [99]:
# Pairwise cosine similarity between all rows of the matrix; when only one
# argument is given, scikit-learn compares the matrix against itself
similarity = cosine_similarity(interest_matrix)

array([1.        , 0.59613061, 0.97093065, 0.60198256, 0.60792268,
       0.6678857 , 0.68067378, 0.82535129, 0.9251903 , 0.66935573,
       0.64719913, 0.8548239 , 0.82980623, 0.78334034, 0.68962452,
       0.85081906, 0.72731767, 0.54262211, 0.78979012, 0.68425153,
       0.77454179, 0.93616657, 0.7991578 , 0.75209784, 0.60779812,
       0.93052816, 0.81056169, 0.5888071 , 0.6478815 , 0.87599755,
       0.70506849, 0.71109279, 0.67650079, 0.72955862, 0.69598273,
       0.72477021, 0.72310876, 0.68045605, 0.92857095, 0.84085125,
       0.79011731, 0.72143466, 0.66459337, 0.73921069, 0.84426906,
       0.62474557, 0.92745353, 0.64705859, 0.68373385, 0.5856128 ,
       0.62860975, 0.95900704, 0.65473996, 0.66505569, 0.61554004,
       0.78835137, 0.64828645, 0.7318822 , 0.81627333, 0.72476078,
       0.98879898, 0.59545065, 0.73998285, 0.72475798, 0.67056356,
       0.69480442, 0.65945533, 0.75112336, 0.82038866, 0.74806463,
       0.72367962, 0.80983635, 0.72327014, 0.79732437, 0.82322

In [102]:
# Lookup table whose values are the index labels of `df` and whose own index
# is the row position (0..n-1); used to translate a label into a row position
# of the similarity matrix
indices = pd.Series(df.index)

def recommendations(title, cosine_sim = similarity, top_n = 10):
    """
    Return the index labels of the rows most similar to a given row.

    Args  : title      - index label of the query row (it is matched against
                         the values of `indices`, i.e. the labels of `df.index`)
            cosine_sim - similarity matrix addressed by row position, in the
                         same row order as `indices`
            top_n      - number of recommendations to return (default 10)
    Return: List of `df` index labels ordered from most to least similar,
            suitable for `df.loc[...]`; the query row itself is never included
    Raises: IndexError if `title` is not one of the labels in `indices`
    """
    # row position of the query inside the similarity matrix
    matches = indices[indices == title].index
    if len(matches) == 0:
        raise IndexError('{!r} is not an index label of df'.format(title))
    idx = matches[0]

    # similarity of every row to the query row; the query row is removed by
    # position instead of assuming it sorts first, because another row can tie
    # with (or, through rounding, exceed) its self-similarity
    score_series = pd.Series(cosine_sim[idx]).drop(idx)

    # positions of the top_n most similar rows, highest score first
    top_positions = list(score_series.sort_values(ascending = False).iloc[:top_n].index)

    # translate matrix positions back into df index labels
    return df.index[top_positions].tolist()

[477, 94, 60, 101, 605, 499, 475, 376, 739, 940]

Unnamed: 0,title,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
94,"Bookkeeping, Accounting, and Auditing Clerks",1.00,7.00,3.67,2.33,2.33,1.67
95,Brickmasons and Blockmasons,2.67,4.33,1.33,4.33,7.00,1.00
96,Bridge and Lock Tenders,1.00,4.33,3.67,1.67,6.67,1.33
97,Broadcast News Analysts,6.33,2.33,4.67,4.00,1.33,5.00
98,Broadcast Technicians,1.67,5.33,2.00,4.33,7.00,1.67
...,...,...,...,...,...,...,...
969,Wind Energy Project Managers,2.33,4.67,7.00,4.33,2.33,2.00
970,Wind Turbine Service Technicians,1.00,4.00,2.33,2.33,6.67,1.33
971,"Woodworking Machine Setters, Operators, and Te...",1.67,4.67,1.33,3.67,7.00,1.00
972,Word Processors and Typists,1.00,7.00,2.67,1.00,1.33,1.33


In [115]:
 # Show the rows of df recommended for the row whose index label is 973.
 # NOTE(review): the frames displayed elsewhere in this notebook end at label
 # 972, so row 973 presumably came from a cell that is no longer present -
 # confirm this still works after Restart Kernel -> Run All.
 df.loc[recommendations(973)]

Unnamed: 0,title,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
453,Hydrologists,3.0,3.0,2.67,7.0,5.33,1.33
91,Biomedical Engineers,2.67,3.67,2.33,7.0,5.67,2.0
34,Animal Scientists,2.33,3.0,2.0,7.0,5.33,1.0
404,"Geoscientists, Except Hydrologists and Geograp...",3.0,3.33,2.33,7.0,5.0,1.33
79,Biochemical Engineers,2.33,3.67,2.67,6.67,6.0,1.33
550,Materials Scientists,2.67,3.33,3.67,7.0,5.0,1.33
218,Cytotechnologists,1.67,3.33,1.67,6.67,5.0,1.67
627,Nuclear Engineers,1.67,3.67,2.33,7.0,5.33,1.0
581,Microbiologists,1.33,3.0,2.0,7.0,4.67,1.67
14,Aerospace Engineers,3.67,3.0,2.33,7.0,5.67,1.33


In [113]:
# Display the frame again for reference
df


Unnamed: 0,title,Artistic,Conventional,Enterprising,Investigative,Realistic,Social
0,Accountants,1.00,7.00,4.00,3.00,1.33,1.67
1,Actors,6.67,1.67,4.67,1.33,2.67,3.00
2,Actuaries,1.67,6.00,3.33,4.33,2.00,1.00
3,Acupuncturists,2.33,1.67,2.00,5.00,5.33,6.00
4,Acute Care Nurses,2.00,1.67,2.33,5.33,4.33,7.00
...,...,...,...,...,...,...,...
969,Wind Energy Project Managers,2.33,4.67,7.00,4.33,2.33,2.00
970,Wind Turbine Service Technicians,1.00,4.00,2.33,2.33,6.67,1.33
971,"Woodworking Machine Setters, Operators, and Te...",1.67,4.67,1.33,3.67,7.00,1.00
972,Word Processors and Typists,1.00,7.00,2.67,1.00,1.33,1.33
