# Clustering Based on Holland Code

In [1]:
import pandas as pd

## Loading data

In [8]:
# Forward slash keeps the path portable across operating systems and avoids
# the invalid "\K" escape sequence of the Windows-style literal.
df = pd.read_excel('data/Knowledge.xlsx')

In [9]:
# Preview the first six rows of the loaded sheet
df.head(6)

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,Chief Executives,2.C.1.a,Administration and Management,IM,Importance,4.75,27.0,0.09,4.56,4.94,N,,07/2014,Incumbent
1,11-1011.00,Chief Executives,2.C.1.a,Administration and Management,LV,Level,6.23,27.0,0.17,5.88,6.57,N,N,07/2014,Incumbent
2,11-1011.00,Chief Executives,2.C.1.b,Clerical,IM,Importance,2.66,27.0,0.22,2.21,3.11,N,,07/2014,Incumbent
3,11-1011.00,Chief Executives,2.C.1.b,Clerical,LV,Level,3.5,27.0,0.41,2.66,4.34,N,N,07/2014,Incumbent
4,11-1011.00,Chief Executives,2.C.1.c,Economics and Accounting,IM,Importance,3.7,27.0,0.28,3.11,4.28,N,,07/2014,Incumbent
5,11-1011.00,Chief Executives,2.C.1.c,Economics and Accounting,LV,Level,4.36,27.0,0.34,3.67,5.06,N,N,07/2014,Incumbent


## Data Cleaning
Here we are only using RIASEC (Realistic, Investigative, Artistic, Social, Enterprising, Conventional).

In [10]:
# Give the oddly formatted O*NET code column a plain name
df = df.rename(columns={'O*NET-SOC Code': 'Code'})

def rename_columns(columns) -> dict:
    """
    Build a mapping from the original column labels to snake_case labels.

    Args  : columns (list or pandas DataFrame columns) - labels to normalise
    Return: Dictionary keyed by the original label. Every string label maps
            to a lower-cased copy with spaces replaced by underscores;
            non-string labels (e.g. integer column labels) map to themselves
            so that a subsequent rename leaves them untouched.
    """
    return {
        # Only strings support replace()/lower(); anything else is passed through
        col: col.replace(' ', '_').lower() if isinstance(col, str) else col
        for col in columns
    }
    

# Normalise every column label with rename_columns
df = df.rename(columns=rename_columns(df.columns))
 

In [11]:
# Check the renamed columns on the first five rows
df.head(5)

Unnamed: 0,code,title,element_id,element_name,scale_id,scale_name,data_value,n,standard_error,lower_ci_bound,upper_ci_bound,recommend_suppress,not_relevant,date,domain_source
0,11-1011.00,Chief Executives,2.C.1.a,Administration and Management,IM,Importance,4.75,27.0,0.09,4.56,4.94,N,,07/2014,Incumbent
1,11-1011.00,Chief Executives,2.C.1.a,Administration and Management,LV,Level,6.23,27.0,0.17,5.88,6.57,N,N,07/2014,Incumbent
2,11-1011.00,Chief Executives,2.C.1.b,Clerical,IM,Importance,2.66,27.0,0.22,2.21,3.11,N,,07/2014,Incumbent
3,11-1011.00,Chief Executives,2.C.1.b,Clerical,LV,Level,3.5,27.0,0.41,2.66,4.34,N,N,07/2014,Incumbent
4,11-1011.00,Chief Executives,2.C.1.c,Economics and Accounting,IM,Importance,3.7,27.0,0.28,3.11,4.28,N,,07/2014,Incumbent


In [12]:
# Dropping the columns the model does not need
df = df.drop(
    columns=[
        'code', 'element_id', 'scale_name', 'n', 'standard_error',
        'lower_ci_bound', 'upper_ci_bound', 'recommend_suppress',
        'not_relevant', 'date', 'domain_source',
    ]
)

In [13]:
# Keep only the rows whose scale_id is 'IM' (the Importance scale)
df = df.loc[df['scale_id'] == 'IM']

In [14]:
# scale_id is constant after the filter above, so it carries no information
df = df.drop(columns=['scale_id'])

In [15]:
# First five rows of the remaining title / element_name / data_value columns
df.head(5)

Unnamed: 0,title,element_name,data_value
0,Chief Executives,Administration and Management,4.75
2,Chief Executives,Clerical,2.66
4,Chief Executives,Economics and Accounting,3.7
6,Chief Executives,Sales and Marketing,3.23
8,Chief Executives,Customer and Personal Service,4.09


In [16]:
# Reshape from long to wide: one row per title, one column per element_name,
# with data_value in the cells
df = df.pivot(
    index='title',
    columns='element_name',
    values='data_value',
)


In [17]:
# Turn the 'title' index back into an ordinary column
df = df.reset_index(drop=False)

In [18]:
# Remove the name the pivot left on the column axis
df.columns.names = [None]

In [19]:
# Display the wide table (one row per title); pandas abbreviates long output
df

Unnamed: 0,title,Administration and Management,Biology,Building and Construction,Chemistry,Clerical,Communications and Media,Computers and Electronics,Customer and Personal Service,Design,...,Philosophy and Theology,Physics,Production and Processing,Psychology,Public Safety and Security,Sales and Marketing,Sociology and Anthropology,Telecommunications,Therapy and Counseling,Transportation
0,Accountants,3.11,1.00,1.15,1.01,3.64,2.38,2.94,3.12,1.10,...,1.04,1.02,1.53,1.54,1.46,1.32,1.10,1.82,1.06,1.52
1,Actors,2.56,1.04,1.57,1.01,2.21,4.12,2.24,3.08,2.35,...,2.52,1.22,1.35,3.26,2.18,2.89,3.44,1.60,2.22,1.54
2,Actuaries,3.31,1.16,1.25,1.06,2.00,2.16,3.73,2.59,1.75,...,1.16,1.16,1.69,2.00,1.56,2.31,1.97,1.41,1.31,1.34
3,Acupuncturists,3.21,2.96,1.24,1.82,2.85,2.31,2.82,4.15,1.27,...,2.49,1.57,1.95,3.90,2.39,3.64,2.58,1.79,4.07,1.60
4,Acute Care Nurses,2.89,3.32,1.18,2.71,2.68,2.22,2.61,4.36,1.36,...,2.59,2.04,1.75,4.07,2.96,1.79,3.07,2.18,3.86,1.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,Wind Energy Project Managers,4.10,1.68,3.77,1.71,2.96,2.89,3.04,3.34,3.33,...,1.48,2.50,2.67,2.03,2.94,2.59,1.96,2.17,1.24,2.64
964,Wind Turbine Service Technicians,2.76,1.74,3.11,2.33,2.79,2.08,3.98,2.45,2.97,...,1.14,3.32,2.23,2.26,3.17,1.44,1.53,2.94,1.58,2.88
965,"Woodworking Machine Setters, Operators, and Te...",1.98,1.19,2.37,1.73,1.38,1.28,1.78,1.80,2.30,...,1.11,1.56,2.95,1.36,2.51,1.66,1.07,1.15,1.21,1.95
966,Word Processors and Typists,2.42,1.01,1.00,1.00,4.81,2.11,3.51,4.19,1.01,...,1.00,1.00,1.31,2.07,1.74,1.45,1.20,1.81,1.15,1.04


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# Feature matrix: every column except the leading 'title' column
interest_matrix = df.iloc[:, 1:].values

In [22]:
# Pairwise cosine similarity between all rows of the matrix
# (with a single argument the rows are compared against themselves)
similarity = cosine_similarity(interest_matrix)

In [23]:
# Series holding the DataFrame's row labels on a fresh 0..n-1 index, used by
# recommendations() to translate a row label into a similarity-matrix position
indices = pd.Series(df.index)

def recommendations(title, cosine_sim = similarity, top_n = 10):
    """
    Return the row labels of the occupations most similar to a given one.

    Args:
        title      : row label of the occupation in ``df`` (e.g. ``0``) or,
                     when ``df`` has a ``title`` column, the occupation name
        cosine_sim : square matrix of pairwise similarities whose rows are
                     in the same order as the rows of ``df``
        top_n      : number of occupations to return (default 10)

    Returns:
        List of ``df`` row labels, most similar first, usable with
        ``df.loc[...]``. The queried occupation itself is never included.

    Raises:
        IndexError: if ``title`` matches neither a row label nor a name.
    """
    # Position(s) of the requested row label inside the similarity matrix
    matches = indices[indices == title].index

    # Not a row label: fall back to looking the value up as an occupation name
    if len(matches) == 0 and 'title' in df.columns:
        matches = (df['title'] == title).to_numpy().nonzero()[0]

    if len(matches) == 0:
        raise IndexError(f"no occupation matches {title!r}")
    idx = matches[0]

    # Rank every other occupation by similarity. The query row is dropped
    # explicitly instead of assuming it always sorts into first place.
    score_series = (
        pd.Series(cosine_sim[idx])
        .drop(idx)
        .sort_values(ascending = False)
    )
    top_positions = score_series.iloc[:top_n].index.to_numpy()

    # Map matrix positions back to DataFrame row labels
    return df.index[top_positions].tolist()

In [25]:
 # Rows of the occupations recommended for row label 0
 df.loc[recommendations(title=0)]

Unnamed: 0,title,Administration and Management,Biology,Building and Construction,Chemistry,Clerical,Communications and Media,Computers and Electronics,Customer and Personal Service,Design,...,Philosophy and Theology,Physics,Production and Processing,Psychology,Public Safety and Security,Sales and Marketing,Sociology and Anthropology,Telecommunications,Therapy and Counseling,Transportation
101,Budget Analysts,3.71,1.06,1.22,1.12,3.57,2.88,3.06,3.37,1.19,...,1.85,1.05,1.34,2.14,2.08,1.15,1.92,1.98,1.45,1.37
323,Financial Examiners,3.05,1.0,1.09,1.0,2.48,2.05,2.65,2.35,1.17,...,1.17,1.0,1.43,1.74,1.52,1.3,1.3,1.57,1.13,1.48
202,Credit Analysts,2.8,1.05,1.8,1.05,3.0,2.05,2.85,2.85,1.3,...,1.3,1.1,2.15,1.45,1.8,2.05,1.55,1.58,1.1,1.25
896,"Tax Examiners and Collectors, and Revenue Agents",2.91,1.13,1.24,1.1,3.85,2.26,3.35,4.42,1.33,...,1.67,1.18,2.25,2.53,2.5,1.79,1.67,2.05,1.43,1.84
60,Auditors,3.95,1.0,1.15,1.15,2.5,2.4,3.15,3.9,1.4,...,1.3,1.05,1.75,2.0,1.2,2.65,1.4,1.9,1.2,1.55
934,Treasurers and Controllers,4.04,1.0,1.04,1.0,2.48,2.0,2.8,2.84,1.24,...,1.4,1.04,1.76,1.92,1.8,1.96,1.58,1.44,1.2,1.28
897,Tax Preparers,2.95,1.05,1.23,1.05,3.19,1.95,3.45,4.0,1.27,...,1.23,1.14,2.36,1.73,1.77,2.73,1.23,1.91,1.27,1.36
77,Bill and Account Collectors,3.07,1.0,1.14,1.0,3.07,2.23,3.12,3.77,1.13,...,1.08,1.0,2.35,1.85,1.57,2.14,1.59,1.91,1.12,1.08
204,Credit Checkers,2.81,1.0,1.34,1.01,3.33,1.9,2.42,4.1,1.03,...,1.08,1.01,1.39,2.02,1.45,2.58,1.31,1.75,1.05,1.53
94,"Bookkeeping, Accounting, and Auditing Clerks",2.87,1.0,1.1,1.11,3.89,1.74,3.17,4.18,1.1,...,1.14,1.12,1.68,1.78,1.71,1.85,1.39,2.01,1.22,1.26


In [26]:
# Display the wide table again; pandas abbreviates long output
df


Unnamed: 0,title,Administration and Management,Biology,Building and Construction,Chemistry,Clerical,Communications and Media,Computers and Electronics,Customer and Personal Service,Design,...,Philosophy and Theology,Physics,Production and Processing,Psychology,Public Safety and Security,Sales and Marketing,Sociology and Anthropology,Telecommunications,Therapy and Counseling,Transportation
0,Accountants,3.11,1.00,1.15,1.01,3.64,2.38,2.94,3.12,1.10,...,1.04,1.02,1.53,1.54,1.46,1.32,1.10,1.82,1.06,1.52
1,Actors,2.56,1.04,1.57,1.01,2.21,4.12,2.24,3.08,2.35,...,2.52,1.22,1.35,3.26,2.18,2.89,3.44,1.60,2.22,1.54
2,Actuaries,3.31,1.16,1.25,1.06,2.00,2.16,3.73,2.59,1.75,...,1.16,1.16,1.69,2.00,1.56,2.31,1.97,1.41,1.31,1.34
3,Acupuncturists,3.21,2.96,1.24,1.82,2.85,2.31,2.82,4.15,1.27,...,2.49,1.57,1.95,3.90,2.39,3.64,2.58,1.79,4.07,1.60
4,Acute Care Nurses,2.89,3.32,1.18,2.71,2.68,2.22,2.61,4.36,1.36,...,2.59,2.04,1.75,4.07,2.96,1.79,3.07,2.18,3.86,1.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
963,Wind Energy Project Managers,4.10,1.68,3.77,1.71,2.96,2.89,3.04,3.34,3.33,...,1.48,2.50,2.67,2.03,2.94,2.59,1.96,2.17,1.24,2.64
964,Wind Turbine Service Technicians,2.76,1.74,3.11,2.33,2.79,2.08,3.98,2.45,2.97,...,1.14,3.32,2.23,2.26,3.17,1.44,1.53,2.94,1.58,2.88
965,"Woodworking Machine Setters, Operators, and Te...",1.98,1.19,2.37,1.73,1.38,1.28,1.78,1.80,2.30,...,1.11,1.56,2.95,1.36,2.51,1.66,1.07,1.15,1.21,1.95
966,Word Processors and Typists,2.42,1.01,1.00,1.00,4.81,2.11,3.51,4.19,1.01,...,1.00,1.00,1.31,2.07,1.74,1.45,1.20,1.81,1.15,1.04
