In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.spatial.distance import cdist

In [2]:
# Read the three input tables in one pass; rows that pandas cannot parse are
# skipped (on_bad_lines="skip") instead of aborting the load.
skills, jobs, job_skill = (
    pd.read_csv(csv_name, on_bad_lines="skip")
    for csv_name in ("skill.csv", "job.csv", "job_skill.csv")
)

In [3]:
# Number of entries in the "skillname" column of the skill table.
skill_count = skills["skillname"].shape[0]
skill_count

6607

In [4]:
# Square all-zero table with the skill names on both axes; it is the starting
# point for the co-occurrence counts.
skill_similarities = pd.DataFrame(
    data=np.zeros(shape=(skill_count, skill_count)),
    index=skills["skillname"],
    columns=skills["skillname"],
)
skill_similarities

skillname,apple logic pro,wave meters,filing cabinets,soil probes,specification software,wind tunnels,finalbuilder,plate reader software,barcode software,sample ports,...,esri arcsde,chemical reactors,positive train control ptc systems,sealing machines,voltage sensors,ellucian colleague,c shell,dipping vats,picosecond lasers,power punches
skillname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
apple logic pro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wave meters,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
filing cabinets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
soil probes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
specification software,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ellucian colleague,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
c shell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dipping vats,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
picosecond lasers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Copy the "skillname" column of job_skill into a NumPy array. As the output
# below shows, each entry is a JSON string holding a list of
# {"skill": ..., "active": ...} records.
skillnames = job_skill["skillname"].to_numpy(copy=True)
skillnames

array(['[{"skill": "programming", "active": "true"}, {"skill": "design", "active": "true"}, {"skill": "science", "active": "true"}, {"skill": "critical thinking", "active": "true"}]',
       '[{"skill": "player", "active": "true"}, {"skill": "levels", "active": "true"}]',
       '[{"skill": "microsoft office", "active": "true"}, {"skill": "imaging systems", "active": "true"}, {"skill": "skill", "active": "true"}]',
       ...,
       '[{"skill": "telecommunications", "active": "true"}, {"skill": "postage meters", "active": "true"}, {"skill": "scales", "active": "true"}, {"skill": "clerical", "active": "true"}]',
       '[{"skill": "peoplenet", "active": "true"}, {"skill": "rules", "active": "true"}]',
       '[{"skill": "microsoft office", "active": "true"}, {"skill": "levels", "active": "true"}]'],
      dtype=object)

In [6]:
# Nested-dict view of the zero table: {column label: {row label: value}}.
dict_skill_similarities = skill_similarities.to_dict(orient="dict")

In [7]:
# Spot-check one entry of the nested dict before any counting has happened.
dict_skill_similarities["programming"]["programming"]

0.0

In [8]:
# Count, for every ordered pair of skills listed in the same job_skill entry,
# how often the pair occurs together (a skill paired with itself lands on the
# diagonal). Names that are not keys of the table are ignored.
# NOTE: the counts are added to dict_skill_similarities in place, so running
# this cell again without rebuilding the dict adds every count a second time.
for raw_entry in skillnames:
    entries = json.loads(raw_entry)

    for first in entries:
        row_counts = dict_skill_similarities.get(first["skill"])
        if row_counts is None:
            # Unknown skill: nothing to update for this entry.
            continue
        for second in entries:
            other_name = second["skill"]
            if other_name in row_counts:
                row_counts[other_name] += 1
    

In [9]:
# Turn the nested count dict back into a table (outer keys become columns).
dataframe_skill_similarities = pd.DataFrame.from_dict(dict_skill_similarities)

In [10]:
# Display the co-occurrence count table.
dataframe_skill_similarities

Unnamed: 0,apple logic pro,wave meters,filing cabinets,soil probes,specification software,wind tunnels,finalbuilder,plate reader software,barcode software,sample ports,...,esri arcsde,chemical reactors,positive train control ptc systems,sealing machines,voltage sensors,ellucian colleague,c shell,dipping vats,picosecond lasers,power punches
apple logic pro,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wave meters,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
filing cabinets,0.0,0.0,387.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
soil probes,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
specification software,0.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ellucian colleague,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,49.0,0.0,0.0,0.0,0.0
c shell,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,65.0,0.0,0.0,0.0
dipping vats,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
picosecond lasers,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Persist the co-occurrence counts, including the row labels.
dataframe_skill_similarities.to_csv(path_or_buf="TF.csv")

In [12]:
# Same table again, this time without the row-label column.
dataframe_skill_similarities.to_csv(path_or_buf="TF_without_index.csv", index=False)

In [13]:
# Plain NumPy copy of the count table (labels dropped) for the TF-IDF step.
array_skill_similarities = dataframe_skill_similarities.to_numpy(copy=True)

In [14]:
# Display the raw count matrix.
array_skill_similarities

array([[  4.,   0.,   0., ...,   0.,   0.,   0.],
       [  0.,   1.,   0., ...,   0.,   0.,   0.],
       [  0.,   0., 387., ...,   0.,   0.,   0.],
       ...,
       [  0.,   0.,   0., ...,   1.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   1.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   1.]])

In [15]:
# TF-IDF re-weighting with scikit-learn's documented default settings spelled out.
tf = TfidfTransformer(
    norm="l2",
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)

In [16]:
# Fit the TF-IDF weights on the count matrix, apply them to the same matrix,
# and densify the sparse result.
X_tfidf = (
    tf.fit(array_skill_similarities)
    .transform(array_skill_similarities)
    .toarray()
)
X_tfidf

array([[0.72551633, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.52590279, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.81389437, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.72818726, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.87277643,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.47747396]])

In [17]:
# Pairwise cosine similarity between the TF-IDF rows (1 minus cosine distance).
final_res = 1.0 - cdist(XA=X_tfidf, XB=X_tfidf, metric="cosine")
final_res

array([[1.        , 0.01352075, 0.0304092 , ..., 0.02377461, 0.04869467,
        0.        ],
       [0.01352075, 1.        , 0.00808143, ..., 0.        , 0.01878655,
        0.        ],
       [0.0304092 , 0.00808143, 1.        , ..., 0.04553314, 0.02611171,
        0.00432356],
       ...,
       [0.02377461, 0.        , 0.04553314, ..., 1.        , 0.03422838,
        0.        ],
       [0.04869467, 0.01878655, 0.02611171, ..., 0.03422838, 1.        ,
        0.        ],
       [0.        , 0.        , 0.00432356, ..., 0.        , 0.        ,
        1.        ]])

In [18]:
# final_res[i, j] compares row i with row j of the TF-IDF matrix, and those
# rows come from the rows of dataframe_skill_similarities. Both axes are
# therefore labelled with that table's row labels (its index), which stays
# correct even if the table's column order ever differs from its row order.
dataframe_final_similarities = pd.DataFrame(
    final_res,
    index=dataframe_skill_similarities.index,
    columns=dataframe_skill_similarities.index,
)
dataframe_final_similarities

Unnamed: 0,apple logic pro,wave meters,filing cabinets,soil probes,specification software,wind tunnels,finalbuilder,plate reader software,barcode software,sample ports,...,esri arcsde,chemical reactors,positive train control ptc systems,sealing machines,voltage sensors,ellucian colleague,c shell,dipping vats,picosecond lasers,power punches
apple logic pro,1.000000,0.013521,0.030409,0.000000,0.051370,0.048741,0.009030,0.025518,0.024489,0.009733,...,0.025058,0.024300,0.058231,0.016196,0.040115,0.036684,0.036424,0.023775,0.048695,0.000000
wave meters,0.013521,1.000000,0.008081,0.000000,0.029754,0.038574,0.000000,0.015162,0.010279,0.000000,...,0.018873,0.006902,0.020702,0.012569,0.028533,0.016374,0.031191,0.000000,0.018787,0.000000
filing cabinets,0.030409,0.008081,1.000000,0.005248,0.046107,0.061138,0.003704,0.026647,0.016528,0.014553,...,0.022901,0.025363,0.054742,0.035321,0.024036,0.038946,0.027820,0.045533,0.026112,0.004324
soil probes,0.000000,0.000000,0.005248,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.006542,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
specification software,0.051370,0.029754,0.046107,0.000000,1.000000,0.104183,0.023708,0.057439,0.055609,0.013443,...,0.093391,0.039556,0.084570,0.025739,0.055562,0.060859,0.126738,0.040758,0.070430,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ellucian colleague,0.036684,0.016374,0.038946,0.000000,0.060859,0.055000,0.012928,0.030867,0.019825,0.011345,...,0.047662,0.023030,0.057744,0.029690,0.033636,1.000000,0.063132,0.055879,0.035365,0.009404
c shell,0.036424,0.031191,0.027820,0.000000,0.126738,0.076123,0.038260,0.035931,0.029970,0.008510,...,0.140450,0.027441,0.062630,0.022202,0.050064,0.063132,1.000000,0.061836,0.066861,0.000000
dipping vats,0.023775,0.000000,0.045533,0.000000,0.040758,0.040583,0.033324,0.000000,0.000000,0.048825,...,0.033485,0.018975,0.052708,0.030346,0.030235,0.055879,0.061836,1.000000,0.034228,0.000000
picosecond lasers,0.048695,0.018787,0.026112,0.000000,0.070430,0.088224,0.037641,0.023713,0.038124,0.000000,...,0.044334,0.050688,0.063855,0.019703,0.060701,0.035365,0.066861,0.034228,1.000000,0.000000


In [19]:
# Persist the skill-by-skill cosine similarity table.
dataframe_final_similarities.to_csv(path_or_buf="final_similarities.csv")

In [20]:
def recommend_skills(skills, similarities, k = 10, min_probability = 0.5):
    """Recommend the skills most similar to a set of already-known skills.

    Every skill in ``similarities`` is scored by summing its similarity to
    each requested skill. The requested skills are then removed by label, and
    the remaining candidate scores are min-max scaled so the best candidate
    gets 1.0 and the worst gets 0.0.

    Parameters
    ----------
    skills : sequence of str
        Known skills; each must be a column label of ``similarities``.
        Repeated entries are counted once.
    similarities : pandas.DataFrame
        Skill-by-skill similarity table with skill names on both axes.
    k : int, default 10
        Maximum number of recommendations returned.
    min_probability : float, default 0.5
        Only candidates whose scaled score is strictly greater than this
        threshold are returned.

    Returns
    -------
    pandas.Series
        Scaled scores in descending order, indexed by skill name. Empty when
        ``skills`` is empty, when no candidate is left, or when all candidate
        scores are equal (nothing to scale against).

    Raises
    ------
    KeyError
        If a requested skill is not a column of ``similarities``.
    """
    # Drop repeated inputs while keeping their order, so a skill listed twice
    # is not weighted twice.
    requested = list(dict.fromkeys(skills))
    if not requested:
        return pd.Series(dtype="float64")

    # skipna=False keeps a missing similarity missing in the total instead of
    # silently treating it as zero.
    totals = similarities[requested].sum(axis=1, skipna=False)
    # The result has always been named after the first requested skill.
    totals.name = requested[0]

    # Remove the inputs by label. Relying on them having the largest totals is
    # not safe: a candidate close to several inputs can tie or outrank them.
    candidates = totals.drop(labels=requested, errors="ignore")
    if candidates.empty:
        return candidates

    lowest = candidates.min()
    spread = candidates.max() - lowest
    # "not >" also catches a NaN spread; scaling would divide by zero or NaN.
    if not spread > 0:
        return candidates.iloc[0:0]

    scaled = (candidates - lowest) / spread
    ranked = scaled.sort_values(ascending=False)
    return ranked[ranked > min_probability].head(k)

In [21]:
# Example query: recommendations for someone who already knows "programming".
recommended_skills = recommend_skills(
    skills=["programming"],
    similarities=dataframe_final_similarities,
)

In [22]:
# Label the score Series "probability" so it becomes the column name downstream.
recommended_skills = recommended_skills.rename("probability")
recommended_skills

design                   1.000000
science                  0.940281
c                        0.869818
programming languages    0.863746
python                   0.856231
javascript               0.799896
platforms                0.790035
linux                    0.787940
troubleshooting          0.731551
unix                     0.717150
Name: probability, dtype: float64

In [23]:
# One-column table view of the recommendations (column named after the Series).
dataframe_recommended_skills = recommended_skills.to_frame()
dataframe_recommended_skills

Unnamed: 0,probability
design,1.0
science,0.940281
c,0.869818
programming languages,0.863746
python,0.856231
javascript,0.799896
platforms,0.790035
linux,0.78794
troubleshooting,0.731551
unix,0.71715


In [24]:
# Persist the recommendations, keeping the skill names as the first column.
dataframe_recommended_skills.to_csv(path_or_buf="Recommended_Skills.csv")