In [1]:
import pandas as pd
import emsi
import queries
from sqlalchemy import create_engine
from secrets import secrets

### Get access to Emsi

In [2]:
access_token = emsi.get_access_token(secrets['emsi_client_id'], secrets['emsi_secret'])
emsi.check_health(access_token)

{'healthy': True, 'message': 'Service is healthy'}

### Load existing skills

In [3]:
f = open("skills/initial.txt", "r")
skills = []
for x in f:
    skills.append(x[:-1])

len(skills)

225

In [4]:
df_own = pd.DataFrame(columns=['Skill'])
df_own['Skill'] = skills
df_own.head()

Unnamed: 0,Skill
0,AB Testing
1,AB testing
2,AI
3,AIML
4,API


### Query existing skills

In [5]:
training_engine = create_engine(secrets['training_db'])
df_train = pd.read_sql_query(queries.skills_query, con=training_engine)
training_engine.dispose()
df_train.head()

Unnamed: 0,Name
0,Linear Algebra
1,Basic Calculus
2,DBMS
3,Hadoop
4,R Programming


### Get related skills from Emsi

In [6]:
def get_related_skills(skills, print_progress=True):
    new_skills = []
    for i, s in enumerate(skills):
        if (i + 1) % int(len(skills) / 10) == 0:
            print("{}% done".format((i + 1) / len(skills) * 100))
        related = emsi.get_emsi_skills(s, access_token)
        for r in related:
            new_skills.append(r['name'])
    return new_skills

In [7]:
# From own
related_own = get_related_skills(df_own['Skill'].to_list())
len(related_own)

9.777777777777779% done
19.555555555555557% done
29.333333333333332% done
39.111111111111114% done
48.888888888888886% done
58.666666666666664% done
68.44444444444444% done
78.22222222222223% done
88.0% done
97.77777777777777% done


842

In [8]:
df1 = pd.DataFrame(columns=['Skill', 'Source'])
df1['Skill'] = related_own
df1['Source'] = 'Own'
df1.head()

Unnamed: 0,Skill,Source
0,Computer-Aided Design,Own
1,Air Brakes,Own
2,Aircraft Maintenance,Own
3,IBM AIX,Own
4,Basic First Aid,Own


In [9]:
# From own
related_training = get_related_skills(df_train['Name'].to_list())
len(related_training)

8.823529411764707% done
17.647058823529413% done
26.47058823529412% done
35.294117647058826% done
44.11764705882353% done
52.94117647058824% done
61.76470588235294% done
70.58823529411765% done
79.41176470588235% done
88.23529411764706% done
97.05882352941177% done


134

In [10]:
df2 = pd.DataFrame(columns=['Skill', 'Source'])
df2['Skill'] = related_training
df2['Source'] = 'Training'
df2.head()

Unnamed: 0,Skill,Source
0,Linear Algebra,Training
1,Elementary Matrix (Linear Algebra),Training
2,Basic Linear Algebra Subprograms,Training
3,Numerical Linear Algebra,Training
4,Linear Algebra Package (LAPACK),Training


In [11]:
# From own
related_own = get_related_skills(df_own['Skill'].to_list())
len(related_own)

df1 = pd.DataFrame(columns=['Skill', 'Source'])
df1['Skill'] = related_own
df1['Source'] = 'Own'
df1.head()

9.777777777777779% done
19.555555555555557% done
29.333333333333332% done
39.111111111111114% done
48.888888888888886% done
58.666666666666664% done
68.44444444444444% done
78.22222222222223% done
88.0% done
97.77777777777777% done


Unnamed: 0,Skill,Source
0,Computer-Aided Design,Own
1,Air Brakes,Own
2,Aircraft Maintenance,Own
3,IBM AIX,Own
4,Basic First Aid,Own


### Consolidate

In [12]:
df_emsi = pd.concat([df1, df2])
df_emsi.head()

Unnamed: 0,Skill,Source
0,Computer-Aided Design,Own
1,Air Brakes,Own
2,Aircraft Maintenance,Own
3,IBM AIX,Own
4,Basic First Aid,Own


In [13]:
df_emsi.describe(include='object')

Unnamed: 0,Skill,Source
count,976,976
unique,745,2
top,Recurrent Neural Network (RNN),Own
freq,6,842


In [14]:
df_emsi = df_emsi.drop_duplicates(subset=['Skill'])
df_emsi = df_emsi.sort_values(by=['Skill'])
df_emsi = df_emsi.reset_index().drop(columns=['index'])
df_emsi.head()

Unnamed: 0,Skill,Source
0,.NET Framework,Own
1,3D Modeling,Own
2,API Design,Own
3,API Gateway,Own
4,API Management,Own


In [15]:
df_emsi.describe(include='object')

Unnamed: 0,Skill,Source
count,745,745
unique,745,2
top,Test Execution Engine,Own
freq,1,722


In [16]:
df_emsi.to_csv('skills/test.csv')