In [1]:
import pandas as pd
import requests
import emsi
from secrets import secrets

## Get access to Emsi

In [55]:
# Exchange the client id / secret held in `secrets` for an Emsi access token,
# then ask the service for its health status before issuing any skill lookups.
access_token = emsi.get_access_token(
    secrets['emsi_client_id'],
    secrets['emsi_secret'],
)
emsi.check_health(access_token)

{'healthy': True, 'message': 'Service is healthy'}

## Load Coursera skills

In [5]:
# Load the full Coursera skill table. The 'Unnamed: 0' column is discarded
# (presumably an index column written by an earlier to_csv -- confirm).
df = (
    pd.read_csv('skills/coursera_full.csv')
    .drop('Unnamed: 0', axis='columns')
)
df.head()

Unnamed: 0,Skill,Domain,Sub-Domain
0,Journalism,arts-and-humanities,history
1,Content Marketing,arts-and-humanities,history
2,Storytelling,arts-and-humanities,history
3,Advertising,arts-and-humanities,history
4,Marketing,arts-and-humanities,history


### Get related skills from Emsi

In [49]:
def get_all_skills(df, subdomain):
    """Expand one sub-domain's skills with their related Emsi skills.

    Selects the 'Skill' values of the rows in ``df`` whose 'Sub-Domain'
    equals ``subdomain``, prints how many there are, passes them to
    ``get_related_skills``, prints a ``describe`` summary of the returned
    names, and returns them as a one-column ('Skill') DataFrame with
    duplicate rows removed.
    """
    in_subdomain = df['Sub-Domain'] == subdomain
    seed_skills = df.loc[in_subdomain, 'Skill'].tolist()

    # Seed count followed by a blank line, ahead of the progress messages.
    print(len(seed_skills))
    print()

    related = pd.DataFrame(get_related_skills(seed_skills), columns=['Skill'])
    summary = related.describe(include='object')
    print(summary)

    return related.drop_duplicates()

def get_related_skills(skills, print_percent=5):
    """Collect the names of the Emsi skills related to each input skill.

    Parameters
    ----------
    skills : sequence
        Skill names to look up; ``emsi.get_emsi_skills`` is called once per
        entry, using the module-level ``access_token``.
    print_percent : int or float, default 5
        Approximate spacing between progress messages, as a percentage of
        ``len(skills)``.

    Returns
    -------
    list
        The ``'name'`` field of every related skill, in lookup order.
        Duplicates are kept.
    """
    total = len(skills)
    # For short lists (fewer than 20 entries at the default 5%) the interval
    # rounds down to 0, which used to raise ZeroDivisionError in the modulo
    # below. Fall back to reporting after every item in that case.
    step = int(total * print_percent / 100) or 1

    new_skills = []
    for i, s in enumerate(skills):
        if (i + 1) % step == 0:
            print("{}% done".format((i + 1) / total * 100))
        related = emsi.get_emsi_skills(s, access_token)
        new_skills.extend(r['name'] for r in related)
    return new_skills

### Data Science

In [37]:
# Keep only the rows of the Coursera table whose Domain is data-science.
df_ds = df[df['Domain'] == 'data-science']
df_ds.head()

Unnamed: 0,Skill,Domain,Sub-Domain
7804,Microsoft Excel Vba,data-science,data-analysis
7805,Tableau Software,data-science,data-analysis
7806,Analytics,data-science,data-analysis
7807,Data Clustering Algorithms,data-science,data-analysis
7808,Software,data-science,data-analysis


In [7]:
# List the distinct sub-domains present in the data-science slice.
df_ds['Sub-Domain'].unique()

array(['data-analysis', 'machine-learning', 'probability-and-statistics'],
      dtype=object)

In [10]:
# Data Analysis skills: seed list taken from the 'data-analysis' sub-domain
da_initial = df_ds.loc[df_ds['Sub-Domain'] == 'data-analysis', 'Skill'].tolist()
len(da_initial)

926

In [17]:
# Look up the Emsi skills related to each data-analysis seed skill
# (one Emsi request per seed; progress is printed as it goes).
da_skills = get_related_skills(da_initial)

4.967602591792657% done
9.935205183585314% done
14.902807775377969% done
19.870410367170628% done
24.838012958963283% done
29.805615550755938% done
34.77321814254859% done
39.740820734341256% done
44.70842332613391% done
49.676025917926566% done
54.64362850971922% done
59.611231101511876% done
64.57883369330453% done
69.54643628509719% done
74.51403887688986% done
79.48164146868251% done
84.44924406047517% done
89.41684665226782% done
94.38444924406048% done
99.35205183585313% done


In [24]:
# Put the related-skill names in a one-column frame and summarise it
# (count / unique / top / freq) to see how many duplicates came back.
df_da = pd.DataFrame(da_skills, columns=['Skill'])
df_da.describe(include='object')

Unnamed: 0,Skill
count,2640
unique,2241
top,Machine Learning Algorithms
freq,5


In [25]:
# Remove repeated skill names; surviving rows keep their original index labels.
df_da = df_da.drop_duplicates()
df_da.head()

Unnamed: 0,Skill
0,Analytics
1,Analytical Thinking
2,Google Analytics
3,Business Analytics
4,Analytical Techniques


In [26]:
# Machine learning skills: seed list taken from the 'machine-learning' sub-domain
ml_initial = df_ds.loc[df_ds['Sub-Domain'] == 'machine-learning', 'Skill'].tolist()
len(ml_initial)

646

In [27]:
# Look up the Emsi skills related to each machine-learning seed skill
# (one Emsi request per seed; progress is printed as it goes).
ml_skills = get_related_skills(ml_initial)

4.953560371517028% done
9.907120743034056% done
14.860681114551083% done
19.814241486068113% done
24.76780185758514% done
29.721362229102166% done
34.6749226006192% done
39.628482972136226% done
44.58204334365325% done
49.53560371517028% done
54.4891640866873% done
59.44272445820433% done
64.39628482972137% done
69.3498452012384% done
74.30340557275542% done
79.25696594427245% done
84.21052631578947% done
89.1640866873065% done
94.11764705882352% done
99.07120743034056% done


In [28]:
# Put the related-skill names in a one-column frame and summarise it
# (count / unique / top / freq) to see how many duplicates came back.
df_ml = pd.DataFrame(ml_skills, columns=['Skill'])
df_ml.describe(include='object')

Unnamed: 0,Skill
count,1735
unique,1549
top,Machine Learning Algorithms
freq,4


In [29]:
# Remove repeated skill names; surviving rows keep their original index labels.
df_ml = df_ml.drop_duplicates()
df_ml.head()

Unnamed: 0,Skill
0,Document Classification
1,Hyperparameter Optimization
2,Regression Testing
3,Regression Analysis
4,Logistic Regression


In [30]:
# Probability and statistics skills: seed list taken from the
# 'probability-and-statistics' sub-domain
ps_initial = df_ds.loc[df_ds['Sub-Domain'] == 'probability-and-statistics', 'Skill'].tolist()
len(ps_initial)

204

In [31]:
# Look up the Emsi skills related to each probability-and-statistics seed skill
# (one Emsi request per seed; progress is printed as it goes).
ps_skills = get_related_skills(ps_initial)

4.901960784313726% done
9.803921568627452% done
14.705882352941178% done
19.607843137254903% done
24.509803921568626% done
29.411764705882355% done
34.31372549019608% done
39.21568627450981% done
44.11764705882353% done
49.01960784313725% done
53.92156862745098% done
58.82352941176471% done
63.725490196078425% done
68.62745098039215% done
73.52941176470588% done
78.43137254901961% done
83.33333333333334% done
88.23529411764706% done
93.13725490196079% done
98.0392156862745% done


In [32]:
# Put the related-skill names in a one-column frame and summarise it
# (count / unique / top / freq) to see how many duplicates came back.
df_ps = pd.DataFrame(ps_skills, columns=['Skill'])
df_ps.describe(include='object')

Unnamed: 0,Skill
count,474
unique,426
top,Statistical Hypothesis Testing
freq,4


In [33]:
# Remove repeated skill names; surviving rows keep their original index labels
# (so gaps in the displayed index mark where duplicates were dropped).
df_ps = df_ps.drop_duplicates()
df_ps.head()

Unnamed: 0,Skill
0,Biostatistics
1,Odds Ratios (Epidemiology)
3,Bootstrap (Front-End Framework)
4,Spring Boot
5,Booting (BIOS)


In [35]:
# Persist each de-duplicated data-science skill list as a CSV without the index column.
for _frame, _path in (
    (df_da, 'skills/data-analysis.csv'),
    (df_ml, 'skills/machine-learning.csv'),
    (df_ps, 'skills/probability-and-statistics.csv'),
):
    _frame.to_csv(_path, index=False)

### Computer Science

In [39]:
# Keep only the rows of the Coursera table whose Domain is computer-science.
df_cs = df[df['Domain'] == 'computer-science']
df_cs.head()

Unnamed: 0,Skill,Domain,Sub-Domain
4824,Image Retrieval,computer-science,algorithms
4825,Computer Vision,computer-science,algorithms
4826,Image Processing,computer-science,algorithms
4827,Convolution,computer-science,algorithms
4828,Object Detection,computer-science,algorithms


In [40]:
# List the distinct sub-domains present in the computer-science slice.
df_cs['Sub-Domain'].unique()

array(['algorithms', 'computer-security-and-networks',
       'design-and-product', 'mobile-and-web-development',
       'software-development'], dtype=object)

In [50]:
# Algorithms skills: expand the 'algorithms' sub-domain seeds with their
# related Emsi skills and keep the de-duplicated result.
df_al = get_all_skills(df_cs, 'algorithms')
df_al.head()

410

4.878048780487805% done
9.75609756097561% done
14.634146341463413% done
19.51219512195122% done
24.390243902439025% done
29.268292682926827% done
34.146341463414636% done
39.02439024390244% done
43.90243902439025% done
48.78048780487805% done
53.65853658536586% done
58.536585365853654% done
63.41463414634146% done
68.29268292682927% done
73.17073170731707% done
78.04878048780488% done
82.92682926829268% done
87.8048780487805% done
92.6829268292683% done
97.5609756097561% done
                           Skill
count                       1042
unique                       935
top     Graphical User Interface
freq                           3


Unnamed: 0,Skill
0,Image Retrieval
1,Content-Based Image Retrieval
2,Computer Vision
3,Image Processing
4,Digital Image Processing


In [51]:
# Computer security and networks skills: expand the
# 'computer-security-and-networks' sub-domain seeds with their related Emsi
# skills and keep the de-duplicated result.
df_csn = get_all_skills(df_cs, 'computer-security-and-networks')
df_csn.head()

465

4.946236559139785% done
9.89247311827957% done
14.838709677419354% done
19.78494623655914% done
24.731182795698924% done
29.677419354838708% done
34.623655913978496% done
39.56989247311828% done
44.516129032258064% done
49.46236559139785% done
54.40860215053763% done
59.354838709677416% done
64.30107526881721% done
69.24731182795699% done
74.19354838709677% done
79.13978494623656% done
84.08602150537634% done
89.03225806451613% done
93.97849462365592% done
98.9247311827957% done
                           Skill
count                       1439
unique                      1312
top     Internet Protocols Suite
freq                           4


Unnamed: 0,Skill
0,Web Servers
1,Web Server Gateway Interface
2,IPlant Web Server
3,Zeus Web Server
4,Microsoft Personal Web Servers


In [52]:
# Design and product skills: expand the 'design-and-product' sub-domain seeds
# with their related Emsi skills and keep the de-duplicated result.
df_dp = get_all_skills(df_cs, 'design-and-product')
df_dp.head()

475

4.842105263157895% done
9.68421052631579% done
14.526315789473685% done
19.36842105263158% done
24.210526315789473% done
29.05263157894737% done
33.89473684210526% done
38.73684210526316% done
43.57894736842105% done
48.421052631578945% done
53.26315789473684% done
58.10526315789474% done
62.94736842105263% done
67.78947368421052% done
72.63157894736842% done
77.47368421052632% done
82.3157894736842% done
87.1578947368421% done
92.0% done
96.84210526315789% done
                                          Skill
count                                      1409
unique                                     1271
top     Autodesk 3DS Max (3D Graphics Software)
freq                                          5


Unnamed: 0,Skill
0,Persona (User Experience)
1,Agile Software Development
2,User Story
3,User Experience
4,User Experience Design (UX)


In [53]:
# Mobile and web development skills: expand the 'mobile-and-web-development'
# sub-domain seeds with their related Emsi skills and keep the de-duplicated result.
df_mwd = get_all_skills(df_cs, 'mobile-and-web-development')
df_mwd.head()

475

4.842105263157895% done
9.68421052631579% done
14.526315789473685% done
19.36842105263158% done
24.210526315789473% done
29.05263157894737% done
33.89473684210526% done
38.73684210526316% done
43.57894736842105% done
48.421052631578945% done
53.26315789473684% done
58.10526315789474% done
62.94736842105263% done
67.78947368421052% done
72.63157894736842% done
77.47368421052632% done
82.3157894736842% done
87.1578947368421% done
92.0% done
96.84210526315789% done
                                   Skill
count                               1455
unique                              1293
top     Graphical User Interface Builder
freq                                   4


Unnamed: 0,Skill
0,Android Studio
1,2D Computer Graphics
2,Computer Graphics
3,3D Computer Graphics
4,3D Computer Graphics Software


In [56]:
# Software development skills: expand the 'software-development' sub-domain
# seeds with their related Emsi skills and keep the de-duplicated result.
df_sd = get_all_skills(df_cs, 'software-development')
df_sd.head()

1265

4.980237154150197% done
9.960474308300395% done
14.940711462450592% done
19.92094861660079% done
24.90118577075099% done
29.881422924901184% done
34.86166007905138% done
39.84189723320158% done
44.82213438735178% done
49.80237154150198% done
54.78260869565217% done
59.76284584980237% done
64.74308300395258% done
69.72332015810277% done
74.70355731225297% done
79.68379446640316% done
84.66403162055336% done
89.64426877470356% done
94.62450592885375% done
99.60474308300395% done
                                         Skill
count                                     3111
unique                                    2634
top     Relational Database Management Systems
freq                                         5


Unnamed: 0,Skill
0,HyperText Markup Language (HTML)
1,HTML5
2,Dynamic HTML
3,Semantic HTML
4,Character Encodings In HTML


In [58]:
# Persist each de-duplicated computer-science skill list as a CSV without the index column.
# NOTE(review): 'algorithm.csv' and 'computer-security-and-network.csv' drop the
# trailing 's' of their sub-domain names; left unchanged in case other code reads
# these paths.
df_al.to_csv('skills/algorithm.csv', index=False)
df_csn.to_csv('skills/computer-security-and-network.csv', index=False)
df_dp.to_csv('skills/design-and-product.csv', index=False)
df_mwd.to_csv('skills/mobile-and-web-development.csv', index=False)
# The closing quote used to sit after `index=False`, so the frame was written to a
# file literally named 'skills/software-development.csv, index=False' with the
# index column included.
df_sd.to_csv('skills/software-development.csv', index=False)