In [1]:
import pandas as pd
import requests
import emsi
from secrets import secrets

### Get access to Emsi

In [2]:
# Exchange the stored Emsi client credentials for an access token, then
# confirm the service responds before issuing any skill lookups.
emsi_client_id = secrets['emsi_client_id']
emsi_secret = secrets['emsi_secret']
access_token = emsi.get_access_token(emsi_client_id, emsi_secret)
emsi.check_health(access_token)

{'healthy': True, 'message': 'Service is healthy'}

### Load Coursera skills

In [5]:
# Read the Coursera skills table and discard the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved — confirm).
df = (
    pd.read_csv('skills/coursera_full.csv')
    .drop('Unnamed: 0', axis='columns')
)
df.head()

Unnamed: 0,Skill,Domain,Sub-Domain
0,Journalism,arts-and-humanities,history
1,Content Marketing,arts-and-humanities,history
2,Storytelling,arts-and-humanities,history
3,Advertising,arts-and-humanities,history
4,Marketing,arts-and-humanities,history


In [6]:
# Restrict the table to rows whose Domain is 'data-science'
df_ds = df[df['Domain'].eq('data-science')]
df_ds.head()

Unnamed: 0,Skill,Domain,Sub-Domain
7804,Microsoft Excel Vba,data-science,data-analysis
7805,Tableau Software,data-science,data-analysis
7806,Analytics,data-science,data-analysis
7807,Data Clustering Algorithms,data-science,data-analysis
7808,Software,data-science,data-analysis


### Get related skills from Emsi

In [16]:
def get_related_skills(skills, print_percent=5):
    """Collect the names of the Emsi skills related to each input skill.

    Calls ``emsi.get_emsi_skills`` once per entry of ``skills`` using the
    module-level ``access_token`` and gathers the ``'name'`` field of every
    item returned. Duplicates are kept; de-duplication is left to the caller.

    Args:
        skills: Sequence of skill names to look up (must support ``len``).
        print_percent: Approximate progress-report interval, expressed as a
            percentage of ``len(skills)``.

    Returns:
        list: Related skill names in lookup order, duplicates included.
    """
    total = len(skills)
    # int() truncates to 0 for short lists (e.g. 10 skills at 5%), which made
    # the modulo below raise ZeroDivisionError; report at least every item.
    step = max(1, int(total * print_percent / 100))
    new_skills = []
    for i, s in enumerate(skills):
        if (i + 1) % step == 0:
            print("{}% done".format((i + 1) / total * 100))
        related = emsi.get_emsi_skills(s, access_token)
        new_skills.extend(r['name'] for r in related)
    return new_skills

In [7]:
# List the distinct sub-domains present in the data-science subset.
df_ds['Sub-Domain'].unique()

array(['data-analysis', 'machine-learning', 'probability-and-statistics'],
      dtype=object)

In [10]:
# Skill names listed under the data-analysis sub-domain
da_initial = df_ds.loc[df_ds['Sub-Domain'] == 'data-analysis', 'Skill'].tolist()
len(da_initial)

926

In [17]:
# One emsi.get_emsi_skills call is made per entry of da_initial; progress is
# printed at roughly 5% intervals (the default print_percent).
da_skills = get_related_skills(da_initial)

4.967602591792657% done
9.935205183585314% done
14.902807775377969% done
19.870410367170628% done
24.838012958963283% done
29.805615550755938% done
34.77321814254859% done
39.740820734341256% done
44.70842332613391% done
49.676025917926566% done
54.64362850971922% done
59.611231101511876% done
64.57883369330453% done
69.54643628509719% done
74.51403887688986% done
79.48164146868251% done
84.44924406047517% done
89.41684665226782% done
94.38444924406048% done
99.35205183585313% done


In [24]:
# Put the related skill names into a single-column frame and summarise it
df_da = pd.DataFrame({'Skill': da_skills})
df_da.describe(include='object')

Unnamed: 0,Skill
count,2640
unique,2241
top,Machine Learning Algorithms
freq,5


In [25]:
# Remove repeated skill names. drop_duplicates keeps the first occurrence and
# the original row labels, so the index may have gaps afterwards.
df_da = df_da.drop_duplicates()
df_da.head()

Unnamed: 0,Skill
0,Analytics
1,Analytical Thinking
2,Google Analytics
3,Business Analytics
4,Analytical Techniques


In [26]:
# Skill names listed under the machine-learning sub-domain
ml_initial = df_ds.loc[df_ds['Sub-Domain'] == 'machine-learning', 'Skill'].tolist()
len(ml_initial)

646

In [27]:
# One emsi.get_emsi_skills call is made per entry of ml_initial; progress is
# printed at roughly 5% intervals (the default print_percent).
ml_skills = get_related_skills(ml_initial)

4.953560371517028% done
9.907120743034056% done
14.860681114551083% done
19.814241486068113% done
24.76780185758514% done
29.721362229102166% done
34.6749226006192% done
39.628482972136226% done
44.58204334365325% done
49.53560371517028% done
54.4891640866873% done
59.44272445820433% done
64.39628482972137% done
69.3498452012384% done
74.30340557275542% done
79.25696594427245% done
84.21052631578947% done
89.1640866873065% done
94.11764705882352% done
99.07120743034056% done


In [28]:
# Put the related skill names into a single-column frame and summarise it
df_ml = pd.DataFrame({'Skill': ml_skills})
df_ml.describe(include='object')

Unnamed: 0,Skill
count,1735
unique,1549
top,Machine Learning Algorithms
freq,4


In [29]:
# Remove repeated skill names. drop_duplicates keeps the first occurrence and
# the original row labels, so the index may have gaps afterwards.
df_ml = df_ml.drop_duplicates()
df_ml.head()

Unnamed: 0,Skill
0,Document Classification
1,Hyperparameter Optimization
2,Regression Testing
3,Regression Analysis
4,Logistic Regression


In [30]:
# Skill names listed under the probability-and-statistics sub-domain
ps_initial = df_ds.loc[df_ds['Sub-Domain'] == 'probability-and-statistics', 'Skill'].tolist()
len(ps_initial)

204

In [31]:
# One emsi.get_emsi_skills call is made per entry of ps_initial; progress is
# printed at roughly 5% intervals (the default print_percent).
ps_skills = get_related_skills(ps_initial)

4.901960784313726% done
9.803921568627452% done
14.705882352941178% done
19.607843137254903% done
24.509803921568626% done
29.411764705882355% done
34.31372549019608% done
39.21568627450981% done
44.11764705882353% done
49.01960784313725% done
53.92156862745098% done
58.82352941176471% done
63.725490196078425% done
68.62745098039215% done
73.52941176470588% done
78.43137254901961% done
83.33333333333334% done
88.23529411764706% done
93.13725490196079% done
98.0392156862745% done


In [32]:
# Put the related skill names into a single-column frame and summarise it
df_ps = pd.DataFrame({'Skill': ps_skills})
df_ps.describe(include='object')

Unnamed: 0,Skill
count,474
unique,426
top,Statistical Hypothesis Testing
freq,4


In [33]:
# Remove repeated skill names. drop_duplicates keeps the first occurrence and
# the original row labels, which is why the index shown below skips a value.
df_ps = df_ps.drop_duplicates()
df_ps.head()

Unnamed: 0,Skill
0,Biostatistics
1,Odds Ratios (Epidemiology)
3,Bootstrap (Front-End Framework)
4,Spring Boot
5,Booting (BIOS)


In [35]:
# Write each de-duplicated skill list to its own CSV, without the row index
for frame, path in (
    (df_da, 'skills/data_analysis.csv'),
    (df_ml, 'skills/machine_learning.csv'),
    (df_ps, 'skills/probability_and_statistics.csv'),
):
    frame.to_csv(path, index=False)