In [21]:
import pandas as pd

all_skills = 'skills/Skills.xlsx'
digital_skills = 'skills/Digital Skills.xlsx'
coursera_skills = 'coursera_skills/Coursera Skills To DB.xlsx'

In [78]:
def extract_skills(file_name):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    all_skills = []
    for key, df in dfs.items():
        all_skills.extend(df['Skill'].unique().tolist())
    df_skills = pd.DataFrame({'skill': all_skills})
    return df_skills.drop_duplicates()

def parent_check(file_name, df_skills):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    for key, df in dfs.items():
        df_error = df.loc[~df['Parent'].isin(df_skills['skill'])]
        print(df_error['Parent'].unique())

def extract_relation(file_name, df_skills):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    df_rel = pd.DataFrame(columns=['Skill', 'Parent'])
    skill_to_id_dict = df_skills.set_index('skill').to_dict()['skill_id']
    for key, df in dfs.items():
        df_rel = df_rel.append(df.replace(skill_to_id_dict))
    df_rel = df_rel.reset_index(drop=True)
    df_rel = df_rel.dropna()
    df_rel = df_rel.drop_duplicates()
    df_rel['Parent'] = df_rel['Parent'].astype(int)
    return df_rel

### Extract Skills

In [23]:
# All skills
df_s1 = extract_skills(all_skills)
df_s1.head()

Unnamed: 0,skill
0,Applied Science
1,Arts
2,Business
3,Computer Science
4,Data Science


In [24]:
df_s1.describe(include='object')

Unnamed: 0,skill
count,1687
unique,1687
top,Predicate Logic
freq,1


In [25]:
# Digital skills
df_s2 = extract_skills(digital_skills)
df_s2.head()

Unnamed: 0,skill
0,Data Analysis
1,Machine Learning
2,Algorithm
3,Artificial Intelligence
4,Big Data


In [26]:
df_s2.describe(include='object')

Unnamed: 0,skill
count,979
unique,979
top,Object Oriented CSS
freq,1


In [66]:
# Combine
df_skills = df_s1.append(df_s2)
df_skills = df_skills.drop_duplicates()
df_skills = df_skills.dropna()
df_skills.describe()

Unnamed: 0,skill
count,2499
unique,2499
top,Object Oriented CSS
freq,1


### Coursera Skills

In [29]:
df_c = pd.read_excel(coursera_skills)
df_c.head()

Unnamed: 0,skill_id,c_skill,Skill,Parent
0,0,Journalism,Journalism,Writing
1,1,Content Marketing,Content Marketing,Marketing
2,2,Storytelling,Storytelling,Storytelling
3,3,Advertising,Advertising,Marketing
4,4,Marketing,Marketing,Business


In [30]:
df_coursera = df_c[['skill_id', 'c_skill']]
df_coursera = df_coursera.rename(columns={'c_skill': 'skill'})
df_coursera.head()

Unnamed: 0,skill_id,skill
0,0,Journalism
1,1,Content Marketing
2,2,Storytelling
3,3,Advertising
4,4,Marketing


In [67]:
df_others = df_c[['Skill']].rename(columns={'Skill': 'skill'}).dropna()
# Combine
df_skills = df_skills.append(df_others)
df_skills = df_skills.drop_duplicates()
df_skills['skill_id'] = df_skills.groupby('skill').grouper.group_info[0]
df_skills = df_skills.reset_index(drop=True)
df_skills = df_skills[['skill_id', 'skill']]
df_skills.head()

Unnamed: 0,skill_id,skill
0,417,Applied Science
1,477,Arts
2,994,Business
3,1641,Computer Science
4,2108,Data Science


In [68]:
df_skills.describe(include='all')

Unnamed: 0,skill_id,skill
count,9172.0,9172
unique,,9172
top,,Prejudice
freq,,1
mean,4585.5,
std,2647.872668,
min,0.0,
25%,2292.75,
50%,4585.5,
75%,6878.25,


### Data Checking/Cleaning

In [69]:
parent_check(all_skills, df_skills)

[nan]
[]
[]
[]
[]
[nan]
[]


In [70]:
parent_check(digital_skills, df_skills)

[nan]
[]
[]
[]
[]
[]


In [71]:
parent_check(coursera_skills, df_skills)

[nan]


### Extract Relation

In [79]:
df_r1 = extract_relation(all_skills, df_skills)
df_r2 = extract_relation(digital_skills, df_skills)
# Combine
df_rel = df_r1.append(df_r2)
df_rel = df_rel.drop_duplicates()
df_rel = df_rel.reset_index(drop=True)
df_rel = df_rel.rename(columns={'Skill': 'skill_1_id', 'Parent': 'skill_2_id'})
df_rel.head()

Unnamed: 0,skill_1_id,skill_2_id
0,4771,5619
1,6349,5619
2,8427,1641
3,8427,5123
4,76,994


In [80]:
df_rel.describe(include='all')

Unnamed: 0,skill_1_id,skill_2_id
count,2976.0,2976.0
unique,2490.0,
top,7296.0,
freq,7.0,
mean,,4451.893817
std,,2642.834963
min,,4.0
25%,,2108.0
50%,,4771.0
75%,,6696.0


In [None]:

    skill_to_id_dict = df_skills.set_index('skill').to_dict()['skill_id']
    for key, df in dfs.items():
        df_rel = df_rel.append(df.replace(skill_to_id_dict))
    df_rel = df_rel.reset_index(drop=True)
    df_rel = df_rel.dropna()
    df_rel = df_rel.drop_duplicates()
    df_rel['Parent'] = df_rel['Parent'].astype(int)

### Export

In [None]:
df_skills.to_csv('skills_db/skill.csv', index=False)
df_rel.to_csv('skills_db/skill_tree.csv', index=False)