In [19]:
import pandas as pd

all_skills = 'skills/Skills.xlsx'
digital_skills = 'skills/Digital Skills.xlsx'
coursera_skills = 'coursera_skills/Coursera Skills To DB.xlsx'

In [20]:
def extract_skills(file_name):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    all_skills = []
    for key, df in dfs.items():
        all_skills.extend(df['Skill'].unique().tolist())
    df_skills = pd.DataFrame({'skill': all_skills})
    return df_skills.drop_duplicates()

def parent_check(file_name, df_skills):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    for key, df in dfs.items():
        df_error = df.loc[~df['Parent'].isin(df_skills['skill'])].dropna()
        print(df_error['Parent'].unique().tolist())

def extract_relation(file_name, df_skills):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    df_rel = pd.DataFrame(columns=['Skill', 'Parent'])
    skill_to_id_dict = df_skills.set_index('skill').to_dict()['skill_id']
    for key, df in dfs.items():
        df_rel = df_rel.append(df.replace(skill_to_id_dict))
    df_rel = df_rel.reset_index(drop=True)
    df_rel = df_rel.dropna()
    df_rel = df_rel.drop_duplicates()
    df_rel['Parent'] = df_rel['Parent'].astype(int)
    return df_rel

### Extract Skills

In [21]:
# All skills
df_s1 = extract_skills(all_skills)
df_s1.head()

Unnamed: 0,skill
0,Applied Science
1,Art
2,Business
3,Computer Science
4,Data Science


In [22]:
df_s1.describe(include='object')

Unnamed: 0,skill
count,2297
unique,2297
top,Aspect-Oriented Programming
freq,1


In [23]:
# Digital skills
df_s2 = extract_skills(digital_skills)
df_s2.head()

Unnamed: 0,skill
0,Data Analysis
1,Machine Learning
2,Algorithm
3,Artificial Intelligence
4,Big Data


In [24]:
df_s2.describe(include='object')

Unnamed: 0,skill
count,1021
unique,1021
top,Apache Kafka
freq,1


In [25]:
# Combine
df_skills = df_s1.append(df_s2)
df_skills = df_skills.drop_duplicates()
df_skills = df_skills.dropna()
df_skills.describe()

Unnamed: 0,skill
count,3129
unique,3129
top,Aspect-Oriented Programming
freq,1


In [26]:
df_skills['skill_id'] = df_skills.groupby('skill').grouper.group_info[0] + 1
df_skills = df_skills.reset_index(drop=True)
df_skills = df_skills[['skill_id', 'skill']]
df_skills.head()

Unnamed: 0,skill_id,skill
0,153,Applied Science
1,166,Art
2,328,Business
3,555,Computer Science
4,724,Data Science


In [27]:
df_skills.describe(include='all')

Unnamed: 0,skill_id,skill
count,3129.0,3129
unique,,3129
top,,Aspect-Oriented Programming
freq,,1
mean,1565.0,
std,903.408822,
min,1.0,
25%,783.0,
50%,1565.0,
75%,2347.0,


### Coursera Skills

In [28]:
df_c = pd.read_excel(coursera_skills)
df_c.head()

Unnamed: 0,skill_id,c_skill,Skill,Parent
0,457,Low Poly,Low Poly,3D Computer Graphics
1,462,Polygon Mesh,Polygon Mesh,3D Computer Graphics
2,466,Uv Mapping,Uv Mapping,3D Computer Graphics
3,467,Framing,Framing,3D Computer Graphics
4,469,3D Graphics Software,3D Graphics Software,3D Computer Graphics


In [29]:
df_coursera = df_c[['skill_id', 'c_skill']]
df_coursera = df_coursera.rename(columns={'c_skill': 'skill'})
df_coursera.head()

Unnamed: 0,skill_id,skill
0,457,Low Poly
1,462,Polygon Mesh
2,466,Uv Mapping
3,467,Framing
4,469,3D Graphics Software


In [30]:
df_others = df_c[['Skill']].rename(columns={'Skill': 'skill'})
df_others = df_others.drop_duplicates()
df_others = df_others.dropna()
df_others['skill_id'] = df_others.index + df_skills['skill_id'].max()
df_others.head()

Unnamed: 0,skill,skill_id
0,Low Poly,3129
1,Polygon Mesh,3130
2,Uv Mapping,3131
3,Framing,3132
4,3D Graphics Software,3133


In [31]:
# Combine
df_all = df_skills.append(df_others)
df_all = df_all.drop_duplicates(subset=['skill'])
df_all = df_all.reset_index(drop=True)
df_all = df_all[['skill_id', 'skill']]
df_all.head()

Unnamed: 0,skill_id,skill
0,153,Applied Science
1,166,Art
2,328,Business
3,555,Computer Science
4,724,Data Science


In [32]:
df_all.describe(include='all')

Unnamed: 0,skill_id,skill
count,9518.0,9518
unique,,9518
top,,Customer Success
freq,,1
mean,5182.914058,
std,3095.742241,
min,1.0,
25%,2380.25,
50%,5266.5,
75%,7926.75,


### Data Checking/Cleaning

In [33]:
parent_check(all_skills, df_skills)

[]
[]
[]
[]
[]
[]
[]
[]
[]


In [34]:
parent_check(digital_skills, df_skills)

[]
[]
[]
[]
[]
[]


In [35]:
parent_check(coursera_skills, df_all)

[]


In [36]:
df_skills.loc[df_skills['skill'].str.contains('Java')]

Unnamed: 0,skill_id,skill
433,1497,Java
434,1507,JavaScript
867,1511,Javanese
2373,1500,Java Data Mining
2374,1501,Java Persistence Query Language
2506,1498,Java Cryptography Architecture
2507,1499,Java Cryptography Extension
2549,1509,JavaScript Object Notation (JSON)
2564,1502,Java Programming Language Compiler (Javac)
2565,1505,Java Syntax


### Extract Skill Relation

In [37]:
# Skills
df_r1 = extract_relation(all_skills, df_skills)
df_r2 = extract_relation(digital_skills, df_skills)
df_rel = df_r1.append(df_r2)
df_rel = df_rel.drop_duplicates()
df_rel = df_rel.reset_index(drop=True)
df_rel = df_rel.rename(columns={'Skill': 'skill_1_id', 'Parent': 'skill_2_id'})
df_rel.head()

Unnamed: 0,skill_1_id,skill_2_id
0,1592,1901
1,2170,1901
2,2861,555
3,2861,1721
4,23,328


In [38]:
df_rel.describe(include='all')

Unnamed: 0,skill_1_id,skill_2_id
count,3798.0,3798.0
unique,3121.0,
top,1936.0,
freq,7.0,
mean,,1451.255398
std,,883.410388
min,,1.0
25%,,702.0
50%,,1571.0
75%,,2199.0


In [39]:
# Include coursera skills
df_r3 = extract_relation(coursera_skills, df_all)
# Combine
df_rel_all = df_rel.append(df_r3[['Skill', 'Parent']])
df_rel_all = df_rel_all.drop_duplicates()
df_rel_all = df_rel_all.reset_index(drop=True)
df_rel_all = df_rel_all.rename(columns={'Skill': 'skill_1_id', 'Parent': 'skill_2_id'})
df_rel_all.head()

Unnamed: 0,skill_1_id,skill_2_id,skill_1_id.1,skill_2_id.1
0,1592,1901.0,,
1,2170,1901.0,,
2,2861,555.0,,
3,2861,1721.0,,
4,23,328.0,,


In [40]:
df_rel_all.describe(include='all')

Unnamed: 0,skill_1_id,skill_2_id,skill_1_id.1,skill_2_id.1
count,3798.0,3798.0,2349.0,2349.0
unique,3121.0,,2343.0,
top,1936.0,,4070.0,
freq,7.0,,2.0,
mean,,1451.255398,,1707.96126
std,,883.410388,,1340.160443
min,,1.0,,4.0
25%,,702.0,,785.0
50%,,1571.0,,1602.0
75%,,2199.0,,2308.0


### Extract Coursera Relation

In [41]:
df_crel = df_c[['skill_id', 'Skill']].rename(columns={'skill_id': 'coursera_skill_id', 'Skill': 'skill'})\
    .merge(df_all, how='left').drop(columns=['skill'])
df_crel.head()

Unnamed: 0,coursera_skill_id,skill_id
0,457,3129
1,462,3130
2,466,3131
3,467,3132
4,469,3133


In [42]:
df_crel.describe(include='all')

Unnamed: 0,coursera_skill_id,skill_id
count,7301.0,7301.0
mean,3650.941652,6280.034927
std,2107.856673,2663.174218
min,0.0,2.0
25%,1826.0,4315.0
50%,3651.0,6529.0
75%,5476.0,8525.0
max,7301.0,10429.0


### Export

In [43]:
df_skills.to_csv('skills_db/skill.csv', index=False)
df_rel.to_csv('skills_db/skill_tree.csv', index=False)
df_all.to_csv('skills_db/all_skill.csv', index=False)
df_rel_all.to_csv('skills_db/all_skill_tree.csv', index=False)
df_coursera.to_csv('skills_db/coursera_skill.csv', index=False)
df_crel.to_csv('skills_db/coursera_map.csv', index=False)