In [1]:
import pandas as pd

In [2]:
# Workbook that holds the skill sheets.
file_name = 'skills/Skills.xlsx'

# Parse every sheet into its own DataFrame, keyed by sheet name.
# The context manager closes the workbook handle as soon as parsing is done
# instead of leaving it open for the lifetime of the kernel.
with pd.ExcelFile(file_name) as xl_file:
    dfs = {sheet_name: xl_file.parse(sheet_name)
           for sheet_name in xl_file.sheet_names}

# Show which sheets were loaded.
dfs.keys()

dict_keys(['Parent 1', 'Parent 2', 'Parent 3', 'Parent 4', 'Parent 5'])

### Get All Skills

In [3]:
# Gather the distinct 'Skill' values of every sheet into one flat list.
# Values are de-duplicated per sheet only, so a skill that appears on
# several sheets is listed once per sheet.
all_skills = [
    skill
    for sheet_df in dfs.values()
    for skill in sheet_df['Skill'].unique().tolist()
]

len(all_skills)

861

In [4]:
# One row per collected skill name (duplicates across sheets still present).
df_skills = pd.DataFrame({'skill': all_skills})

# Assign each distinct skill name a dense integer id, numbered in sorted
# order of the names (missing names get -1). pd.factorize is the public API
# for this; the previous groupby(...).grouper.group_info[0] reached into
# deprecated groupby internals to obtain the same codes.
df_skills['skill_id'] = pd.factorize(df_skills['skill'], sort=True)[0]

# Put the id first for readability.
df_skills = df_skills[['skill_id', 'skill']]
df_skills.head()

Unnamed: 0,skill_id,skill
0,41,Applied Science
1,51,Arts
2,87,Business
3,163,Computer Science
4,201,Data Science


In [5]:
# Summarize the object-dtype column(s) of df_skills (count / unique / top / freq);
# a count larger than unique means some skill names occur more than once.
df_skills.describe(include='object')

Unnamed: 0,skill
count,861
unique,806
top,Mathematical Logic
freq,3


In [6]:
# Collapse repeated (skill_id, skill) rows, keeping the first occurrence of
# each, so every skill is listed exactly once.
df_skills = df_skills.drop_duplicates(keep='first')

# Re-run the summary: count should now equal unique.
df_skills.describe(include='object')

Unnamed: 0,skill
count,806
unique,806
top,Political Geography
freq,1


### Data Checking/Cleaning

In [7]:
# For each sheet, print the distinct 'Parent' values that are not present in
# the skill table. An empty array means every parent resolves to a known skill.
for sheet_df in dfs.values():
    has_known_parent = sheet_df['Parent'].isin(df_skills['skill'])
    print(sheet_df.loc[~has_known_parent, 'Parent'].unique())

[nan]
[]
[]
[]
[]


### Skills Relation Table

In [8]:
# Lookup from skill name to its integer id.
skill_to_id_dict = df_skills.set_index('skill')['skill_id'].to_dict()

# Replace skill names with their ids in every sheet, then stack all sheets
# into one relation table with a fresh 0..n-1 index.
# A single pd.concat replaces the former DataFrame.append loop: append was
# removed in pandas 2.0 and re-copied the accumulated frame on every
# iteration.
df_rel = pd.concat(
    [sheet_df.replace(skill_to_id_dict) for sheet_df in dfs.values()],
    ignore_index=True,
)
df_rel.head()

Unnamed: 0,Skill,Parent
0,41,
1,51,
2,87,
3,163,
4,201,


In [9]:
# Summary statistics over every column of the relation table, regardless of
# dtype; a lower count in one column indicates missing values there.
df_rel.describe(include='all')

Unnamed: 0,Skill,Parent
count,896.0,880.0
unique,806.0,
top,463.0,
freq,4.0,
mean,,375.281818
std,,229.212303
min,,9.0
25%,,163.0
50%,,433.0
75%,,597.0


In [10]:
# Clean the relation table in one pass:
#   1. drop rows with any missing value (e.g. skills that have no parent),
#   2. drop exact duplicate rows,
#   3. cast 'Parent' to int now that it no longer contains NaN.
df_rel = (
    df_rel
    .dropna()
    .drop_duplicates()
    .assign(Parent=lambda rel: rel['Parent'].astype(int))
)
df_rel.describe(include='all')

Unnamed: 0,Skill,Parent
count,879.0,879.0
unique,797.0,
top,463.0,
freq,4.0,
mean,,375.177474
std,,229.321884
min,,9.0
25%,,163.0
50%,,433.0
75%,,597.0


### Export

In [11]:
# Write the skill table and the skill/parent relation table to CSV.
# to_csv is called with its defaults, so each file also carries the frame's
# index as its first column.
for table, csv_path in (
    (df_skills, 'skills_db/skill.csv'),
    (df_rel, 'skills_db/skill_tree.csv'),
):
    table.to_csv(csv_path)