In [None]:
import pandas as pd
from difflib import get_close_matches as gcm

# Workbook holding the full ("all") skill list; its sheets are read through their 'Skill' and 'Parent' columns.
all_skills = 'skills/Skills.xlsx'
# Workbook holding the digital skill list; processed by the same helpers as all_skills.
digital_skills = 'skills/Digital Skills.xlsx'
# Coursera mapping workbook: loaded with pd.read_excel and also passed to parent_check / extract_relation.
coursera_skills = 'coursera_skills/Coursera Skills To DB.xlsx'

In [None]:
def extract_skills(file_name):
    """Collect the distinct values of the ``Skill`` column across all sheets.

    Parameters
    ----------
    file_name : str or path-like
        Excel workbook to read. Every sheet must contain a ``Skill`` column.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (``skill``) with duplicates removed in first-seen
        order. Index labels are not renumbered, and a missing value is kept
        once if any sheet contains one (callers drop it later).

    Raises
    ------
    KeyError
        If a sheet has no ``Skill`` column.
    """
    # The context manager releases the workbook handle even if a sheet fails
    # to parse; the bare ExcelFile object was previously never closed.
    with pd.ExcelFile(file_name) as workbook:
        sheets = [workbook.parse(sheet_name) for sheet_name in workbook.sheet_names]

    # Named so it does not shadow the module-level ``all_skills`` path.
    collected = []
    for sheet in sheets:
        collected.extend(sheet['Skill'].unique().tolist())
    return pd.DataFrame({'skill': collected}).drop_duplicates()

def parent_check(file_name, df_skills):
    """Print, for each sheet, the ``Parent`` values that are not known skills.

    One list is printed per sheet, in sheet order; an empty list means every
    non-blank parent of that sheet exists in ``df_skills``.

    Parameters
    ----------
    file_name : str or path-like
        Excel workbook whose sheets contain a ``Parent`` column.
    df_skills : pandas.DataFrame
        Frame with a ``skill`` column listing the accepted skill names.

    Raises
    ------
    KeyError
        If a sheet has no ``Parent`` column.
    """
    known_skills = df_skills['skill']
    # Close the workbook handle deterministically instead of leaking it.
    with pd.ExcelFile(file_name) as workbook:
        sheets = [workbook.parse(sheet_name) for sheet_name in workbook.sheet_names]

    for sheet in sheets:
        # Only a blank Parent cell is ignored. Dropping rows with a blank in
        # *any* column (the previous behaviour) could hide an unknown parent
        # whenever an unrelated cell of the same row was empty.
        parents = sheet['Parent'].dropna()
        unknown = parents[~parents.isin(known_skills)]
        print(unknown.unique().tolist())

def extract_relation(file_name, df_skills):
    """Build the skill/parent relation table of a workbook using skill ids.

    Every sheet is parsed and each cell equal to a known skill name is
    replaced by that skill's id (the replacement runs over the whole sheet,
    not only ``Skill``/``Parent``). The sheets are stacked, rows containing
    any missing value are dropped, duplicate rows are removed and ``Parent``
    is cast to ``int``.

    Parameters
    ----------
    file_name : str or path-like
        Excel workbook whose sheets contain ``Skill`` and ``Parent`` columns.
    df_skills : pandas.DataFrame
        Frame with ``skill`` (name) and ``skill_id`` columns.

    Returns
    -------
    pandas.DataFrame
        ``Skill`` and ``Parent`` first, followed by any other sheet columns.
        Index labels are row positions in the stacked sheets, so they have
        gaps where rows were filtered out.

    Raises
    ------
    ValueError
        If a ``Parent`` value is still not convertible to ``int`` after the
        replacement, e.g. a parent name that is missing from ``df_skills``.
    """
    name_to_id = df_skills.set_index('skill')['skill_id'].to_dict()

    # Close the workbook handle deterministically instead of leaking it.
    with pd.ExcelFile(file_name) as workbook:
        frames = [
            workbook.parse(sheet_name).replace(name_to_id)
            for sheet_name in workbook.sheet_names
        ]

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame once per sheet.
    df_rel = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # Keep the historical column layout: Skill and Parent lead, and both exist
    # even when no sheet provided them.
    leading = ['Skill', 'Parent']
    trailing = [column for column in df_rel.columns if column not in leading]
    df_rel = df_rel.reindex(columns=leading + trailing)

    df_rel = df_rel.dropna().drop_duplicates()
    return df_rel.assign(Parent=df_rel['Parent'].astype(int))

### Extract Skills

In [None]:
# All skills: distinct 'Skill' values gathered from every sheet of the all_skills workbook.
df_s1 = extract_skills(all_skills)
df_s1.head()

In [None]:
# Summary of the object-typed column(s) of df_s1, as a quick sanity check of the extracted names.
df_s1.describe(include='object')

In [None]:
# Digital skills: distinct 'Skill' values gathered from every sheet of the digital_skills workbook.
df_s2 = extract_skills(digital_skills)
df_s2.head()

In [None]:
# Summary of the object-typed column(s) of df_s2, as a quick sanity check of the extracted names.
df_s2.describe(include='object')

In [None]:
# Combine both skill lists, then drop repeated names and missing values.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_skills = (
    pd.concat([df_s1, df_s2])
    .drop_duplicates()
    .dropna()
)
df_skills.describe()

In [None]:
# Ids are 1-based row positions, so they depend on the order in which the
# skills were collected above.
df_skills = (
    df_skills
    .reset_index(drop=True)
    .assign(skill_id=lambda frame: frame.index + 1)
    [['skill_id', 'skill']]
)
df_skills.head()

In [None]:
# Summary of every column of the id-tagged skill table (ids and names).
df_skills.describe(include='all')

### Coursera Skills

In [None]:
# Load the Coursera workbook with pd.read_excel defaults; the cells below read its
# 'skill_id', 'c_skill' and 'Skill' columns.
df_c = pd.read_excel(coursera_skills)
df_c.head()

In [None]:
# Coursera's own skill table: keep its id and expose the Coursera name under
# the common 'skill' header.
df_coursera = df_c[['skill_id', 'c_skill']].rename(columns={'c_skill': 'skill'})
df_coursera.head()

In [None]:
# Skills referenced by the Coursera sheet, one row per distinct non-missing name.
df_others = (
    df_c[['Skill']]
    .rename(columns={'Skill': 'skill'})
    .drop_duplicates()
    .dropna()
)
# Offset by max + 1: the surviving row labels of df_c start at 0, so adding only
# the maximum would give the first new skill the id already held by the last
# entry of df_skills. Ids stay sparse because they follow the original row labels.
df_others['skill_id'] = df_others.index + df_skills['skill_id'].max() + 1
df_others.head()

In [None]:
# Combine: df_skills comes first so that, for a name present in both frames,
# drop_duplicates keeps the id already assigned in df_skills.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_all = (
    pd.concat([df_skills, df_others])
    .drop_duplicates(subset=['skill'])
    .reset_index(drop=True)
    [['skill_id', 'skill']]
)
df_all.head()

In [None]:
# Summary of every column of the combined skill table.
df_all.describe(include='all')

### Data (Parent) Checking/Cleaning

In [None]:
# Prints one list per sheet of the all_skills workbook: parents that are not in df_skills.
parent_check(all_skills, df_skills)

In [None]:
# Same check for the digital_skills workbook.
parent_check(digital_skills, df_skills)

In [None]:
# The Coursera workbook is validated against df_all, which also contains the skills added from df_c.
parent_check(coursera_skills, df_all)

### Data (Duplicate) Checking/Cleaning

In [None]:
def skill_dup_check(reverse=False, skills=None, cutoff=0.9):
    """Find skill names that closely match another name in the list.

    Each name is compared only with the names on one side of it, so a pair is
    reported once per direction.

    Parameters
    ----------
    reverse : bool, default False
        ``False`` compares each name with the names listed after it;
        ``True`` walks the list from the end and compares each name with the
        names listed before it.
    skills : iterable of str, optional
        Names to inspect. Defaults to the ``skill`` column of the module-level
        ``df_skills`` frame, which is the original behaviour.
    cutoff : float, default 0.9
        Similarity threshold handed to ``difflib.get_close_matches``.

    Returns
    -------
    dict
        Maps a name to the list of close matches found for it. Names without
        matches are omitted.
    """
    skill_list = df_skills['skill'].tolist() if skills is None else list(skills)
    total = len(skill_list)
    pos_dups = {}
    for i in range(total):
        if reverse:
            pivot = total - 1 - i
            skill, remaining = skill_list[pivot], skill_list[:pivot]
        else:
            skill, remaining = skill_list[i], skill_list[i + 1:]
        pos_dup = gcm(skill, remaining, cutoff=cutoff)
        if pos_dup:
            pos_dups[skill] = pos_dup
    return pos_dups

In [None]:
pos_dups = skill_dup_check()

# One line per skill that has close matches among the skills listed after it.
for s, matches in pos_dups.items():
    print('{}: {}'.format(s, matches))

In [None]:
pos_dups = skill_dup_check(reverse=True)

# One line per skill that has close matches among the skills listed before it.
for s, matches in pos_dups.items():
    print('{}: {}'.format(s, matches))

### Extract Skill Relation

In [None]:
# Skills: relation rows of both workbooks, expressed with the ids of df_skills.
df_r1 = extract_relation(all_skills, df_skills)
df_r2 = extract_relation(digital_skills, df_skills)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_rel = (
    pd.concat([df_r1, df_r2])
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns={'Skill': 'skill_1_id', 'Parent': 'skill_2_id'})
)
df_rel.head()

In [None]:
# Summary of every column of the skill relation table.
df_rel.describe(include='all')

In [None]:
# Include coursera skills
df_r3 = extract_relation(coursera_skills, df_all)
# Combine. Both inputs are brought to the skill_1_id / skill_2_id headers
# *before* stacking: df_rel is already renamed at this point, so stacking it
# with the Skill / Parent columns of df_r3 produced four half-empty columns
# and, after the rename, duplicate headers. Renaming is a no-op for columns
# that already carry the target names.
id_columns = {'Skill': 'skill_1_id', 'Parent': 'skill_2_id'}
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_rel_all = (
    pd.concat([
        df_rel.rename(columns=id_columns),
        df_r3[['Skill', 'Parent']].rename(columns=id_columns),
    ])
    .drop_duplicates()
    .reset_index(drop=True)
)
df_rel_all.head()

In [None]:
# Summary of every column of the relation table that includes the Coursera workbook.
df_rel_all.describe(include='all')

### Extract Coursera Relation

In [None]:
# Map each Coursera skill id to the id of the skill named in its 'Skill' cell.
# 'skill' is the only column shared with df_all, so it is the join key.
# NOTE(review): this is a left join, so a row whose name has no match in df_all
# keeps a missing skill_id - confirm that is acceptable in the exported map.
df_crel = (
    df_c[['skill_id', 'Skill']]
    .rename(columns={'skill_id': 'coursera_skill_id', 'Skill': 'skill'})
    .merge(df_all, how='left', on='skill')
    .drop(columns=['skill'])
)
df_crel.head()

In [None]:
# Summary of every column of the Coursera-to-skill id map.
df_crel.describe(include='all')

### No Parent Check

In [None]:
# Translate the relation ids back to names so the table can be read directly.
df_st = df_rel.replace(df_skills.set_index('skill_id').to_dict()['skill'])\
    .rename(columns={'skill_1_id': 'skill', 'skill_2_id': 'parent'})

# Build the lookup once: rebuilding df_st['skill'].tolist() for every skill
# made this check quadratic in the number of skills.
skills_with_parent = set(df_st['skill'].tolist())
# Skills that never appear on the child side of a relation, in df_skills order.
no_parents = [
    skill for skill in df_skills['skill'].tolist()
    if skill not in skills_with_parent
]

no_parents

In [None]:
# Plain-text form of the same list, printed as a single Python list literal.
print(no_parents)

In [None]:
def check_sheet(file_name, to_check):
    """Print which sheets of a workbook list each of the given skills.

    For every sheet, in workbook order, one ``<skill> : <sheet name>`` line is
    printed for each entry of ``to_check`` found in that sheet's ``Skill``
    column.

    Parameters
    ----------
    file_name : str or path-like
        Excel workbook whose sheets contain a ``Skill`` column.
    to_check : iterable
        Skill names to look for.

    Raises
    ------
    KeyError
        If a sheet has no ``Skill`` column.
    """
    # Close the workbook handle deterministically instead of leaking it.
    with pd.ExcelFile(file_name) as workbook:
        for sheet_name in workbook.sheet_names:
            # Materialise the column once per sheet rather than once per
            # searched name.
            sheet_skills = workbook.parse(sheet_name)['Skill'].tolist()
            for s in to_check:
                if s in sheet_skills:
                    print(s, ':', sheet_name)

In [None]:
# Skill names to locate, sheet by sheet, in both skill workbooks.
to_check = ['Rstudio']

check_sheet(all_skills, to_check)
# Empty line separating the output of the two workbooks.
print()
check_sheet(digital_skills, to_check)

### Export

In [None]:
# Each table is written without its index; the order matches the cells above.
exports = [
    (df_skills, 'skills_db/skill.csv'),
    (df_rel, 'skills_db/skill_tree.csv'),
    (df_all, 'skills_db/all_skill.csv'),
    (df_rel_all, 'skills_db/all_skill_tree.csv'),
    (df_coursera, 'skills_db/coursera_skill.csv'),
    (df_crel, 'skills_db/coursera_map.csv'),
]
for frame, path in exports:
    frame.to_csv(path, index=False)