In [None]:
from difflib import get_close_matches as gcm
from pathlib import Path

import pandas as pd

# Relative path of the Excel workbook that is opened with pd.ExcelFile below.
skills = 'skills/Skills v3.xlsx'

In [2]:
# Read every sheet of the workbook. Each sheet is tagged with its own name in a
# 'Domain' column so its rows stay attributable once the sheets are stacked.
# The context manager releases the workbook handle as soon as parsing is done.
with pd.ExcelFile(skills) as excel_file:
    dfs = {
        sheet_name: excel_file.parse(sheet_name).assign(Domain=sheet_name)
        for sheet_name in excel_file.sheet_names
    }

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies it on every pass; a single pd.concat stacks all sheets in one step.
# ignore_index is deliberately NOT set: each sheet keeps its own row numbers,
# and the row numbers of the 'D' sheet are used further down as domain Ids.
sheet_frames = [sheet_df.drop_duplicates() for sheet_df in dfs.values()]
df_all = (
    pd.concat(sheet_frames)
    if sheet_frames
    else pd.DataFrame(columns=['Skill', 'Domain'])  # workbook without sheets
)

df_all.head()

Unnamed: 0,Skill,Domain
0,Applied Science,D
1,Arts and Humanities,D
2,Business,D
3,Computer Science,D
4,Data Science,D


In [3]:
# Rows that came from the sheet named 'D'.
df_main = df_all.loc[df_all['Domain'].eq('D')]
df_main.head()

Unnamed: 0,Skill,Domain
0,Applied Science,D
1,Arts and Humanities,D
2,Business,D
3,Computer Science,D
4,Data Science,D


In [4]:
# Every row that did not come from sheet 'D', renumbered from 0 so the index
# is unique across the stacked sheets.
df_rest = (
    df_all
    .loc[df_all['Domain'].ne('D')]
    .reset_index(drop=True)
)
df_rest.head()

Unnamed: 0,Skill,Domain
0,Account Receivable,B
1,Accounting,B
2,Accounting Software,B
3,Accounting Standards,B
4,Accounts Payable,B


In [5]:
# Per-column summary of the skill rows: row count, number of distinct values,
# and the most frequent value with its frequency.
df_rest.describe()

Unnamed: 0,Skill,Domain
count,3149,3149
unique,3149,9
top,Version Control System,CS
freq,1,665


In [6]:
# Number of skills recorded under each domain code.
df_rest.groupby('Domain').count()

Unnamed: 0_level_0,Skill
Domain,Unnamed: 1_level_1
AH,258
AS,185
B,463
CS,665
DS,505
IT,550
ML,114
NS,119
SS,290


## Check exact duplicates

In [7]:
# Count how often each skill name occurs, then keep the rows whose name was
# seen more than once, sorted so that identical names sit next to each other.
v = df_rest['Skill'].value_counts()
df_dup = (
    df_rest
    .loc[df_rest['Skill'].isin(v[v > 1].index)]
    .sort_values('Skill')
)
df_dup.head()

Unnamed: 0,Skill,Domain


In [8]:
# Show the whole duplicate table rather than just its head; it has no rows
# when no skill name occurs more than once.
df_dup

Unnamed: 0,Skill,Domain


## Check near-duplicates (fuzzy and substring matches)

In [9]:
def skill_dup_check(df, reverse=False, cutoff=0.9, n=3):
    """Find skills whose names are nearly identical to another skill's name.

    Each skill is compared, via difflib.get_close_matches, against the skills
    on one side of it in the 'Skill' column, so a given pair is reported once
    rather than in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Skill' column. Missing values are skipped and the
        remaining values are compared as strings.
    reverse : bool, default False
        If False, walk the column top to bottom and compare each skill with
        the skills after it. If True, walk bottom to top and compare each
        skill with the skills before it.
    cutoff : float, default 0.9
        Minimum similarity ratio (0..1) for two names to count as a match.
    n : int, default 3
        Maximum number of matches reported per skill.

    Returns
    -------
    dict
        Maps a skill name to the list of close matches found for it. Skills
        without any match are left out.
    """
    # get_close_matches only works on strings; a blank Excel cell arrives as
    # NaN (a float) and would otherwise raise a TypeError.
    skill_list = df['Skill'].dropna().astype(str).tolist()
    pos_dups = {}
    for i in range(len(skill_list)):
        if reverse:
            skill = skill_list[-(i + 1)]
            remaining = skill_list[:-(i + 1)]
        else:
            skill = skill_list[i]
            remaining = skill_list[i + 1:]
        pos_dup = gcm(skill, remaining, n=n, cutoff=cutoff)
        if pos_dup:
            pos_dups[skill] = pos_dup
    return pos_dups

In [10]:
# The fuzzy duplicate scan below is wrapped in a string literal, so it is never
# executed; the cell merely echoes the text back as its output.
# NOTE(review): it refers to `df_skills`, which is not defined in any cell shown
# here - presumably `df_rest` was meant; confirm before re-enabling.
"""
pos_dups = skill_dup_check(df_skills)

for s in pos_dups:
    print('{}: {}'.format(s, pos_dups[s]))
"""

"\npos_dups = skill_dup_check(df_skills)\n\nfor s in pos_dups:\n    print('{}: {}'.format(s, pos_dups[s]))\n"

In [11]:
def skill_dup_check2(df, domain):
    """Find skills of one domain whose name is contained in other skill names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Skill' and 'Domain' columns. Missing skills are skipped
        and the remaining values are compared as strings.
    domain : str
        Value of the 'Domain' column to restrict the check to.

    Returns
    -------
    dict
        Maps a skill name to a '; '-joined string of every skill in the
        domain that contains it as a substring (the skill itself included).
        Skills that are contained in no other skill are left out.
    """
    in_domain = df.loc[df['Domain'] == domain, 'Skill']
    # The `in` substring test needs strings; a blank Excel cell arrives as NaN
    # (a float) and would otherwise raise a TypeError.
    skill_list = in_domain.dropna().astype(str).tolist()
    pos_dups = {}
    for skill in skill_list:
        pos_dup = [s for s in skill_list if skill in s]
        # Every skill contains itself, so more than one hit is required.
        if len(pos_dup) > 1:
            pos_dups[skill] = '; '.join(pos_dup)
    return pos_dups

In [12]:
# Skills of domain code 'SS' whose name also appears inside other skill names
# of that same domain.
skill_dup_check2(df_rest, 'SS')

{'Competition': 'Competition; Competition Law; Monopolistic Competition; Perfect Competition',
 'Criminal Law': 'Criminal Law; International Criminal Law',
 'Depression': 'Depression; Great Depression',
 'Economics': 'Agricultural Economics; Applied Economics; Behavioural Economics; Business Economics; Comparative Economics; Computational Economics; Development Economics; Ecological Economics; Economics; Education Economics; Energy Economics; Entrepreneurial Economics; Environmental Economics; Evolutionary Economics; Experimental Economics; Feminist Economics; Financial Economics; Green Economics; Health Economics; Heterodox Economics; Institutional Economics; International Economics; Islamic Economics; Knowledge Economics; Labour Economics; Law and Economics; Managerial Economics; Monetary Economics; Natural Resource Economics; Neoclassical Economics; Public Economics; Real Estate Economics; Regional Economics; Service Economics; Socialist Economics; Transport Economics; Transportatio

## Domain

In [13]:
# Build the domain table from the 'D' rows: the row index becomes the Id and
# the name stored in 'Skill' becomes the 'Domain' label.
df_domain = (
    df_main
    .assign(Id=df_main.index, Domain=df_main['Skill'])
    [['Id', 'Domain']]
)
df_domain.head()

Unnamed: 0,Id,Domain
0,0,Applied Science
1,1,Arts and Humanities
2,2,Business
3,3,Computer Science
4,4,Data Science


In [14]:
# Sheet code -> full domain name. The names have to match df_domain['Domain']
# exactly, otherwise the Id lookup below finds nothing for that code.
domains = {
    'B': 'Business',
    'CS': 'Computer Science',
    'DS': 'Data Science',
    'IT': 'Information Technology (IT)',
    'ML': 'Mathematics and Logic',
    'AH': 'Arts and Humanities',
    'AS': 'Applied Science',
    'NS': 'Natural Science',
    'SS': 'Social Science',
}

# Full domain name -> numeric Id taken from df_domain.
domain_to_id_dict = df_domain.set_index('Domain')['Id'].to_dict()

# Series.map is a strict lookup: anything missing from the dict becomes NaN.
# Series.replace would instead leave an unknown code untouched, so it would end
# up as text inside DomainId without any warning.
domain_ids = df_rest['Domain'].map(domains).map(domain_to_id_dict)

# Fail loudly instead of exporting rows that point at no domain.
unmapped = sorted(df_rest.loc[domain_ids.isna(), 'Domain'].astype(str).unique())
if unmapped:
    raise ValueError(
        'No domain Id found for domain code(s): {}'.format(', '.join(unmapped))
    )

# Skill table: the row number is the skill Id, DomainId references df_domain.
df_rel = (
    df_rest
    .assign(Id=df_rest.index, DomainId=domain_ids)
    [['Id', 'Skill', 'DomainId']]
)
df_rel.head()

Unnamed: 0,Id,Skill,DomainId
0,0,Account Receivable,2
1,1,Accounting,2
2,2,Accounting Software,2
3,3,Accounting Standards,2
4,4,Accounts Payable,2


In [15]:
# Numeric summary (count, mean, spread, range and quartiles) of Id and DomainId.
df_rel.describe()

Unnamed: 0,Id,DomainId
count,3149.0,3149.0
mean,1574.0,3.742775
std,909.182325,2.1319
min,0.0,0.0
25%,787.0,2.0
50%,1574.0,4.0
75%,2361.0,5.0
max,3148.0,8.0


In [16]:
# Number of skills attached to each DomainId.
df_rel.groupby('DomainId').count()

Unnamed: 0_level_0,Id,Skill
DomainId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,185,185
1,258,258
2,463,463
3,665,665
4,505,505
5,550,550
6,114,114
7,119,119
8,290,290


## Export

In [17]:
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise the export fails on a fresh checkout.
output_dir = Path('skills_db2')
output_dir.mkdir(parents=True, exist_ok=True)

# index=False: both tables carry their own explicit 'Id' column, so the
# DataFrame index is not written.
df_domain.to_csv(output_dir / 'domain.csv', index=False)
df_rel.to_csv(output_dir / 'skill.csv', index=False)