In [1]:
import pandas as pd
from difflib import get_close_matches as gcm

skills = 'skills/Skills v3.xlsx'

In [2]:
excel_file = pd.ExcelFile(skills)
dfs = {sheet_name: excel_file.parse(sheet_name) for sheet_name in excel_file.sheet_names}
df_all = pd.DataFrame(columns=['Skill', 'Domain'])

for key, df in dfs.items():
    df['Domain'] = key
    df_all = df_all.append(df.drop_duplicates())

df_all.head()

Unnamed: 0,Skill,Domain
0,Applied Science,D
1,Arts and Humanities,D
2,Business,D
3,Computer Science,D
4,Data Science,D


In [3]:
df_all.describe()

Unnamed: 0,Skill,Domain
count,3145,3145
unique,3139,10
top,Cultural Psychology,CS
freq,2,686


## Check duplicates

In [4]:
v = df_all['Skill'].value_counts()
df_dup = df_all[df_all['Skill'].isin(v.index[v.gt(1)])].sort_values(by='Skill')
df_dup.head()

Unnamed: 0,Skill,Domain
90,Configuration Management,B
135,Configuration Management,CS
49,Cultural Psychology,AH
51,Cultural Psychology,SS
193,Google Analytics,B


In [5]:
df_dup

Unnamed: 0,Skill,Domain
90,Configuration Management,B
135,Configuration Management,CS
49,Cultural Psychology,AH
51,Cultural Psychology,SS
193,Google Analytics,B
181,Google Analytics,DS
103,Health Psychology,AH
122,Health Psychology,SS
153,Media Psychology,AH
176,Media Psychology,SS


## All Skills

In [6]:
def extract_skills(file_name):
    xl_file = pd.ExcelFile(file_name)
    dfs = {sheet_name: xl_file.parse(sheet_name) for sheet_name in xl_file.sheet_names}
    all_skills = []
    for key, df in dfs.items():
        all_skills.extend(df['Skill'].unique().tolist())
    df_skills = pd.DataFrame({'skill': all_skills})
    return df_skills.drop_duplicates()


In [7]:
df_skills = df_all[['Skill']].rename(columns={'Skill': 'skill'})
df_skills = df_skills.reset_index(drop=True)
df_skills['skill_id'] = df_skills.index + 1
df_skills = df_skills[['skill_id', 'skill']]
df_skills.head()

Unnamed: 0,skill_id,skill
0,1,Applied Science
1,2,Arts and Humanities
2,3,Business
3,4,Computer Science
4,5,Data Science


In [8]:
df_skills.describe()

Unnamed: 0,skill_id
count,3145.0
mean,1573.0
std,908.027624
min,1.0
25%,787.0
50%,1573.0
75%,2359.0
max,3145.0


## Duplicate Data Check

In [9]:
def skill_dup_check(df, reverse=False):
    skill_list = df['skill'].tolist()
    pos_dups = {}
    for i in range(len(skill_list)):
        if reverse:
            skill = skill_list[-(i+1)]
            remaining = skill_list[:-(i+1)]
        else:
            skill = skill_list[i]
            remaining = skill_list[i+1:]
        pos_dup = gcm(skill, remaining, cutoff=0.9)
        if len(pos_dup) > 0:
            pos_dups[skill] = pos_dup
    return pos_dups

In [10]:
"""
pos_dups = skill_dup_check(df_skills)

for s in pos_dups:
    print('{}: {}'.format(s, pos_dups[s]))
"""

"\npos_dups = skill_dup_check(df_skills)\n\nfor s in pos_dups:\n    print('{}: {}'.format(s, pos_dups[s]))\n"

## Domain

In [11]:
domains = {'B': 'Business', 'CS': 'Computer Science', 'DS': 'Data Science', 'IT': 'Information Technology (IT)', \
           'ML': 'Mathematics and Logic', 'AH': 'Arts and Humanities', 'AS': 'Applied Science', \
           'NS': 'Natural Science', 'SS': 'Social Science'}

df_rel = df_all.loc[df_all['Domain'] != 'D']
df_rel.loc[:, 'Domain'] = df_rel['Domain'].replace(domains)
skill_to_id_dict = df_skills.set_index('skill').to_dict()['skill_id']
df_rel = df_rel.replace(skill_to_id_dict)
df_rel = df_rel.rename({'Skill': 'skill_id', 'Domain': 'domain_id'})
df_rel.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value


Unnamed: 0,Skill,Domain
0,10,3
1,11,3
2,12,3
3,13,3
4,14,3


In [12]:
df_rel.describe()

Unnamed: 0,Skill,Domain
count,3136.0,3136.0
mean,1579.433036,4.742347
std,905.730027,2.118937
min,10.0,1.0
25%,795.75,3.0
50%,1578.5,4.0
75%,2363.25,6.0
max,3145.0,9.0


## Export

In [13]:
df_skills.to_csv('skills_db2/skill.csv', index=False)
df_rel.to_csv('skills_db2/domain.csv', index=False)