In [None]:
import pandas as pd
from sqlalchemy import create_engine
from secrets import secrets

### Domain

In [49]:
# Load the original domain lookup; its key is renamed to `DomainId` so it
# lines up with the `DomainId` column used by the skill table.
domains_raw = pd.read_csv('skills_db2/domain.csv')
df_domains = domains_raw.rename(columns={'Id': 'DomainId'})
df_domains.head()

Unnamed: 0,DomainId,Domain
0,0,Applied Science
1,1,Arts and Humanities
2,2,Business
3,3,Computer Science
4,4,Data Science


In [50]:
# Reduced domain list: position in this list becomes the new domain Id.
domain_names = [
    'Business',
    'Computer Science',
    'Data Science',
    'Information Technology (IT)',
    'Others',
]
df_d2 = pd.DataFrame(list(enumerate(domain_names)), columns=['Id', 'Domain'])
df_d2.head()

Unnamed: 0,Id,Domain
0,0,Business
1,1,Computer Science
2,2,Data Science
3,3,Information Technology (IT)
4,4,Others


### Skill

In [51]:
# Load the raw skill table; later cells join against its `Id`, `Skill`
# and `DomainId` columns.
df_skills = pd.read_csv('skills_db2/skill.csv')
df_skills.head()

Unnamed: 0,Id,Skill,DomainId
0,0,Account Receivable,2
1,1,Accounting,2
2,2,Accounting Software,2
3,3,Accounting Standards,2
4,4,Accounts Payable,2


In [52]:
# Domains that survive as-is; every other domain is folded into 'Others'.
to_keep = ['Business', 'Computer Science', 'Data Science', 'Information Technology (IT)']

# Step 1: resolve each skill's original domain *name* via the old DomainId.
skills_named = df_skills.merge(df_domains, left_on='DomainId', right_on='DomainId', how='left')
skills_named = skills_named.drop(columns=['DomainId'])

# Step 2: relabel any domain outside the retained set (including rows
# without a matching domain) as 'Others'.
outside_kept = ~skills_named['Domain'].isin(to_keep)
skills_named.loc[outside_kept, 'Domain'] = 'Others'

# Step 3: swap the domain name for the Id from the reduced domain table.
# Both frames carry an `Id` column, so the merge suffixes them _x / _y.
df_s2 = (
    skills_named
    .merge(df_d2, on='Domain')
    .drop(columns=['Domain'])
    .rename(columns={'Id_x': 'Id', 'Id_y': 'DomainId'})
)
df_s2.head()

Unnamed: 0,Id,Skill,DomainId
0,0,Account Receivable,0
1,1,Accounting,0
2,2,Accounting Software,0
3,3,Accounting Standards,0
4,4,Accounts Payable,0


In [53]:
# Spot-check the last rows of the remapped skill table.
df_s2.tail()

Unnamed: 0,Id,Skill,DomainId
3237,3237,Voting Systems,4
3238,3238,Wage,4
3239,3239,Welfare Economics,4
3240,3240,Xeert,4
3241,3241,Yassa,4


In [54]:
# Summary statistics: the min/max of DomainId should stay within the Ids
# of the reduced domain table.
df_s2.describe()

Unnamed: 0,Id,DomainId
count,3242.0,3242.0
mean,1620.5,2.255706
std,936.029113,1.453275
min,0.0,0.0
25%,810.25,1.0
50%,1620.5,2.0
75%,2430.75,4.0
max,3241.0,4.0


### Source

In [55]:
# Lookup table of job-posting sources, declared column by column.
df_source = pd.DataFrame({
    'Id': [0, 1, 2],
    'Source': ['Rubiqe', 'MYFutureJobs', 'Indeed'],
})
df_source.head()

Unnamed: 0,Id,Source
0,0,Rubiqe
1,1,MYFutureJobs
2,2,Indeed


### Redundant Skills

In [56]:
# Load the list of skills to ignore. No `sheet_name` is given, so pandas
# reads the workbook's first sheet.
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
df_redskills.head()

Unnamed: 0,Skill
0,Ada
1,Addition
2,Application
3,B
4,BASIC


In [57]:
# Look up the skill Id for every redundant skill name (left join keeps
# names that have no match so they can be inspected).
df_r2 = pd.merge(df_redskills, df_skills, how='left', on='Skill')

# Sanity check: rows displayed here are names missing from the skill table.
r2_unmatched = df_r2['Id'].isna()
df_r2[r2_unmatched]

Unnamed: 0,Skill,Id,DomainId


In [58]:
# Only the skill Id is kept for the ignore list.
df_r2 = df_r2.loc[:, ['Id']]
df_r2.head()

Unnamed: 0,Id
0,484
1,2271
2,522
3,534
4,537


### Alternate Skill Names

In [59]:
# Load the 'Duplicates' sheet of the same workbook.
# NOTE(review): `Skill` appears to be the alternate spelling and `Parent`
# the canonical skill name — confirm against the workbook.
df_alternate = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
df_alternate.head()

Unnamed: 0,Skill,Parent
0,.NET,.NET Framework
1,AB Testing,A/B Testing
2,Accountancy,Accounting
3,Administrative,Administration
4,SiteCatalyst,Adobe SiteCatalyst


In [60]:
# Resolve each alternate name's parent to its skill Id. Both frames have a
# `Skill` column, so the merge suffixes them; the left one (the alternate
# name) is restored to plain `Skill`.
df_a2 = (
    pd.merge(df_alternate, df_skills, how='left', left_on='Parent', right_on='Skill')
    .rename(columns={'Skill_x': 'Skill'})
)

# Sanity check: rows displayed here have a parent missing from the skill table.
a2_unmatched = df_a2['Id'].isna()
df_a2[a2_unmatched]

Unnamed: 0,Skill,Parent,Id,Skill_y,DomainId


In [61]:
# Keep the parent skill Id together with the alternate name.
df_a2 = df_a2.loc[:, ['Id', 'Skill']]
df_a2.head()

Unnamed: 0,Id,Skill
0,475,.NET
1,479,AB Testing
2,1,Accountancy
3,7,Administrative
4,1194,SiteCatalyst


### Skill Count

In [42]:
# Load the skill counts; the following cells rely on the `Skill`, `Count`,
# `JobPostedMonth` and `Source` columns.
df_count = pd.read_csv('skills/ForAcceltic.csv')
df_count.head()

Unnamed: 0,Skill,Count,JobPostedMonth,Source
0,Sales,2580,2021-01-01,MYFutureJobs
1,Selection,190,2021-01-01,MYFutureJobs
2,Adobe Illustrator,63,2021-01-01,MYFutureJobs
3,Adobe Photoshop,94,2021-01-01,MYFutureJobs
4,Advertising,286,2021-01-01,MYFutureJobs


In [43]:
# Name -> id lookups, with the id columns renamed to the names used in the
# final count table.
skill_lookup = df_skills[['Skill', 'Id']].rename(columns={'Id': 'skill_id'})
source_lookup = df_source.rename(columns={'Id': 'source_id'})

# Attach both ids to every count row; left joins keep unmatched rows visible.
df_c2 = (
    df_count
    .merge(skill_lookup, on=['Skill'], how='left')
    .merge(source_lookup, on=['Source'], how='left')
)

# Sanity check: rows displayed here have a skill name missing from the skill table.
c2_unmatched = df_c2['skill_id'].isna()
df_c2[c2_unmatched]

Unnamed: 0,Skill,Count,JobPostedMonth,Source,skill_id,source_id


In [44]:
# Shape the count rows into the final column layout: a real datetime column
# plus a 'YYYY-MM-DD' string copy of it, name columns dropped in favour of
# their ids, and rows ordered by skill.
posted_month = pd.to_datetime(df_c2['JobPostedMonth'])
df_c2 = (
    df_c2
    .assign(
        JobPostedMonth=posted_month,
        job_posted_date=posted_month.dt.strftime('%Y-%m-%d'),
    )
    .drop(columns=['Skill', 'Source'])
    .rename(columns={'Count': 'count', 'JobPostedMonth': 'job_posted_date_iso'})
    .sort_values(by='skill_id')
)
df_c2.head()

Unnamed: 0,count,job_posted_date_iso,skill_id,source_id,job_posted_date
366,122,2021-01-01,0,1,2021-01-01
201,769,2021-01-01,1,1,2021-01-01
436,91,2021-01-01,2,1,2021-01-01
729,51,2021-01-01,3,1,2021-01-01
367,132,2021-01-01,4,1,2021-01-01


In [45]:
# Summary statistics of the numeric columns before loading to the database.
df_c2.describe()

Unnamed: 0,count,skill_id,source_id
count,1035.0,1035.0,1035.0
mean,74.495652,1349.038647,1.0
std,194.229208,988.48536,0.0
min,1.0,0.0,1.0
25%,8.0,426.5,1.0
50%,17.0,1112.0,1.0
75%,57.0,2333.0,1.0
max,2580.0,3236.0,1.0


In [46]:
# Confirm dtypes and non-null counts; a float `skill_id` or `source_id`
# here would indicate unmatched rows from the left joins.
df_c2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1035 entries, 366 to 455
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   count                1035 non-null   int64         
 1   job_posted_date_iso  1035 non-null   datetime64[ns]
 2   skill_id             1035 non-null   int64         
 3   source_id            1035 non-null   int64         
 4   job_posted_date      1035 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 48.5+ KB


## To SQL

### Skills

In [19]:
# Publish the lookup tables to the skills database.
# Each table is dropped and recreated (`if_exists='replace'`), so re-running
# this cell is safe but discards whatever the tables held before.
engine = create_engine(secrets['skills_db'])
try:
    for table_name, frame in [
        ('Domain', df_d2),
        ('Skill', df_s2),
        ('Source', df_source),
        ('IgnoreSkill', df_r2),
        ('AlternateSkill', df_a2),
    ]:
        frame.to_sql(table_name, engine, index=False, if_exists='replace')
finally:
    # Always release pooled connections, even when one of the writes fails;
    # previously an exception skipped the dispose call entirely.
    engine.dispose()

NameError: name 'table' is not defined

In [62]:
# Re-publish only the AlternateSkill table (the cell above also writes it as
# part of the full set). The table is dropped and recreated on each run.
engine = create_engine(secrets['skills_db'])
try:
    df_a2.to_sql('AlternateSkill', engine, index=False, if_exists='replace')
finally:
    # Release pooled connections even if the write raises.
    engine.dispose()

### Skill Count

In [23]:
# Append the skill counts to `skill_trend_details` in the dev database.
# WARNING: `if_exists='append'` inserts the rows again on every execution,
# so re-running this cell duplicates data. Run it once per load.
engine = create_engine(secrets['skillstreet_dev'])
try:
    df_c2.to_sql('skill_trend_details', engine, index=False, if_exists='append')
finally:
    # Release pooled connections even if the write raises.
    engine.dispose()

In [24]:
# Append the skill counts to `skill_trend_details` in the stage database.
# WARNING: `if_exists='append'` inserts the rows again on every execution,
# so re-running this cell duplicates data. Run it once per load.
engine = create_engine(secrets['skillstreet_stage'])
try:
    df_c2.to_sql('skill_trend_details', engine, index=False, if_exists='append')
finally:
    # Release pooled connections even if the write raises.
    engine.dispose()