In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secrets import secrets

### Domain

In [2]:
df_domains = pd.read_csv('skills_db2/domain.csv').rename(columns={'Id': 'DomainId'})
df_domains.head()

Unnamed: 0,DomainId,Domain
0,0,Applied Science
1,1,Arts and Humanities
2,2,Business
3,3,Computer Science
4,4,Data Science


In [3]:
df_d2 = pd.DataFrame([
    [0, 'Business'],
    [1, 'Computer Science'],
    [2, 'Data Science'],
    [3, 'Information Technology (IT)'],
    [4, 'Others']
], columns=['Id', 'Domain'])
df_d2.head()

Unnamed: 0,Id,Domain
0,0,Business
1,1,Computer Science
2,2,Data Science
3,3,Information Technology (IT)
4,4,Others


### Skill

In [4]:
df_skills = pd.read_csv('skills_db2/skill.csv')
df_skills.head()

Unnamed: 0,Id,Skill,DomainId
0,0,Account Receivable,2
1,1,Accountancy,2
2,2,Accounting,2
3,3,Accounting Software,2
4,4,Accounting Standards,2


In [5]:
df_s2 = df_skills.merge(df_domains, left_on='DomainId', right_on='DomainId', how='left').drop(columns=['DomainId'])
df_s2 = df_s2.merge(df_d2, on='Domain').drop(columns=['Domain']).rename(columns={'Id_x': 'Id', 'Id_y': 'DomainId'})
df_s2.head()

Unnamed: 0,Id,Skill,DomainId
0,0,Account Receivable,0
1,1,Accountancy,0
2,2,Accounting,0
3,3,Accounting Software,0
4,4,Accounting Standards,0


In [6]:
df_s2.describe()

Unnamed: 0,Id,DomainId
count,2186.0,2186.0
mean,1092.5,1.517841
std,631.188165,1.081952
min,0.0,0.0
25%,546.25,1.0
50%,1092.5,1.0
75%,1638.75,3.0
max,2185.0,3.0


### Source

In [7]:
df_source = pd.DataFrame([
    [0, 'MYFutureJobs'],
    [1, 'Indeed']
], columns=['Id', 'Source'])
df_source.head()

Unnamed: 0,Id,Source
0,0,MYFutureJobs
1,1,Indeed


### Skill Count

In [8]:
df_count = pd.read_csv('myfuturejobs-insights/ForAcceltic.csv')
df_count.head()

Unnamed: 0,Skill,Domain,Count,JobPostedMonth,Source
0,Sales,Business,2580.0,2021-01-01,MYFutureJobs
1,Communication,Business,2151.0,2021-01-01,MYFutureJobs
2,English,Others,1932.0,2021-01-01,MYFutureJobs
3,Management,Business,1893.0,2021-01-01,MYFutureJobs
4,Training,Business,1398.0,2021-01-01,MYFutureJobs


In [9]:
df_c2 = df_count.merge(df_skills[['Skill', 'Id']].rename(columns={'Id': 'skill_id'}), on=['Skill'], how='left')
df_c2 = df_c2.merge(df_source.rename(columns={'Id': 'source_id'}), on=['Source'], how='left')
# df_c2['job_posted_date'] = pd.DatetimeIndex(df_c2['JobPostedMonth']).month
df_c2['JobPostedMonth'] = pd.to_datetime(df_c2['JobPostedMonth'])
df_c2['job_posted_date'] = df_c2['JobPostedMonth'].dt.strftime('%Y-%m-%d')
df_c2 = df_c2.drop(columns=['Skill', 'Domain', 'Source'])
df_c2 = df_c2.rename(columns={'Count': 'count', 'JobPostedMonth': 'job_posted_date_iso'})
df_c2 = df_c2.sort_values(by='skill_id')
df_c2.head()

Unnamed: 0,count,job_posted_date_iso,skill_id,source_id,job_posted_date
131,122.0,2021-01-01,0,0,2021-01-01
60,234.0,2021-01-01,1,0,2021-01-01
18,621.0,2021-01-01,2,0,2021-01-01
162,91.0,2021-01-01,3,0,2021-01-01
244,51.0,2021-01-01,4,0,2021-01-01


In [10]:
df_c2.describe()

Unnamed: 0,count,skill_id,source_id
count,934.0,934.0,934.0
mean,74.14561,1291.925054,0.0
std,196.234221,968.849552,0.0
min,1.0,0.0,0.0
25%,7.0,385.5,0.0
50%,17.0,1021.5,0.0
75%,53.75,2301.75,0.0
max,2580.0,3126.0,0.0


In [11]:
df_c2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 934 entries, 131 to 476
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   count                934 non-null    float64       
 1   job_posted_date_iso  934 non-null    datetime64[ns]
 2   skill_id             934 non-null    int64         
 3   source_id            934 non-null    int64         
 4   job_posted_date      934 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 43.8+ KB


## To SQL

### Skills

In [None]:
engine = create_engine(secrets['skills_db'])
df_d2.to_sql('Domain', engine, index=False, if_exists='replace')
df_s2.to_sql('Skill', engine, index=False, if_exists='replace')
df_source.to_sql('Source', engine, index=False, if_exists='replace')
engine.dispose()

### Skill Count

In [None]:
engine = create_engine(secrets['skillstreet_dev'])
df_c2.to_sql('skill_trend_details', engine, index=False, if_exists='append')
engine.dispose()

In [12]:
engine = create_engine(secrets['skillstreet_stage'])
df_c2.to_sql('skill_trend_details', engine, index=False, if_exists='append')
engine.dispose()