In [1]:
import pandas as pd
from sqlalchemy import create_engine
from secrets import settings

In [41]:
engine = create_engine(settings['skills_db'])
# JMLR
df = pd.read_sql_query('select title, skills, data_skills from "ContentJMLR"', engine)
df['content'] = 'JMLR'
# Youtube
df_t = pd.read_sql_query('select title, skills, data_skills from "ContentYoutube"', engine)
df_t['content'] = 'Youtube'
df = df.append(df_t)
# Medium
df_t = pd.read_sql_query('select title, skills, data_skills from "ContentMedium"', engine)
df_t['content'] = 'Medium'
df = df.append(df_t)
# KDnuggets
df_t = pd.read_sql_query('select title, skills, data_skills from "ContentKDnuggets"', engine)
df_t['content'] = 'KDnuggets'
df = df.append(df_t)
# FreeCodeCamp
df_t = pd.read_sql_query('select title, skills, data_skills from "ContentFreeCodeCampCourse"', engine)
df_t['content'] = 'FreeCodeCamp'
df = df.append(df_t)
engine.dispose()
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,title,skills,data_skills,content
0,On the Optimality of Kernel-Embedding Based Go...,TestNG; Testing,,JMLR
1,Domain Generalization by Marginal Transfer Lea...,Algorithm; Analysis; Supervised Learning; Trai...,,JMLR
2,Regulating Greed Over Time in Multi-Armed Bandits,Algorithm; Analysis; Exploit; Retail; Sentry; ...,,JMLR
3,An Empirical Study of Bayesian Optimization: A...,Accounting; Algorithm; Bayesian Optimization; ...,,JMLR
4,The Decoupled Extended Kalman Filter for Dynam...,Exploit; Modelling; Uncertainty,,JMLR


In [42]:
df['content'].value_counts()

Medium          11787
Youtube          4862
KDnuggets         831
JMLR              290
FreeCodeCamp       46
Name: content, dtype: int64

In [43]:
df_ds = pd.read_csv('skills/dataskill.csv')
df_ds = df_ds.set_index('Skill')
df_ds.head()

Unnamed: 0_level_0,DataSkill
Skill,Unnamed: 1_level_1
Machine Learning,AI
Amazon Web Service (AWS),Cloud Technologies
Google Cloud Platform (GCP),Cloud Technologies
Microsoft Azure,Cloud Technologies
IBM Cloud,Cloud Technologies


In [44]:
for i, row in df.iterrows():
    data_skills = []
    skills = row['skills']
    if skills is None:
        continue
    for s in skills.split('; '):
        if s in df_ds.index:
            data_skills.append(df_ds.loc[s].item())  
    if row['data_skills'] is None and len(data_skills) < 1:
        continue
    if row['data_skills'] is None:
        df.loc[i, 'all_skills'] = '; '.join(data_skills)
    elif len(data_skills) > 1:
        df.loc[i, 'all_skills'] = row['data_skills'] + '; ' + '; '.join(data_skills)
    else:
        df.loc[i, 'all_skills'] = row['data_skills']

df.head()

Unnamed: 0,title,skills,data_skills,content,all_skills
0,On the Optimality of Kernel-Embedding Based Go...,TestNG; Testing,,JMLR,
1,Domain Generalization by Marginal Transfer Lea...,Algorithm; Analysis; Supervised Learning; Trai...,,JMLR,
2,Regulating Greed Over Time in Multi-Armed Bandits,Algorithm; Analysis; Exploit; Retail; Sentry; ...,,JMLR,
3,An Empirical Study of Bayesian Optimization: A...,Accounting; Algorithm; Bayesian Optimization; ...,,JMLR,
4,The Decoupled Extended Kalman Filter for Dynam...,Exploit; Modelling; Uncertainty,,JMLR,


In [45]:
skill_list = []

for _, row in df.iterrows():
    data_skills = row['all_skills']
    if not isinstance(data_skills, str):
        continue
    data_skills = data_skills.split('; ')
    for s in data_skills:
        skill_list.append({'skill': s, 'content': row['content']})

df_s = pd.DataFrame.from_dict(skill_list)
df_s.head()

Unnamed: 0,skill,content
0,AI,JMLR
1,R programming,JMLR
2,Unsupervised Machine Learning,JMLR
3,Visualizations,JMLR
4,Regressions,JMLR


In [46]:
df_s2 = df_s.value_counts().reset_index()
df_s2 = df_s2.pivot(index='skill', columns='content', values=0)
df_s2 = df_s2.fillna(0)
df_s2['Total'] = df_s2.sum(axis=1)
df_s2.to_csv('DataSkillContentCount.csv')
df_s2

content,FreeCodeCamp,JMLR,KDnuggets,Medium,Youtube,Total
skill,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AI,2.0,33.0,206.0,1739.0,410.0,2390.0
Apache,0.0,0.0,0.0,19.0,39.0,58.0
C++,0.0,3.0,1.0,6.0,142.0,152.0
Cloud Technologies,0.0,0.0,10.0,131.0,64.0,205.0
Computer Vision,0.0,0.0,21.0,67.0,135.0,223.0
Dashboards,0.0,0.0,4.0,49.0,169.0,222.0
Data Cleansing / Preparation,0.0,0.0,28.0,20.0,103.0,151.0
Database Management System (DBMS),0.0,0.0,0.0,2.0,141.0,143.0
Deep Learning,0.0,13.0,80.0,350.0,161.0,604.0
ETL,0.0,0.0,12.0,31.0,136.0,179.0


In [47]:
df.loc[df['skills'].str.contains('cloud', case=False, na=False), 'content'].value_counts()

Youtube      133
Medium        57
KDnuggets      9
Name: content, dtype: int64