In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime as dt
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
# --- Skill vocabularies ---------------------------------------------------
# Master vocabulary: distinct values of the 'Skill' column, in first-seen order.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = pd.unique(df_skills['Skill']).tolist()

# Redundant skills, read from the default (first) sheet of the workbook.
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
RED_SKILLS = pd.unique(df_redskills['Skill']).tolist()

# Duplicate skills: mapping of 'Skill' -> 'Parent' from the 'Duplicates' sheet.
df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill')['Parent'].to_dict()

# The duplicate names themselves are appended to the master vocabulary.
SKILLS += list(DUP_SKILLS)

In [4]:
# SQL template used by extract_df: the `{}` placeholder is filled with a table
# name through str.format(). Only rows posted after 2020-04-01 whose
# Description is not the literal 'No Description' are selected.
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-04-01'
and Description != 'No Description'
"""

def extract_df(job, max_skills=3):
    """Load postings from one job table and keep those with few tagged keywords.

    Args:
        job: Table name substituted into the module-level ``query`` template.
            It is interpolated into the SQL text as-is, so pass only trusted,
            hard-coded table names.
        max_skills: Keep rows whose ``Keywords_Present`` contains at most this
            many comma-separated entries. Defaults to 3, the original cut-off.

    Returns:
        A DataFrame of de-duplicated rows with an added integer ``Num_Skill``
        column (0 where ``Keywords_Present`` is null). The row count is also
        printed.
    """
    df = pd.read_sql(query.format(job), engine).drop_duplicates()
    # Number of comma-separated keywords per row; null keyword cells count as 0.
    df['Num_Skill'] = (
        df['Keywords_Present']
        .map(lambda kw: len(kw.split(',')) if pd.notna(kw) else 0)
        .astype('int64')
    )
    # Explicit copy so callers can add columns without SettingWithCopyWarning.
    df = df.loc[df['Num_Skill'] <= max_skills].copy()
    print('Number of rows', len(df))
    return df

def extract_skills_all(df):
    """Run skill extraction over every posting and collect the results.

    For each row, the job title and description are concatenated and passed to
    ``extract_skills`` with the ``SKILLS`` vocabulary; the result is filtered
    through ``extract_ignore`` (with ``RED_SKILLS`` and ``DUP_SKILLS``), of
    which only the first returned element is kept and sorted.

    Args:
        df: DataFrame with the columns ``Job_Title``, ``Company``, ``Country``,
            ``Date_Posted``, ``Description`` and ``Keywords_Present``.

    Returns:
        A DataFrame with one row per input row and the columns ``title``,
        ``company``, ``country``, ``date_posted``, ``description``,
        ``indeed_skills`` and ``skills`` (the sorted list of kept skills).
        Progress and total elapsed time are printed along the way.
    """
    started = dt.now()
    interval = started
    # Report roughly 20 times per run. Clamp to 1: the previous
    # int(len(df) / 20) was 0 for fewer than 20 rows, which made the modulo
    # below raise ZeroDivisionError.
    print_every = max(1, len(df) // 20)
    job_info_list = []
    for i, (_, job) in enumerate(df.iterrows(), start=1):
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
            interval = dt.now()
        all_info = job['Job_Title'] + ' ' + job['Description']
        all_skills = extract_skills(all_info, SKILLS)
        # The second return value of extract_ignore is not used here.
        keep_skills, _ = extract_ignore(all_skills, RED_SKILLS, DUP_SKILLS)
        keep_skills.sort()
        job_info_list.append({
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'indeed_skills': job['Keywords_Present'],
            'skills': keep_skills,
        })
    print("Total time taken: {}".format(dt.now() - started))
    return pd.DataFrame(job_info_list)

In [5]:
# Create the SQLAlchemy engine from the value stored under secrets['indeed_db']
# (presumably a database URL), then load the 'Data_Scientist' table through
# extract_df and preview the first rows.
engine = create_engine(secrets['indeed_db'])
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 2896


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,4b5160535b55a659,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C','Junior',1
3,d5326d3f749525a9,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'",,3
7,f2a0194148860a86,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'",,3
10,fcabe7775b900c1c,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'",'Senior',3
16,5e84b2024b6813c2,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'",,3


In [6]:
# The engine is released first; extract_skills_all only reads the in-memory frame.
# NOTE: this cell is slow — the recorded run below took about 2h13m for 2896 rows.
engine.dispose()
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

144 jobs processed. Time taken: 0:06:21.666809
288 jobs processed. Time taken: 0:06:03.697011
432 jobs processed. Time taken: 0:06:08.532298
576 jobs processed. Time taken: 0:05:37.191274
720 jobs processed. Time taken: 0:06:11.201109
864 jobs processed. Time taken: 0:06:10.987541
1008 jobs processed. Time taken: 0:06:39.284815
1152 jobs processed. Time taken: 0:07:07.009025
1296 jobs processed. Time taken: 0:06:38.922956
1440 jobs processed. Time taken: 0:07:19.292091
1584 jobs processed. Time taken: 0:08:23.135630
1728 jobs processed. Time taken: 0:06:35.178270
1872 jobs processed. Time taken: 0:09:03.286620
2016 jobs processed. Time taken: 0:06:46.331302
2160 jobs processed. Time taken: 0:06:09.596201
2304 jobs processed. Time taken: 0:05:39.781271
2448 jobs processed. Time taken: 0:06:32.337418
2592 jobs processed. Time taken: 0:05:15.511003
2736 jobs processed. Time taken: 0:06:29.782548
2880 jobs processed. Time taken: 0:07:09.106723
Total time taken: 2:13:09.571922


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware..."
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset..."
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization..."
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar..."
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica..."


In [7]:
# Persist the extraction result without the index column.
# NOTE(review): 'skills' holds Python lists, which CSV will store in string
# form — reloading would need to parse them back; confirm downstream readers.
df_dsj.to_csv('indeed-insights/data_scientist_3.csv', index=False)


## K-Means Clustering

In [10]:
# Distinct skills across all jobs, in order of first appearance.
# dict.fromkeys de-duplicates while preserving insertion order and avoids the
# linear `in list` scan that made the original loop quadratic.
skills = list(dict.fromkeys(
    skill
    for row_skills in df_dsj['skills']
    for skill in row_skills
))

len(skills)

1516

In [52]:
# Binary job-by-skill indicator matrix: X[r, c] == 1 when job r lists skills[c].
# (numpy is already imported as np in the notebook's first cell.)
job_skills = df_dsj['skills'].tolist()

# Precompute skill -> column once instead of calling skills.index() for every
# entry; setdefault keeps the first position, matching list.index semantics.
skill_to_col = {}
for col, skill in enumerate(skills):
    skill_to_col.setdefault(skill, col)

# Preallocate so the result is 2-D even when there are no jobs.
X = np.zeros((len(job_skills), len(skills)))
for row_idx, job in enumerate(job_skills):
    for s in job:
        X[row_idx, skill_to_col[s]] = 1

X.shape

(2896, 1516)

In [58]:
from sklearn.cluster import DBSCAN  # kept: may be used by cells not shown here
from sklearn.cluster import KMeans

# Partition the jobs into 5 clusters on the binary skill matrix X.
# n_init is pinned to 10 (the historical default) because newer scikit-learn
# releases changed the default, which would alter the resulting clusters.
clustering = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X)
clustering

KMeans(n_clusters=5, random_state=42)

In [59]:
# Cluster index assigned by the fitted model to each row of X.
clustering.labels_

array([4, 1, 4, ..., 0, 4, 2], dtype=int32)

In [111]:
# New frame (df_dsj itself is left untouched) with each job's cluster label
# appended as the last column.
df2 = df_dsj.assign(cluster=clustering.labels_)
df2.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,cluster
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware...",4
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset...",1
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization...",4
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar...",2
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica...",2


In [112]:
# Sanity check: the distinct cluster labels present in df2.
df2['cluster'].unique()

array([4, 1, 2, 3, 0], dtype=int32)

In [116]:
def get_skill_count(df, cluster, column='cluster'):
    """Count how often each skill occurs among the rows with a given label.

    Note: a later cell in this notebook redefines ``get_skill_count`` for
    topics; whichever definition was executed last is the active one.

    Args:
        df: DataFrame with a ``skills`` column holding an iterable of skill
            names per row, plus the label column named by ``column``.
        cluster: Label value to select (rows where ``df[column] == cluster``).
        column: Name of the label column. Defaults to ``'cluster'``; pass e.g.
            ``'topic'`` to count by another grouping with the same function.

    Returns:
        dict mapping skill -> occurrence count, ordered by count descending.
        Skills with equal counts keep their first-seen order. Empty dict when
        no row matches.
    """
    from collections import Counter

    selected = df.loc[df[column] == cluster, 'skills']
    counts = Counter(skill for row_skills in selected for skill in row_skills)
    # most_common() is a stable sort by count, highest first.
    return dict(counts.most_common())

In [117]:
# Skill frequencies for label 0, most frequent first (cluster 0, assuming the
# cluster-based get_skill_count is the active definition).
get_skill_count(df2, 0)

{'TestNG': 379,
 'Testing': 361,
 'Analytical': 139,
 'Training': 120,
 'Communication': 117,
 'Design': 116,
 'Computer Science': 98,
 'Problem Solving': 92,
 'Research': 88,
 'Documentation': 84,
 'Market': 76,
 'Collaboration': 75,
 'Platform': 73,
 'Electronics': 67,
 'Analysis': 66,
 'Innovation': 63,
 'Law': 62,
 'Data Analysis': 59,
 'Research and Development (R&D)': 58,
 'Artificial Intelligence (AI)': 58,
 'Data Science': 56,
 'Engineering': 55,
 'Reporting': 54,
 'Machine Learning': 49,
 'Leadership': 48,
 'Database': 48,
 'Sales': 48,
 'Security': 47,
 'Management': 46,
 'Protocol': 45,
 'Information Technology (IT)': 44,
 'Microsoft Access': 44,
 'Algorithm': 43,
 'Insurance': 43,
 'Project Management': 42,
 'Modelling': 42,
 'Network': 41,
 'Automation': 41,
 'Statistics': 40,
 'Sentry': 40,
 'Selection': 39,
 'Presentation': 38,
 'Visualization': 36,
 'Agile': 36,
 'Programming': 36,
 'Manufacturing': 35,
 'Writing': 35,
 'Mathematics': 34,
 'Software Engineering': 34,
 '

In [77]:
# Skill frequencies for label 1, most frequent first (cluster 1, assuming the
# cluster-based get_skill_count is the active definition).
get_skill_count(df2, 1)

{'Machine Learning': 450,
 'Computer Science': 323,
 'Data Science': 295,
 'Artificial Intelligence (AI)': 251,
 'Analytical': 216,
 'Research': 213,
 'Engineering': 199,
 'Algorithm': 194,
 'Platform': 171,
 'Python': 169,
 'Design': 163,
 'Statistics': 159,
 'Leadership': 149,
 'Mathematics': 129,
 'Communication': 129,
 'Innovation': 128,
 'Collaboration': 127,
 'Deep Learning': 123,
 'R': 101,
 'Security': 100,
 'Analysis': 100,
 'Modelling': 89,
 'Programming': 88,
 'Software Engineering': 88,
 'Training': 84,
 'Data Analysis': 82,
 'Law': 78,
 'Problem Solving': 74,
 'Big Data': 72,
 'Data Analytics': 72,
 'Insurance': 68,
 'Presentation': 67,
 'Microsoft Access': 66,
 'Dataset': 64,
 'Play': 63,
 'Decision Making': 62,
 'Market': 62,
 'Predictive Model': 61,
 'Natural Language Processing (NLP)': 61,
 'Email': 60,
 'Information Technology (IT)': 59,
 'Healthcare': 58,
 'Computer Vision': 57,
 'Visualization': 57,
 'Data Mining': 54,
 'Programming Language': 53,
 'Database': 53,
 

In [78]:
# Skill frequencies for label 2, most frequent first (cluster 2, assuming the
# cluster-based get_skill_count is the active definition).
get_skill_count(df2, 2)

{'Analytical': 282,
 'Statistics': 262,
 'Computer Science': 178,
 'Mathematics': 174,
 'Data Science': 152,
 'Reporting': 149,
 'Statistical Analysis': 144,
 'Training': 143,
 'Data Analysis': 137,
 'Research': 132,
 'Design': 130,
 'Leadership': 126,
 'Presentation': 122,
 'Visualization': 120,
 'Communication': 119,
 'Problem Solving': 118,
 'Innovation': 111,
 'Collaboration': 97,
 'Data Management': 95,
 'Engineering': 89,
 'Dataset': 87,
 'Processing': 86,
 'Documentation': 86,
 'R': 83,
 'Microsoft Access': 82,
 'Programming': 80,
 'Biostatistics': 80,
 'Management': 78,
 'Data Visualization': 77,
 'Law': 77,
 'Database': 75,
 'Data Mining': 75,
 'Table': 75,
 'Healthcare': 70,
 'Accuracy': 69,
 'Methodology': 69,
 'Machine Learning': 68,
 'Medicine': 67,
 'Security': 64,
 'Economics': 64,
 'Electronics': 64,
 'SAS': 64,
 'Predictive Model': 59,
 'Decision Making': 59,
 'Information Technology (IT)': 58,
 'Network': 56,
 'Interpreter': 52,
 'Writing': 52,
 'Market': 51,
 'Subjec

In [79]:
# Skill frequencies for label 3, most frequent first (cluster 3, assuming the
# cluster-based get_skill_count is the active definition).
get_skill_count(df2, 3)

{'Analytical': 324,
 'Communication': 305,
 'Market': 234,
 'Leadership': 213,
 'Training': 192,
 'Analysis': 191,
 'Design': 191,
 'Presentation': 185,
 'Innovation': 170,
 'Sales': 164,
 'Collaboration': 160,
 'Research': 153,
 'Platform': 149,
 'Reporting': 133,
 'Problem Solving': 122,
 'Project Management': 115,
 'Law': 109,
 'Marketing': 103,
 'Network': 92,
 'Data Science': 92,
 'Management': 88,
 'Microsoft Excel': 80,
 'English': 77,
 'Life Science': 76,
 'Writing': 73,
 'Manufacturing': 73,
 'Research and Development (R&D)': 72,
 'Engineering': 72,
 'Insurance': 71,
 'Artificial Intelligence (AI)': 70,
 'Information Technology (IT)': 69,
 'Finance': 68,
 'Decision Making': 68,
 'Documentation': 68,
 'Budget': 63,
 'Written Communication': 61,
 'Translation': 59,
 'Consulting': 58,
 'Investment': 58,
 'Data Analytics': 55,
 'Data Analysis': 54,
 'Administration': 54,
 'Organizational Skill': 54,
 'Email': 53,
 'Agile': 53,
 'Accuracy': 50,
 'Microsoft Access': 50,
 'Processing

In [80]:
# Skill frequencies for label 4, most frequent first (cluster 4, assuming the
# cluster-based get_skill_count is the active definition).
get_skill_count(df2, 4)

{'Research': 261,
 'Design': 202,
 'Analytical': 181,
 'Training': 170,
 'Communication': 139,
 'Analysis': 134,
 'Data Analysis': 122,
 'Data Science': 114,
 'Platform': 114,
 'Management': 114,
 'Computer Science': 113,
 'Artificial Intelligence (AI)': 110,
 'Engineering': 105,
 'Database': 100,
 'Leadership': 99,
 'Written Communication': 96,
 'Information Technology (IT)': 93,
 'Security': 92,
 'Innovation': 91,
 'Statistics': 88,
 'Reporting': 82,
 'Documentation': 81,
 'Email': 79,
 'Microsoft Access': 78,
 'Research and Development (R&D)': 75,
 'Python': 75,
 'Marketing': 74,
 'Writing': 74,
 'R': 73,
 'Market': 73,
 'Protocol': 72,
 'Presentation': 71,
 'English': 70,
 'Interpreter': 68,
 'Data Analytics': 67,
 'Administration': 66,
 'Collaboration': 66,
 'Machine Learning': 65,
 'Problem Solving': 65,
 'Product Development': 60,
 'Microsoft Excel': 59,
 'Structured Query Language (SQL)': 58,
 'Visualization': 57,
 'Law': 57,
 'Sales': 56,
 'Project Management': 52,
 'Molecular

## Topic Modeling (Job Descriptions)

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over the job descriptions. English stop words are
# removed; max_df=0.8 drops terms found in more than 80% of documents and
# min_df=5 drops terms found in fewer than 5 documents.
count_vect = CountVectorizer(max_df=0.8, min_df=5, stop_words='english')

# .values.astype('U') hands the vectorizer a unicode string array.
doc_term_matrix = count_vect.fit_transform(df_dsj['description'].values.astype('U'))

In [148]:
# Display the sparse document-term matrix summary (documents x vocabulary terms).
doc_term_matrix

<2896x8872 sparse matrix of type '<class 'numpy.int64'>'
	with 531512 stored elements in Compressed Sparse Row format>

In [149]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the document-term counts; random_state fixes the
# seed so the fit is repeatable.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [150]:
# Resolve the vocabulary once. The original rebuilt it for every printed word,
# and get_feature_names() no longer exists in recent scikit-learn releases, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# For each topic, print the 30 highest-weighted terms (ascending weight order).
for i, topic in enumerate(LDA.components_):
    print(f'Top 30 words for topic #{i+1}:')
    print([feature_names[j] for j in topic.argsort()[-30:]])
    print('\n')

Top 30 words for topic #1:
['level', 'required', 'food', 'support', 'knowledge', 'sciences', 'working', 'life', 'including', 'role', 'years', 'related', 'company', 'technical', 'science', 'scientist', 'lab', 'skills', 'quality', 'process', 'molecular', 'cell', 'new', 'development', 'biology', 'team', 'laboratory', 'scientific', 'work', 'research']


Top 30 words for topic #2:
['following', 'related', 'qualifications', 'university', 'include', 'support', 'level', 'service', 'including', 'national', 'status', 'requirements', 'degree', 'employees', 'science', 'federal', 'job', 'management', 'time', 'applicants', 'provide', 'application', 'program', 'employment', 'education', 'required', 'research', 'work', 'position', 'information']


Top 30 words for topic #3:
['analytical', 'requirements', 'solutions', 'provide', 'complex', 'including', 'technical', 'develop', 'ability', 'degree', 'knowledge', 'information', 'project', 'programming', 'management', 'skills', 'research', 'health', 'analyt

In [151]:
# Per-document topic weights from the fitted LDA (one column per component).
# Print the overall shape, then show the weights of the document at position 1.
topic_values = LDA.transform(doc_term_matrix)
print(topic_values.shape)
topic_values[1]

(2896, 5)


array([0.00106561, 0.04520808, 0.13743424, 0.81522425, 0.00106781])

In [152]:
# Label each job with the 0-based index of its highest-weighted topic.
# NOTE: this adds a column to df2 in place.
df2['topic'] = topic_values.argmax(axis=1)
df2.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,cluster,topic
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware...",4,2
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset...",1,3
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization...",4,1
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar...",2,3
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica...",2,2


In [153]:
def get_skill_count(df, topic, column='topic'):
    """Count how often each skill occurs among the rows with a given topic.

    Note: this notebook defines a cluster-based function with the same name in
    an earlier cell; running this cell replaces that definition. Passing
    ``column='cluster'`` reproduces the cluster-based counts.

    Args:
        df: DataFrame with a ``skills`` column holding an iterable of skill
            names per row, plus the label column named by ``column``.
        topic: Label value to select (rows where ``df[column] == topic``).
        column: Name of the label column. Defaults to ``'topic'``.

    Returns:
        dict mapping skill -> occurrence count, ordered by count descending.
        Skills with equal counts keep their first-seen order. Empty dict when
        no row matches.
    """
    from collections import Counter

    matching = df.loc[df[column] == topic, 'skills']
    tally = Counter()
    for row_skills in matching:
        tally.update(row_skills)
    # most_common() is a stable sort by count, highest first.
    return dict(tally.most_common())

In [154]:
# Skill frequencies for jobs whose dominant topic index is 0
# (printed as "topic #1" in the top-words listing, which is 1-based).
get_skill_count(df2, 0)

{'Research': 119,
 'Design': 105,
 'Training': 100,
 'Communication': 87,
 'Molecular Biology': 87,
 'Analytical': 86,
 'Research and Development (R&D)': 82,
 'Life Science': 77,
 'Protocol': 77,
 'TestNG': 71,
 'Biochemistry': 66,
 'Innovation': 63,
 'Testing': 62,
 'Documentation': 62,
 'Collaboration': 59,
 'Chemistry': 58,
 'Problem Solving': 56,
 'Data Analysis': 55,
 'Analysis': 50,
 'Presentation': 49,
 'Biotechnology': 46,
 'Engineering': 46,
 'Interpreter': 43,
 'Biology': 42,
 'Writing': 40,
 'Leadership': 40,
 'Genomics': 40,
 'Database': 38,
 'Management': 38,
 'Platform': 37,
 'Polymerase chain reaction (PCR)': 36,
 'Bioinformatics': 34,
 'Manufacturing': 34,
 'Product Development': 33,
 'Medicine': 33,
 'Data Science': 32,
 'Quantitative Polymerase Chain Reaction (qPCR)': 32,
 'Written Communication': 31,
 'Processing': 30,
 'Immunology': 30,
 'Chemical': 30,
 'Reporting': 29,
 'Literature': 29,
 'Organizational Skill': 28,
 'Statistics': 27,
 'Administration': 26,
 'Chem

In [155]:
# Skill frequencies for jobs whose dominant topic index is 1
# (printed as "topic #2" in the top-words listing, which is 1-based).
get_skill_count(df2, 1)

{'Research': 184,
 'Analytical': 147,
 'Training': 133,
 'Computer Science': 120,
 'Communication': 118,
 'Analysis': 112,
 'Security': 112,
 'Data Science': 105,
 'Microsoft Access': 99,
 'Statistics': 99,
 'Law': 97,
 'Email': 96,
 'Information Technology (IT)': 84,
 'Mathematics': 82,
 'Leadership': 77,
 'Machine Learning': 77,
 'Presentation': 73,
 'Artificial Intelligence (AI)': 71,
 'Collaboration': 70,
 'Administration': 68,
 'Engineering': 67,
 'Design': 64,
 'Database': 63,
 'Documentation': 62,
 'Data Analysis': 60,
 'Management': 60,
 'Market': 59,
 'Writing': 58,
 'TestNG': 57,
 'Written Communication': 57,
 'Reporting': 56,
 'Processing': 54,
 'Microsoft Excel': 53,
 'Data Analytics': 53,
 'History': 52,
 'Computing': 51,
 'Division': 50,
 'Economics': 49,
 'Human Resource (HR)': 49,
 'Selection': 48,
 'English': 47,
 'Problem Solving': 46,
 'Testing': 46,
 'R': 45,
 'Electronics': 45,
 'Innovation': 45,
 'Insurance': 44,
 'Network': 43,
 'Modelling': 43,
 'Brand Managemen

In [156]:
# Skill frequencies for jobs whose dominant topic index is 2
# (printed as "topic #3" in the top-words listing, which is 1-based).
get_skill_count(df2, 2)

{'Analytical': 361,
 'Statistics': 226,
 'Data Science': 207,
 'Design': 191,
 'Computer Science': 190,
 'Reporting': 185,
 'Leadership': 184,
 'Communication': 178,
 'Research': 168,
 'Training': 156,
 'Mathematics': 153,
 'Data Analysis': 151,
 'Statistical Analysis': 147,
 'Presentation': 146,
 'Problem Solving': 145,
 'Visualization': 143,
 'Machine Learning': 140,
 'Collaboration': 140,
 'Analysis': 133,
 'Innovation': 127,
 'R': 116,
 'Healthcare': 112,
 'Documentation': 111,
 'Dataset': 111,
 'Database': 105,
 'Accuracy': 104,
 'Data Management': 103,
 'Engineering': 103,
 'TestNG': 103,
 'Law': 102,
 'Information Technology (IT)': 96,
 'Management': 96,
 'Algorithm': 95,
 'Data Mining': 94,
 'Programming': 92,
 'Interpreter': 90,
 'Security': 88,
 'Biostatistics': 86,
 'Testing': 85,
 'Subject Matter Expert (SME)': 83,
 'Predictive Model': 83,
 'Methodology': 82,
 'SAS': 82,
 'Project Management': 80,
 'Decision Making': 80,
 'Medicine': 80,
 'Microsoft Access': 79,
 'Table': 7

In [157]:
# Skill frequencies for jobs whose dominant topic index is 3
# (printed as "topic #4" in the top-words listing, which is 1-based).
get_skill_count(df2, 3)

{'Analytical': 422,
 'Machine Learning': 401,
 'Computer Science': 383,
 'Data Science': 340,
 'Platform': 330,
 'Artificial Intelligence (AI)': 322,
 'Design': 298,
 'Leadership': 266,
 'Communication': 266,
 'Research': 249,
 'Engineering': 248,
 'Innovation': 242,
 'Python': 220,
 'Market': 212,
 'Training': 206,
 'Statistics': 192,
 'Collaboration': 188,
 'Algorithm': 184,
 'TestNG': 160,
 'Problem Solving': 157,
 'Analysis': 140,
 'Mathematics': 135,
 'Law': 131,
 'Data Analysis': 128,
 'Testing': 127,
 'Software Engineering': 126,
 'Insurance': 126,
 'Sales': 125,
 'Decision Making': 124,
 'Agile': 123,
 'Deep Learning': 122,
 'R': 121,
 'Presentation': 119,
 'Marketing': 118,
 'Management': 112,
 'Reporting': 112,
 'Visualization': 108,
 'Big Data': 107,
 'Information Technology (IT)': 105,
 'Play': 101,
 'Finance': 100,
 'Data Analytics': 100,
 'Microsoft Access': 97,
 'Programming': 97,
 'Security': 97,
 'Architecture': 97,
 'Modelling': 97,
 'Network': 95,
 'Automation': 94,


In [158]:
# Skill frequencies for jobs whose dominant topic index is 4
# (printed as "topic #5" in the top-words listing, which is 1-based).
get_skill_count(df2, 4)

{'Communication': 160,
 'Design': 144,
 'Sales': 130,
 'Market': 128,
 'Research': 127,
 'Analytical': 126,
 'Training': 114,
 'TestNG': 102,
 'Presentation': 96,
 'Testing': 96,
 'Analysis': 91,
 'Electronics': 88,
 'Innovation': 86,
 'Research and Development (R&D)': 84,
 'Marketing': 80,
 'Manufacturing': 79,
 'Platform': 71,
 'Reporting': 70,
 'Collaboration': 68,
 'Leadership': 68,
 'Problem Solving': 67,
 'Management': 64,
 'Network': 62,
 'Data Analysis': 60,
 'Semiconductors': 59,
 'Documentation': 58,
 'Project Management': 57,
 'English': 56,
 'Engineering': 56,
 'Artificial Intelligence (AI)': 53,
 'Insurance': 51,
 'Construction': 48,
 'Selection': 47,
 'Computer Science': 46,
 'Hardware': 42,
 'Writing': 40,
 'Product Development': 40,
 'Written Communication': 39,
 'Processing': 38,
 'Protocol': 38,
 'Ad Design': 38,
 'Chemistry': 38,
 'Database': 37,
 'Division': 36,
 'Optics': 36,
 'Law': 36,
 'Budget': 35,
 'Email': 34,
 'Information Technology (IT)': 33,
 'Technical S

## Topic Modeling (Extracted Skills)

In [135]:
# Rebuild the count matrix from the extracted skills rather than the raw
# descriptions: each job's skill list is joined into one space-separated string.
# NOTE: this rebinds count_vect and doc_term_matrix, replacing the
# description-based objects created earlier in the notebook.
# NOTE(review): multi-word skills appear to be split into separate tokens
# (the printed topics contain e.g. 'machine' and 'learning') — confirm intended.
count_vect = CountVectorizer(max_df=0.8, min_df=5, stop_words='english')
doc_term_matrix = count_vect.fit_transform(df_dsj['skills'].apply(lambda x: ' '.join(x)).values.astype('U'))
doc_term_matrix

<2896x809 sparse matrix of type '<class 'numpy.int64'>'
	with 69661 stored elements in Compressed Sparse Row format>

In [136]:
# Fit a fresh 5-topic LDA on the current doc_term_matrix; this rebinds LDA.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [137]:
# Resolve the vocabulary once. The original rebuilt it for every printed word,
# and get_feature_names() no longer exists in recent scikit-learn releases, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

# For each topic, print the 30 highest-weighted terms (ascending weight order).
for i, topic in enumerate(LDA.components_):
    print(f'Top 30 words for topic #{i+1}:')
    print([feature_names[j] for j in topic.argsort()[-30:]])
    print('\n')

Top 30 words for topic #1:
['collaboration', 'mathematics', 'medicine', 'documentation', 'sas', 'healthcare', 'access', 'analytics', 'technology', 'administration', 'economics', 'design', 'security', 'database', 'training', 'presentation', 'reporting', 'leadership', 'statistical', 'statistics', 'business', 'microsoft', 'research', 'science', 'information', 'communication', 'analytical', 'management', 'analysis', 'data']


Top 30 words for topic #2:
['ai', 'development', 'artificial', 'computing', 'deep', 'communication', 'physics', 'analytical', 'security', 'design', 'predictive', 'language', 'processing', 'python', 'statistical', 'statistics', 'model', 'mathematics', 'modelling', 'software', 'programming', 'research', 'analysis', 'algorithm', 'machine', 'computer', 'engineering', 'science', 'learning', 'data']


Top 30 words for topic #3:
['user', 'bi', 'sql', 'leadership', 'microsoft', 'product', 'visualization', 'experience', 'market', 'software', 'analytics', 'language', 'communica

## Managerial Roles

In [96]:
# Jobs whose title contains 'manager' (case-insensitive match).
# The original row labels of df_dsj are kept, so the index is not contiguous.
df3 = df_dsj.loc[df_dsj['title'].str.contains('manager', case=False)]
df3.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
105,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'","[Applied Statistics, Automation, Cloud Service..."
136,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'","[Agile, Architecture, Business Analysis, Busin..."
202,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'","[Apache Spark, Business Intelligence (BI), Dat..."
208,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'","[Algorithm, Array, Artificial Intelligence (AI..."
234,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'","[Analytical, Business Analysis, Collaboration,..."


In [97]:
# Tally every skill occurrence across the managerial postings in df3.
skill_count = {}
for row_skills in df3['skills']:
    for skill in row_skills:
        skill_count[skill] = skill_count.get(skill, 0) + 1

# Reorder by count, highest first; the sort is stable, so ties keep the order
# in which the skills were first seen.
skill_count = dict(sorted(skill_count.items(), key=lambda pair: pair[1], reverse=True))
skill_count

{'Analytical': 120,
 'Leadership': 109,
 'Data Science': 91,
 'Communication': 87,
 'Computer Science': 86,
 'Design': 84,
 'Engineering': 79,
 'Platform': 75,
 'Project Management': 75,
 'Market': 75,
 'Innovation': 73,
 'Collaboration': 70,
 'Machine Learning': 67,
 'Research': 66,
 'Presentation': 64,
 'Training': 61,
 'Artificial Intelligence (AI)': 60,
 'Sales': 55,
 'Reporting': 54,
 'Marketing': 52,
 'Statistics': 46,
 'Problem Solving': 42,
 'Management': 41,
 'Documentation': 41,
 'Analysis': 40,
 'Law': 39,
 'TestNG': 38,
 'Mathematics': 36,
 'Architecture': 34,
 'Decision Making': 34,
 'Information Technology (IT)': 34,
 'Testing': 34,
 'Budget': 32,
 'Product Management': 32,
 'English': 31,
 'Written Communication': 30,
 'Finance': 29,
 'Database': 29,
 'Agile': 28,
 'Business Development': 28,
 'Business Intelligence (BI)': 27,
 'Consulting': 27,
 'R': 27,
 'Data Analytics': 25,
 'Microsoft Access': 25,
 'Algorithm': 24,
 'Email': 24,
 'Network': 24,
 'Product Development