In [1]:
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt 
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime as dt
from secrets import secrets
from skill_api import extract_skills, extract_ignore

In [2]:
# Master skill vocabulary: the distinct values of the 'Skill' column.
df_skills = pd.read_csv('skills_db2/skill.csv')
SKILLS = df_skills['Skill'].unique().tolist()

# Redundant skills: read from the workbook's default (first) sheet.
df_redskills = pd.read_excel('skills/Other Skills.xlsx')
RED_SKILLS = df_redskills['Skill'].unique().tolist()

# Duplicate skills: the 'Duplicates' sheet maps each 'Skill' to its 'Parent'.
df_dupskills = pd.read_excel('skills/Other Skills.xlsx', sheet_name='Duplicates')
DUP_SKILLS = df_dupskills.set_index('Skill')['Parent'].to_dict()

# The duplicate spellings are appended to the vocabulary as well.
SKILLS += list(DUP_SKILLS)

In [4]:
# SQL template used by extract_df: the `{}` placeholder is filled with a table
# name through str.format. Only rows with Date_Posted after 2020-04-01 and a
# Description different from the literal 'No Description' are selected.
query = """
select Job_ID, Job_Title, Company, Country, Date_Posted, Description, Keywords_Present, Title_Keywords 
from {}
where Date_Posted > '2020-04-01'
and Description != 'No Description'
"""

def extract_df(job):
    """Read one job table and add a keyword count per posting.

    The module-level ``query`` template is formatted with ``job`` (used as
    the table name) and executed on the module-level ``engine``. Exact
    duplicate rows are dropped, then ``Num_Skill`` is set to the number of
    comma-separated entries in ``Keywords_Present`` (0 where that column is
    null). The resulting row count is printed.

    Parameters
    ----------
    job : str
        Table name substituted into the SQL template.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated query result with the extra ``Num_Skill`` column.
    """
    def _keyword_total(keywords):
        # Entries are comma-separated inside a single string.
        return len(keywords.split(','))

    df = pd.read_sql(query.format(job), engine)
    df.drop_duplicates(inplace=True)

    has_keywords = df['Keywords_Present'].notna()
    df['Num_Skill'] = 0
    df.loc[has_keywords, 'Num_Skill'] = (
        df.loc[has_keywords, 'Keywords_Present'].apply(_keyword_total)
    )

    print('Number of rows', len(df))
    return df

def extract_skills_all(df):
    """Run skill extraction over every posting in ``df``.

    For each row, ``Job_Title`` and ``Description`` are joined with a space
    and passed to ``extract_skills`` together with ``SKILLS``; the result is
    passed to ``extract_ignore`` with ``RED_SKILLS`` and ``DUP_SKILLS``, and
    the first element of what that returns is kept and sorted in place.
    Progress is printed roughly every 5% of the rows, followed by the total
    elapsed time.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Job_Title``, ``Company``, ``Country``,
        ``Date_Posted``, ``Description`` and ``Keywords_Present``.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the columns ``title``, ``company``,
        ``country``, ``date_posted``, ``description``, ``indeed_skills`` and
        ``skills`` (the sorted kept skills).
    """
    job_info_list = []
    initial = dt.now()
    interval = dt.now()
    # Clamp to 1: with fewer than 20 rows the integer division yields 0 and
    # `i % print_every` would raise ZeroDivisionError on the first row.
    print_every = max(1, len(df) // 20)
    for i, (_, job) in enumerate(df.iterrows(), start=1):
        if i % print_every == 0:
            print("{} jobs processed. Time taken: {}".format(i, dt.now() - interval))
            interval = dt.now()
        all_info = job['Job_Title'] + ' ' + job['Description']
        all_skills = extract_skills(all_info, SKILLS)
        # Only the first element returned by extract_ignore is used here.
        keep_skills, _ = extract_ignore(all_skills, RED_SKILLS, DUP_SKILLS)
        keep_skills.sort()
        job_info_list.append({
            'title': job['Job_Title'],
            'company': job['Company'],
            'country': job['Country'],
            'date_posted': job['Date_Posted'],
            'description': job['Description'],
            'indeed_skills': job['Keywords_Present'],
            'skills': keep_skills,
        })
    print("Total time taken: {}".format(dt.now() - initial))
    return pd.DataFrame(job_info_list)

In [5]:
# Build the SQLAlchemy engine from the connection string stored under
# secrets['indeed_db'].
# NOTE(review): `secrets` comes from `from secrets import secrets` at the top of
# the notebook, which looks like a project-local module shadowing the stdlib
# `secrets` module - confirm.
engine = create_engine(secrets['indeed_db'])
# Load the 'Data_Scientist' table through the shared query template.
df_ds = extract_df('Data_Scientist')
df_ds.head()

Number of rows 2896


Unnamed: 0,Job_ID,Job_Title,Company,Country,Date_Posted,Description,Keywords_Present,Title_Keywords,Num_Skill
2,4b5160535b55a659,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C','Junior',1
3,d5326d3f749525a9,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'",,3
7,f2a0194148860a86,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'",,3
10,fcabe7775b900c1c,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'",'Senior',3
16,5e84b2024b6813c2,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'",,3


In [6]:
# The engine is disposed before the long extraction step starts.
engine.dispose()
# Skill extraction over every loaded posting. The recorded output of this cell
# shows a total of about 2h13m for 2,896 rows, and nothing here caches the
# result, so a full re-run of the notebook repeats that cost.
df_dsj = extract_skills_all(df_ds)
df_dsj.head()

144 jobs processed. Time taken: 0:06:21.666809
288 jobs processed. Time taken: 0:06:03.697011
432 jobs processed. Time taken: 0:06:08.532298
576 jobs processed. Time taken: 0:05:37.191274
720 jobs processed. Time taken: 0:06:11.201109
864 jobs processed. Time taken: 0:06:10.987541
1008 jobs processed. Time taken: 0:06:39.284815
1152 jobs processed. Time taken: 0:07:07.009025
1296 jobs processed. Time taken: 0:06:38.922956
1440 jobs processed. Time taken: 0:07:19.292091
1584 jobs processed. Time taken: 0:08:23.135630
1728 jobs processed. Time taken: 0:06:35.178270
1872 jobs processed. Time taken: 0:09:03.286620
2016 jobs processed. Time taken: 0:06:46.331302
2160 jobs processed. Time taken: 0:06:09.596201
2304 jobs processed. Time taken: 0:05:39.781271
2448 jobs processed. Time taken: 0:06:32.337418
2592 jobs processed. Time taken: 0:05:15.511003
2736 jobs processed. Time taken: 0:06:29.782548
2880 jobs processed. Time taken: 0:07:09.106723
Total time taken: 2:13:09.571922


Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware..."
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset..."
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization..."
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar..."
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica..."


In [7]:
df_dsj.to_csv('indeed-insights/data_scientist_3.csv', index=False)

## K Means Clustering

In [10]:
# Vocabulary of every distinct extracted skill, in first-seen order.
# dict.fromkeys de-duplicates in a single pass while keeping insertion order;
# the previous `if s not in skills` check rescanned the growing list for every
# skill of every posting (quadratic in the vocabulary size).
skills = list(dict.fromkeys(
    s for job_skills in df_dsj['skills'] for s in job_skills
))

len(skills)

1516

In [52]:
# One-hot encode each posting's skills against the `skills` vocabulary:
# X[r, c] == 1 when posting r lists vocabulary entry c.
job_skills = df_dsj['skills'].tolist()

# Resolve skill -> column once. Calling skills.index(s) inside the loop
# rescanned the whole vocabulary for every skill of every posting.
skill_to_col = {}
for col, name in enumerate(skills):
    # First occurrence wins, matching list.index semantics.
    skill_to_col.setdefault(name, col)

# Allocate the full matrix up front so an empty input still yields a 2-D
# array of shape (0, len(skills)). A skill missing from the vocabulary now
# raises KeyError instead of ValueError.
X = np.zeros((len(job_skills), len(skills)))
for row, job in enumerate(job_skills):
    for s in job:
        X[row, skill_to_col[s]] = 1

X.shape

(2896, 1516)

In [58]:
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans

# Disabled DBSCAN experiments, kept for reference. `dist_func` is not defined
# anywhere in the visible notebook, so the first line would fail if re-enabled.
# clustering = DBSCAN(metric=dist_func).fit(X)
# clustering = DBSCAN().fit(X)
# Cluster the one-hot skill matrix X into 5 groups; random_state is fixed so
# repeated runs start from the same initialisation.
clustering = KMeans(n_clusters=5, random_state=42).fit(X)
clustering

KMeans(n_clusters=5, random_state=42)

In [59]:
clustering.labels_

array([4, 1, 4, ..., 0, 4, 2], dtype=int32)

In [111]:
# Work on a copy so df_dsj itself is left without the label column.
df2 = df_dsj.copy()
# labels_ has one entry per row of X, and X was built from df_dsj['skills'] in
# row order, so the labels are assigned positionally.
df2['cluster'] = clustering.labels_
df2.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,cluster
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware...",4
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset...",1
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization...",4
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar...",2
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica...",2


In [112]:
df2['cluster'].unique()

array([4, 1, 2, 3, 0], dtype=int32)

In [203]:
def get_skill_count(df, cluster, column='cluster', top_n=30, exclude=('TestNG',)):
    """Print the most frequent skills among the rows of one group.

    Rows where ``df[column] == cluster`` are selected, every entry of their
    ``skills`` lists is counted, and the skills are printed as a numbered
    list from most to least frequent (ties keep first-seen order).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a ``skills`` column holding iterables of
        skill names.
    cluster : object
        Group value to select.
    column : str, optional
        Column holding the group assignment. Defaults to ``'cluster'``.
    top_n : int, optional
        Maximum number of skills to print. Defaults to 30.
    exclude : iterable of str, optional
        Skill names dropped from the ranking. Defaults to ``('TestNG',)``.

    Returns
    -------
    None
        The ranking is written to stdout only.
    """
    skill_count = {}
    for job_skills in df.loc[df[column] == cluster, 'skills']:
        for s in job_skills:
            skill_count[s] = skill_count.get(s, 0) + 1

    ranked = sorted(skill_count.items(), key=lambda item: item[1], reverse=True)
    # Filter instead of list.remove(): remove() raises ValueError whenever the
    # selected group happens not to contain an excluded skill.
    kept = [name for name, _ in ranked if name not in exclude]

    for i, s in enumerate(kept[:top_n]):
        print('{}. {}'.format(i + 1, s))

In [204]:
get_skill_count(df2, 0)

1. Testing
2. Analytical
3. Training
4. Communication
5. Design
6. Computer Science
7. Problem Solving
8. Research
9. Documentation
10. Market
11. Collaboration
12. Platform
13. Electronics
14. Analysis
15. Innovation
16. Law
17. Data Analysis
18. Research and Development (R&D)
19. Artificial Intelligence (AI)
20. Data Science
21. Engineering
22. Reporting
23. Machine Learning
24. Leadership
25. Database
26. Sales
27. Security
28. Management
29. Protocol
30. Information Technology (IT)


In [205]:
get_skill_count(df2, 1)

1. Machine Learning
2. Computer Science
3. Data Science
4. Artificial Intelligence (AI)
5. Analytical
6. Research
7. Engineering
8. Algorithm
9. Platform
10. Python
11. Design
12. Statistics
13. Leadership
14. Mathematics
15. Communication
16. Innovation
17. Collaboration
18. Deep Learning
19. R
20. Security
21. Analysis
22. Modelling
23. Programming
24. Software Engineering
25. Training
26. Data Analysis
27. Law
28. Problem Solving
29. Big Data
30. Data Analytics


In [206]:
get_skill_count(df2, 2)

1. Analytical
2. Statistics
3. Computer Science
4. Mathematics
5. Data Science
6. Reporting
7. Statistical Analysis
8. Training
9. Data Analysis
10. Research
11. Design
12. Leadership
13. Presentation
14. Visualization
15. Communication
16. Problem Solving
17. Innovation
18. Collaboration
19. Data Management
20. Engineering
21. Dataset
22. Processing
23. Documentation
24. R
25. Microsoft Access
26. Programming
27. Biostatistics
28. Management
29. Data Visualization
30. Law


In [207]:
get_skill_count(df2, 3)

1. Analytical
2. Communication
3. Market
4. Leadership
5. Training
6. Analysis
7. Design
8. Presentation
9. Innovation
10. Sales
11. Collaboration
12. Research
13. Platform
14. Reporting
15. Problem Solving
16. Project Management
17. Law
18. Marketing
19. Network
20. Data Science
21. Management
22. Microsoft Excel
23. English
24. Life Science
25. Writing
26. Manufacturing
27. Research and Development (R&D)
28. Engineering
29. Insurance
30. Artificial Intelligence (AI)


In [208]:
get_skill_count(df2, 4)

1. Research
2. Design
3. Analytical
4. Training
5. Communication
6. Analysis
7. Data Analysis
8. Data Science
9. Platform
10. Management
11. Computer Science
12. Artificial Intelligence (AI)
13. Engineering
14. Database
15. Leadership
16. Written Communication
17. Information Technology (IT)
18. Security
19. Innovation
20. Statistics
21. Reporting
22. Documentation
23. Email
24. Microsoft Access
25. Research and Development (R&D)
26. Python
27. Marketing
28. Writing
29. R
30. Market


In [166]:
122/1010, 113/1010, 110/1010, 105/1010, 100/1010, 88/1010

(0.12079207920792079,
 0.11188118811881188,
 0.10891089108910891,
 0.10396039603960396,
 0.09900990099009901,
 0.08712871287128712)

## Topic Modeling

In [147]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words settings: English stop words are removed, terms present in more
# than 80% of the documents (max_df=0.8) are dropped, and so are terms present
# in fewer than 5 documents (min_df=5).
count_vect = CountVectorizer(max_df=0.8, min_df=5, stop_words='english')

# astype('U') converts every description to a unicode string before counting.
doc_term_matrix = count_vect.fit_transform(df_dsj['description'].values.astype('U'))

In [148]:
doc_term_matrix

<2896x8872 sparse matrix of type '<class 'numpy.int64'>'
	with 531512 stored elements in Compressed Sparse Row format>

In [149]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a 5-topic LDA model on the description term counts; random_state is fixed
# so the fit is repeatable.
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=5, random_state=42)

In [150]:
# Print the highest-weighted vocabulary terms of each LDA topic.
N_TOP_WORDS = 30

# Resolve the vocabulary once instead of once per printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever this installation provides.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for topic_no, topic in enumerate(LDA.components_, start=1):
    print(f'Top {N_TOP_WORDS} words for topic #{topic_no}:')
    # argsort is ascending, so the last N positions are the heaviest terms
    # (the strongest term is printed last).
    print([feature_names[j] for j in topic.argsort()[-N_TOP_WORDS:]])
    print('\n')

Top 30 words for topic #1:
['level', 'required', 'food', 'support', 'knowledge', 'sciences', 'working', 'life', 'including', 'role', 'years', 'related', 'company', 'technical', 'science', 'scientist', 'lab', 'skills', 'quality', 'process', 'molecular', 'cell', 'new', 'development', 'biology', 'team', 'laboratory', 'scientific', 'work', 'research']


Top 30 words for topic #2:
['following', 'related', 'qualifications', 'university', 'include', 'support', 'level', 'service', 'including', 'national', 'status', 'requirements', 'degree', 'employees', 'science', 'federal', 'job', 'management', 'time', 'applicants', 'provide', 'application', 'program', 'employment', 'education', 'required', 'research', 'work', 'position', 'information']


Top 30 words for topic #3:
['analytical', 'requirements', 'solutions', 'provide', 'complex', 'including', 'technical', 'develop', 'ability', 'degree', 'knowledge', 'information', 'project', 'programming', 'management', 'skills', 'research', 'health', 'analyt

In [151]:
topic_values = LDA.transform(doc_term_matrix)
print(topic_values.shape)
topic_values[1]

(2896, 5)


array([0.00106561, 0.04520808, 0.13743424, 0.81522425, 0.00106781])

In [152]:
df2['topic'] = topic_values.argmax(axis=1)
df2.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,cluster,topic
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware...",4,2
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset...",1,3
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization...",4,1
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar...",2,3
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica...",2,2


In [164]:
df2['cluster'].value_counts()

4    1010
1     563
3     546
2     398
0     379
Name: cluster, dtype: int64

In [161]:
df2['cluster'].value_counts(normalize=True)

4    0.348757
1    0.194406
3    0.188536
2    0.137431
0    0.130870
Name: cluster, dtype: float64

In [165]:
df2['topic'].value_counts()

3    1060
2     655
1     452
4     422
0     307
Name: topic, dtype: int64

In [163]:
df2['topic'].value_counts(normalize=True)

3    0.366022
2    0.226174
1    0.156077
4    0.145718
0    0.106008
Name: topic, dtype: float64

In [211]:
def get_skill_count(df, topic, column='topic', top_n=30, exclude=('TestNG',)):
    """Print the most frequent skills among the rows of one topic.

    Rows where ``df[column] == topic`` are selected, every entry of their
    ``skills`` lists is counted, and the skills are printed as a numbered
    list from most to least frequent (ties keep first-seen order).

    Note: this definition replaces the earlier ``get_skill_count`` that
    filtered on the ``cluster`` column. After this cell runs, pass
    ``column='cluster'`` to reproduce the per-cluster listing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``column`` and a ``skills`` column holding iterables of
        skill names.
    topic : object
        Group value to select.
    column : str, optional
        Column holding the group assignment. Defaults to ``'topic'``.
    top_n : int, optional
        Maximum number of skills to print. Defaults to 30.
    exclude : iterable of str, optional
        Skill names dropped from the ranking. Defaults to ``('TestNG',)``.

    Returns
    -------
    None
        The ranking is written to stdout only.
    """
    skill_count = {}
    for job_skills in df.loc[df[column] == topic, 'skills']:
        for s in job_skills:
            skill_count[s] = skill_count.get(s, 0) + 1

    ranked = sorted(skill_count.items(), key=lambda item: item[1], reverse=True)
    # Filter instead of list.remove(): remove() raises ValueError whenever the
    # selected group happens not to contain an excluded skill.
    kept = [name for name, _ in ranked if name not in exclude]

    for i, s in enumerate(kept[:top_n]):
        print('{}. {}'.format(i + 1, s))

In [212]:
get_skill_count(df2, 0)

1. Research
2. Design
3. Training
4. Communication
5. Molecular Biology
6. Analytical
7. Research and Development (R&D)
8. Life Science
9. Protocol
10. Biochemistry
11. Innovation
12. Testing
13. Documentation
14. Collaboration
15. Chemistry
16. Problem Solving
17. Data Analysis
18. Analysis
19. Presentation
20. Biotechnology
21. Engineering
22. Interpreter
23. Biology
24. Writing
25. Leadership
26. Genomics
27. Database
28. Management
29. Platform
30. Polymerase chain reaction (PCR)


In [213]:
get_skill_count(df2, 1)

1. Research
2. Analytical
3. Training
4. Computer Science
5. Communication
6. Analysis
7. Security
8. Data Science
9. Microsoft Access
10. Statistics
11. Law
12. Email
13. Information Technology (IT)
14. Mathematics
15. Leadership
16. Machine Learning
17. Presentation
18. Artificial Intelligence (AI)
19. Collaboration
20. Administration
21. Engineering
22. Design
23. Database
24. Documentation
25. Data Analysis
26. Management
27. Market
28. Writing
29. Written Communication
30. Reporting


In [214]:
get_skill_count(df2, 2)

1. Analytical
2. Statistics
3. Data Science
4. Design
5. Computer Science
6. Reporting
7. Leadership
8. Communication
9. Research
10. Training
11. Mathematics
12. Data Analysis
13. Statistical Analysis
14. Presentation
15. Problem Solving
16. Visualization
17. Machine Learning
18. Collaboration
19. Analysis
20. Innovation
21. R
22. Healthcare
23. Documentation
24. Dataset
25. Database
26. Accuracy
27. Data Management
28. Engineering
29. Law
30. Information Technology (IT)


In [215]:
get_skill_count(df2, 3)

1. Analytical
2. Machine Learning
3. Computer Science
4. Data Science
5. Platform
6. Artificial Intelligence (AI)
7. Design
8. Leadership
9. Communication
10. Research
11. Engineering
12. Innovation
13. Python
14. Market
15. Training
16. Statistics
17. Collaboration
18. Algorithm
19. Problem Solving
20. Analysis
21. Mathematics
22. Law
23. Data Analysis
24. Testing
25. Software Engineering
26. Insurance
27. Sales
28. Decision Making
29. Agile
30. Deep Learning


In [216]:
get_skill_count(df2, 4)

1. Communication
2. Design
3. Sales
4. Market
5. Research
6. Analytical
7. Training
8. Presentation
9. Testing
10. Analysis
11. Electronics
12. Innovation
13. Research and Development (R&D)
14. Marketing
15. Manufacturing
16. Platform
17. Reporting
18. Collaboration
19. Leadership
20. Problem Solving
21. Management
22. Network
23. Data Analysis
24. Semiconductors
25. Documentation
26. Project Management
27. English
28. Engineering
29. Artificial Intelligence (AI)
30. Insurance


## Topic Modeling (30 Topics)

In [167]:
# Re-vectorize using the extracted skill lists instead of the raw descriptions.
# NOTE(review): this rebinds count_vect and doc_term_matrix, replacing the
# description-based objects created in the earlier Topic Modeling section.
count_vect = CountVectorizer(max_df=0.8, min_df=5, stop_words='english')
# Each posting's skills are joined with spaces into a single string, so the
# vectorizer counts the individual words of multi-word skills (the topic output
# below lists 'machine' and 'learning' separately) rather than whole skill names.
doc_term_matrix = count_vect.fit_transform(df_dsj['skills'].apply(lambda x: ' '.join(x)).values.astype('U'))
doc_term_matrix

<2896x809 sparse matrix of type '<class 'numpy.int64'>'
	with 69661 stored elements in Compressed Sparse Row format>

In [169]:
LDA = LatentDirichletAllocation(n_components=30, random_state=42)
LDA.fit(doc_term_matrix)

LatentDirichletAllocation(n_components=30, random_state=42)

In [170]:
# Print the highest-weighted vocabulary terms of each of the 30 LDA topics.
# The heading previously said "Top 10" while 30 words were printed; a single
# constant now drives both the heading and the slice.
N_TOP_WORDS = 30

# Resolve the vocabulary once instead of once per printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever this installation provides.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()

for topic_no, topic in enumerate(LDA.components_, start=1):
    print(f'Top {N_TOP_WORDS} words for topic #{topic_no}:')
    # argsort is ascending, so the last N positions are the heaviest terms
    # (the strongest term is printed last).
    print([feature_names[j] for j in topic.argsort()[-N_TOP_WORDS:]])
    print('\n')

Top 10 words for topic #1:
['critical', 'electronics', 'healthcare', 'thinking', 'written', 'audit', 'professional', 'artificial', 'leadership', 'communication', 'risk', 'big', 'brand', 'informatics', 'analytical', 'ai', 'consulting', 'analysis', 'certified', 'database', 'systems', 'bi', 'analytics', 'business', 'intelligence', 'security', 'technology', 'management', 'data', 'information']


Top 10 words for topic #2:
['neural', 'applied', 'design', 'computing', 'classification', 'communication', 'signal', 'network', 'scripting', 'visualization', 'feature', 'programming', 'big', 'statistics', 'research', 'analysis', 'spark', 'modelling', 'language', 'processing', 'mathematics', 'python', 'apache', 'computer', 'engineering', 'algorithm', 'machine', 'science', 'learning', 'data']


Top 10 words for topic #3:
['tool', 'software', 'leadership', 'statistics', 'operations', 'innovation', 'big', 'problem', 'solving', 'platform', 'communication', 'design', 'computer', 'marketing', 'artificial'

['sales', 'reinforcement', 'collaboration', 'analysis', 'statistics', 'platform', 'research', 'internet', 'testng', 'mining', 'testing', 'microsoft', 'python', 'modelling', 'analytical', 'design', 'programming', 'azure', 'algorithm', 'engineering', 'deep', 'computer', 'nlp', 'natural', 'science', 'processing', 'data', 'language', 'machine', 'learning']


Top 10 words for topic #30:
['customer', 'technical', 'estate', 'sentry', 'pharmacology', 'organization', 'cro', 'real', 'contract', 'design', 'chemistry', 'organizational', 'skill', 'protocol', 'management', 'life', 'analysis', 'database', 'critical', 'documentation', 'training', 'thinking', 'research', 'analytical', 'science', 'communication', 'problem', 'solving', 'testng', 'testing']




In [171]:
topic_values = LDA.transform(doc_term_matrix)
print(topic_values.shape)
topic_values[1]

(2896, 30)


array([0.00119048, 0.00119048, 0.00119048, 0.16895865, 0.12994958,
       0.00119048, 0.00119048, 0.00119048, 0.00119048, 0.00119048,
       0.00119048, 0.00119048, 0.22687734, 0.00119048, 0.00119048,
       0.00119048, 0.00119048, 0.00119048, 0.00119048, 0.00119048,
       0.00119048, 0.00119048, 0.29811947, 0.00119048, 0.00119048,
       0.00119048, 0.00119048, 0.00119048, 0.14633306, 0.00119048])

In [172]:
df2['topic_30'] = topic_values.argmax(axis=1)
df2.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills,cluster,topic,topic_30
0,Junior- Mid level Data Scientist openings,ClientSolv Technologies,USA,2020-05-08,Company DescriptionClientSolv Technologies is ...,'C',"[Analytical, Business Administration, Hardware...",4,2,0
1,Data Scientist,Triplebyte,USA,2020-05-07,About TriplebyteWe are a platform that helps e...,"'C', 'Machine Learning', 'R'","[Bayesian Inference, Data Acquisition, Dataset...",1,3,22
2,"Data Scientist/Epidemiologist, Activity #2",Heluna Health,USA,2020-04-23,Project: LA County Novel Coronavirus (COVID-19...,"'C', 'R', 'SQL'","[Analysis, Data Management, Data Visualization...",4,1,12
3,Senior Data Scientist,SAIC,USA,2020-05-08,Job DescriptionDescriptionSAIC is seeking a re...,"'C', 'R', 'Statistics'","[Agile, Analytical, Business Process, Dashboar...",2,3,18
4,Data Scientist,Intrepid,USA,2020-04-10,Intrepid sets the standard for delivering exce...,"'C', 'Python', 'R'","[Administration, Administrative Law, Analytica...",2,2,0


In [190]:
# Summarise each of the 30 LDA topics: number and share of postings assigned to
# it, plus its ten most frequent extracted skills.
# NOTE: temp_dict is a list of per-topic dicts despite its name, and the
# 'job_precent' key is misspelled; both are kept unchanged in case later cells
# rely on them.
temp_dict = []

for i in range(30):
    df_temp = df2.loc[df2['topic_30'] == i]
    skill_count = {}
    for job_skills in df_temp['skills']:
        for s in job_skills:
            skill_count[s] = skill_count.get(s, 0) + 1
    skill_count = sorted(skill_count.items(), key=lambda item: item[1], reverse=True)
    # Use a dedicated name here: `skills` is the global vocabulary list that the
    # one-hot encoding cell indexes into, and rebinding it to a string would
    # break that cell if it were re-run after this one.
    top_skills = ', '.join(name for name, _ in skill_count[:10])
    temp_dict.append({
        'topic': i+1,
        'job_count': len(df_temp),
        'job_precent': round(len(df_temp) / len(df2) * 100, 2),
        'skills': top_skills
    })

temp_dict

[{'topic': 1,
  'job_count': 71,
  'job_precent': 2.45,
  'skills': 'Analytical, Information Technology (IT), Data Analytics, Artificial Intelligence (AI), Business Intelligence (BI), Database, Leadership, Analysis, Brand Management, Certified Information Systems Security Professional (CISSP)'},
 {'topic': 2,
  'job_count': 127,
  'job_precent': 4.39,
  'skills': 'Machine Learning, Data Science, Computer Science, Python, Algorithm, Statistics, Research, Engineering, Mathematics, Design'},
 {'topic': 3,
  'job_count': 153,
  'job_precent': 5.28,
  'skills': 'Machine Learning, Analytical, Data Science, Artificial Intelligence (AI), Engineering, Computer Science, Design, Platform, Problem Solving, Market'},
 {'topic': 4,
  'job_count': 51,
  'job_precent': 1.76,
  'skills': 'Statistics, Computer Science, Mathematics, Physics, Analytical, Research, Security, Engineering, Problem Solving, Training'},
 {'topic': 5,
  'job_count': 118,
  'job_precent': 4.07,
  'skills': 'Analytical, Informati

## Managerial Roles

In [96]:
df3 = df_dsj.loc[df_dsj['title'].str.contains('manager', case=False)]
df3.head()

Unnamed: 0,title,company,country,date_posted,description,indeed_skills,skills
105,Principal Data Scientist Manager - Cloud Hardw...,Microsoft,USA,2020-05-02,To delight our customers in a Cloud First worl...,"'C', 'R'","[Applied Statistics, Automation, Cloud Service..."
136,Arity-Data Scientist-Sr Manager,Allstate,USA,2020-04-10,"Founded by The Allstate Corporation in 2016, A...","'C', 'Excel', 'R'","[Agile, Architecture, Business Analysis, Busin..."
202,Project Manager - Data Science | Chicago IL,Photon,USA,2020-04-28,Project Manager - Data Science | Chicago IL - ...,"'C', 'R', 'SQL'","[Apache Spark, Business Intelligence (BI), Dat..."
208,Corporate Communication Manager - Artificial I...,TuSimple,USA,2020-05-11,"This role is located in San Diego, CA. Relocat...","'Artificial Intelligence', 'C', 'R'","[Algorithm, Array, Artificial Intelligence (AI..."
234,Project Manager (Data Science & Public Health),ICF,USA,2020-04-29,ICF seeks a Project Manager with a data scienc...,"'C', 'Excel', 'R'","[Analytical, Business Analysis, Collaboration,..."


In [97]:
# Tally how many manager postings (df3) mention each skill, then order the
# tally from most to least frequent; ties keep first-seen order.
skill_count = {}
for job_skills in df3['skills']:
    for s in job_skills:
        skill_count[s] = skill_count.get(s, 0) + 1
skill_count = dict(
    sorted(skill_count.items(), key=lambda pair: pair[1], reverse=True)
)
skill_count

{'Analytical': 120,
 'Leadership': 109,
 'Data Science': 91,
 'Communication': 87,
 'Computer Science': 86,
 'Design': 84,
 'Engineering': 79,
 'Platform': 75,
 'Project Management': 75,
 'Market': 75,
 'Innovation': 73,
 'Collaboration': 70,
 'Machine Learning': 67,
 'Research': 66,
 'Presentation': 64,
 'Training': 61,
 'Artificial Intelligence (AI)': 60,
 'Sales': 55,
 'Reporting': 54,
 'Marketing': 52,
 'Statistics': 46,
 'Problem Solving': 42,
 'Management': 41,
 'Documentation': 41,
 'Analysis': 40,
 'Law': 39,
 'TestNG': 38,
 'Mathematics': 36,
 'Architecture': 34,
 'Decision Making': 34,
 'Information Technology (IT)': 34,
 'Testing': 34,
 'Budget': 32,
 'Product Management': 32,
 'English': 31,
 'Written Communication': 30,
 'Finance': 29,
 'Database': 29,
 'Agile': 28,
 'Business Development': 28,
 'Business Intelligence (BI)': 27,
 'Consulting': 27,
 'R': 27,
 'Data Analytics': 25,
 'Microsoft Access': 25,
 'Algorithm': 24,
 'Email': 24,
 'Network': 24,
 'Product Development