In [9]:
import json
import os
import pandas as pd

In [88]:
# Create an empty list to store the job data
job_data = []

# Iterate through job data files and extract job information
for role_name in ["data scientist", "data analyst", "data engineer", "machine learning engineer"]:
    for company in ["apple", "google", "microsoft", "facebook", "tesla", "amazon", "UT Health Science Center at San Antonio"]:
        folder_path = f"raw_data/{role_name.replace(' ', '-')}/{company.replace(' ', '-')}"
        for filename in os.listdir(folder_path):
            with open(os.path.join(folder_path, filename), 'r') as f:
                job = json.load(f)
                job_data.append({
                    'id': job["id"],
                    'role': role_name,
                    'company': job["company"],
                    'description': job["description"]
                })

# Create a pandas DataFrame from the job data
df = pd.DataFrame(job_data)

print(f"There are {df.shape[0]} rows and {df.shape[1]} columns" + ".")
print("The four columns are " + ", ".join(df.columns) + ".")

print("Number of unique roles:", df["role"].nunique())
print("Number of unique companies:", df["company"].nunique())
role_counts = df["role"].value_counts().to_dict()

for role, count in role_counts.items():
    print(f"There are {count} {role} roles.")



There are 241 rows and 4 columns.
The four columns are id, role, company, description.
Number of unique roles: 4
Number of unique companies: 50
There are 65 data analyst roles.
There are 61 data engineer roles.
There are 60 machine learning engineer roles.
There are 55 data scientist roles.


In [17]:
import spacy
from collections import Counter

# Load the English language model for spaCy
nlp = spacy.load('en_core_web_sm')

# Combine all job descriptions into one long string
descriptions = " ".join(df["description"].values)

# Tokenize the combined string using spaCy
doc = nlp(descriptions)

# Create a counter object to count the frequency of each token
token_freq = Counter(token.text for token in doc if not token.is_stop and not token.is_punct and not token.is_space)

# Print the top 10 most common tokens
print(token_freq.most_common(10))


[('data', 2021), ('experience', 799), ('business', 635), ('Experience', 619), ('Data', 591), ('team', 511), ('work', 468), ('Apple', 392), ('role', 373), ('including', 371)]


In [41]:
import spacy
from collections import Counter

# Load the English language model for spaCy
nlp = spacy.load('en_core_web_sm')

# Combine all job descriptions into one long string
descriptions = " ".join(df["description"].values)

# Tokenize the combined string using spaCy
doc = nlp(descriptions)

# Create a counter object to count the frequency of each two-word phrase
phrase_freq = Counter((doc[i].text.lower(), doc[i+1].text.lower()) for i in range(len(doc)-1) if not doc[i].is_stop and not doc[i+1].is_stop and not doc[i].is_punct and not doc[i+1].is_punct and not doc[i].is_space and not doc[i+1].is_space)

# Print the top 10 most common two-word phrases
print(phrase_freq.most_common(10))


[(('machine', 'learning'), 354), (('base', 'pay'), 165), (('+', 'years'), 163), (('computer', 'science'), 144), (('data', 'science'), 142), (('employee', 'stock'), 99), (('preferred', 'qualifications'), 94), (('data', 'center'), 88), (('equal', 'employment'), 84), (('equal', 'opportunity'), 83)]


In [93]:
import re
from collections import Counter

# Define a dictionary of abbreviations
abbreviations = {
    "machine learning": "ML",
    "artificial intelligence": "AI",
    "natural language processing": "NLP"
}

# Define a list of phrases to search for
phrases = ["SQL", "R", "data mining", "python", "java", "machine learning", "ML", "natural language processing", "NLP", "deep learning", "data analytics", "predictive modeling", "forecasting", "business", "statistical analysis", "KPI", "artificial intelligence", "AI", "statistical modeling", "data management", "algorithms", "automation", "TensorFlow", "PyTorch", "big data", "SAS", "survival analysis", "time series", "theory", "inference", "Power BI", "Tableau"]

# Create a counter object to count the number of descriptions that contain each phrase
phrase_count = Counter()

# Iterate through each description and count the number of phrases it contains
for desc in df["description"].values:
    desc_phrases = set()
    for phrase in phrases:
        if re.search(rf"\b{phrase}\b", desc, re.IGNORECASE):
            desc_phrases.add(phrase)
    for phrase in desc_phrases:
        if phrase in abbreviations:
            phrase = abbreviations[phrase]
        phrase_count[phrase] += 1

# Print the phrases and the number of descriptions that contain them in descending order
total_count = 0

for phrase, count in phrase_count.most_common():
    print(f"{phrase}: {count}")
    total_count += count

print(f"\nTotal count: {total_count}")


business: 167
ML: 165
python: 147
SQL: 123
AI: 54
R: 52
algorithms: 51
big data: 49
Tableau: 49
java: 46
deep learning: 44
data analytics: 42
data mining: 38
NLP: 38
data management: 34
TensorFlow: 33
automation: 24
Power BI: 20
PyTorch: 20
forecasting: 19
statistical analysis: 17
predictive modeling: 10
KPI: 8
time series: 8
statistical modeling: 7
SAS: 7
inference: 7
theory: 6
survival analysis: 1

Total count: 1286


In [97]:
import re
from collections import Counter

# Define a dictionary of abbreviations
abbreviations = {
    "machine learning": "ML",
    "artificial intelligence": "AI",
    "natural language processing": "NLP"
}

# Define a list of phrases to search for
phrases = ["Excel", "SQL", "R", "data mining", "python", "java", "machine learning", "ML", "natural language processing", "NLP", "deep learning", "data analytics", "predictive modeling", "forecasting", "business", "statistical analysis", "KPI", "artificial intelligence", "AI", "statistical modeling", "data management", "algorithms", "automation", "TensorFlow", "PyTorch", "big data", "SAS", "survival analysis", "time series", "inference", "Power BI", "Tableau"]

# Create a counter object to count the number of descriptions that contain each phrase
phrase_count = Counter()

# Iterate through each description and count the number of phrases it contains
for desc in df["description"].values:
    desc_phrases = set()
    for phrase in phrases:
        if re.search(rf"\b{phrase}\b", desc, re.IGNORECASE):
            desc_phrases.add(phrase)
    for phrase in desc_phrases:
        if phrase in abbreviations:
            phrase = abbreviations[phrase]
        phrase_count[phrase] += 1

# Print the phrases and the percentage of descriptions that contain them in descending order
total_count = len(df)

for phrase, count in phrase_count.most_common():
    score = count / total_count
    print(f"{phrase}: {score:.2%}")

print(f"\nTotal count: {total_count}")


business: 69.29%
ML: 68.46%
python: 61.00%
SQL: 51.04%
AI: 22.41%
R: 21.58%
algorithms: 21.16%
big data: 20.33%
Tableau: 20.33%
java: 19.09%
deep learning: 18.26%
data analytics: 17.43%
data mining: 15.77%
NLP: 15.77%
data management: 14.11%
TensorFlow: 13.69%
Excel: 11.62%
automation: 9.96%
Power BI: 8.30%
PyTorch: 8.30%
forecasting: 7.88%
statistical analysis: 7.05%
predictive modeling: 4.15%
KPI: 3.32%
time series: 3.32%
statistical modeling: 2.90%
SAS: 2.90%
inference: 2.90%
survival analysis: 0.41%

Total count: 241


In [94]:
import re
from collections import Counter

# Define a dictionary of abbreviations
abbreviations = {
    "machine learning": "ML",
    "artificial intelligence": "AI",
    "natural language processing": "NLP"
}

# Define a list of phrases to search for
phrases = ["SQL", "R", "data mining", "python", "java", "machine learning", "ML", "natural language processing", "NLP", "deep learning", "data analytics", "predictive modeling", "forecasting", "business", "statistical analysis", "KPI", "artificial intelligence", "AI", "statistical modeling", "data management", "algorithms", "automation", "TensorFlow", "PyTorch", "big data", "SAS", "survival analysis", "time series", "theory", "inference", "Power BI", "Tableau"]

# Create a counter object to count the number of descriptions that contain each phrase
phrase_count = Counter()

# Iterate through each description and count the number of phrases it contains
for desc in df["description"].values:
    desc_phrases = set()
    for phrase in phrases:
        if re.search(rf"\b{phrase}\b", desc, re.IGNORECASE):
            desc_phrases.add(phrase)
    for phrase in desc_phrases:
        if phrase in abbreviations:
            phrase = abbreviations[phrase]
        phrase_count[phrase] += 1

# Print the phrases, the number of descriptions that contain them, and their score in descending order
total_count = sum(phrase_count.values())

for phrase, count in phrase_count.most_common():
    score = count / total_count
    print(f"{phrase}: {count} ({score:.2%})")

print(f"\nTotal count: {total_count}")


business: 167 (12.99%)
ML: 165 (12.83%)
python: 147 (11.43%)
SQL: 123 (9.56%)
AI: 54 (4.20%)
R: 52 (4.04%)
algorithms: 51 (3.97%)
big data: 49 (3.81%)
Tableau: 49 (3.81%)
java: 46 (3.58%)
deep learning: 44 (3.42%)
data analytics: 42 (3.27%)
data mining: 38 (2.95%)
NLP: 38 (2.95%)
data management: 34 (2.64%)
TensorFlow: 33 (2.57%)
automation: 24 (1.87%)
Power BI: 20 (1.56%)
PyTorch: 20 (1.56%)
forecasting: 19 (1.48%)
statistical analysis: 17 (1.32%)
predictive modeling: 10 (0.78%)
KPI: 8 (0.62%)
time series: 8 (0.62%)
statistical modeling: 7 (0.54%)
SAS: 7 (0.54%)
inference: 7 (0.54%)
theory: 6 (0.47%)
survival analysis: 1 (0.08%)

Total count: 1286


In [65]:
import re
from collections import Counter

# Define a list of phrases to search for
phrases = ["SQL", "R", "data mining", "python", "java", "machine learning", "ML", "natural language processing", "NLP", "deep learning", "data analytics", "predictive modeling", "forecasting", "business", "statistical analysis", "KPI", "artificial intelligence", r"\bAI\b", "statistical modeling", "data management", "algorithms", "automation", "TensorFlow", "PyTorch", "big data", "SAS", "survival analysis", "time series"]

# Create a counter object to count the number of descriptions that contain each phrase
phrase_count = Counter()

# Iterate through each description associated with the desired company and count the number of phrases it contains
for desc in df[df["company"] == "UT Health San Antonio"]["description"].values:
    desc_phrases = set()
    for phrase in phrases:
        if re.search(rf"\b{phrase}\b", desc, re.IGNORECASE):
            desc_phrases.add(phrase)
    for phrase in desc_phrases:
        phrase_count[phrase] += 1

# Print the phrases and the number of descriptions that contain them in descending order
for phrase, count in phrase_count.most_common():
    print(f"{phrase}: {count}")


data analytics: 5
business: 5
predictive modeling: 4
data management: 4
KPI: 4
SQL: 4
forecasting: 3
SAS: 2
R: 2
java: 2
python: 2
data mining: 2
statistical analysis: 1


In [99]:
import spacy
from collections import Counter

# Load the English language model for spaCy
nlp = spacy.load('en_core_web_sm')

# Create an empty dictionary to store token counts for each job role
token_counts_by_role = {}

# Iterate over each job role
for role in df["role"].unique():
    # Combine all job descriptions for the current role into one long string
    descriptions = " ".join(df[df["role"] == role]["description"].values)

    # Tokenize the combined string using spaCy
    doc = nlp(descriptions)

    # Create a counter object to count the frequency of each token
    token_freq = Counter(token.lemma_ for token in doc if not token.is_stop and not token.is_punct and not token.is_space)

    # Convert abbreviations back into full phrases for readability
    token_freq = {key: abbreviations.get(key, key) + f" ({count})" for key, count in token_freq.items()}

    # Store the token counts for the current role in the dictionary
    token_counts_by_role[role] = token_freq

    # Print the top 10 most common tokens for the current role
    print(f"Top 10 tokens for {role}:")
    for token, count in token_freq.most_common(10):
        print(f"{token}: {count}")
    print()


Top 10 tokens for data scientist:


AttributeError: 'dict' object has no attribute 'most_common'