## Imports and initializing

In [1]:
#Important pip installs (Once per hardware is generally enough) 
#Preferred to be done in an external terminal in a specific virtual environment.
#However, if not possible, run this cell to download it.
import sys
#!{sys.executable} -m pip install --upgrade pip
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install pandas

## If an error occurs (e.g. a C++ build error), open Jupyter Notebook through the Anaconda Prompt.
## Installation: https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html
#!{sys.executable} -m pip install scattertext  


#!{sys.executable} -m pip install nltk
#!{sys.executable} -m pip install regex
#!{sys.executable} -m pip install requests

In [2]:
#Typical data science imports
import numpy as np
import pandas as pd

In [3]:
#Specific NLP imports

import scattertext as st


#Stopwords to remove from data to improve NLP
import requests    
#Take a seemingly comprehensive stopword list from the internet.
# Download the stop-word list. The timeout keeps this cell from hanging forever
# on a dead connection, and raise_for_status() prevents an HTTP error page from
# being silently parsed as if it were the stop-word list.
stopwords_response = requests.get("https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt", timeout=30)
stopwords_response.raise_for_status()
stopwords_list = stopwords_response.content
# Names assigned at module level are already global, so no `global` statement is needed.
# The downloaded bytes are decoded and split on commas into a list of words.
stopwords = stopwords_list.decode().split(',')


In [4]:
#Regex for text processing
import re
def remove_special_characters(word_list,remove_stop_words = True):
    """Lower-case a text and reduce it to space-separated word tokens.

    A token is a run of word characters (letters, digits, underscore) that
    neither starts nor ends with an underscore. Everything else (punctuation,
    whitespace, ...) acts as a separator and is discarded.

    Args:
        word_list (str): Text to clean (a string, despite the parameter name).
        remove_stop_words (bool): When truthy, tokens contained in the
            module-level ``stopwords`` collection are dropped.
    Returns:
        str: The remaining tokens joined by single spaces ('' if none remain).
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    tokens = re.findall(r'(?!_)\w+(?<!_)', word_list.lower())
    if remove_stop_words:
        # One set per call makes every membership test O(1) instead of a scan
        # over the whole stop-word list for each token.
        stop_word_set = set(stopwords)
        tokens = [token for token in tokens if token not in stop_word_set]
    return " ".join(tokens)

In [5]:
#Extra for specific situations
from scipy.stats import hmean
def compare_df_category_count(word_df_list):
    """Count how often each index label occurs across several data frames.

    Args:
        word_df_list (list of pandas.DataFrame): Frames to stack on top of
            each other; the rows are then grouped by their index labels.
    Returns:
        pandas.DataFrame: Group sizes in a 'size' column, ordered from the
        largest group to the smallest.
    """
    stacked_df = pd.concat(word_df_list)
    # The index labels themselves serve as the grouping keys.
    index_labels = stacked_df.index.tolist()
    label_groups = stacked_df.groupby(index_labels, as_index=False)
    label_counts = label_groups.size()
    return label_counts.sort_values(by='size', ascending = False)

## Functions

### 1. Functions for reading csv files to get the F-score

In [6]:
def csv_string_list_to_df_list(csv_list,drop_duplicates=True,drop_rows_to_lowest = True):
    """Read one csv file per category (role) and prepare it for the F-score model.

    Each file must contain a 'requirements' column. The category name is the
    part of the file name before its first '.', e.g. fullstack.csv => fullstack.

    Args:
        csv_list (list of str): .csv file names including the extension
            (e.g. data_scientist.csv).
        drop_duplicates (bool): When truthy, keep only the first row for every
            distinct 'requirements' text within a category.
        drop_rows_to_lowest (bool): When truthy, truncate every category to the
            row count of the smallest category so that no category dominates.
    Returns:
        list of pandas.DataFrame: One frame per file, in input order, holding
        only the columns 'title' (category name) and 'parsed' (the cleaned
        'requirements' text after scattertext's whitespace tokenizer).
        An empty ``csv_list`` yields an empty list.
    """

    #Read each csv file and prune it.
    df_list = []
    for csv in csv_list:
        category_df = pd.read_csv(csv)
        category_name = csv.split('.')[0]      #fullstack.csv => fullstack
        category_df['title'] = category_name
        category_df = category_df.dropna(subset=['requirements'])   #rows without text are useless
        if drop_duplicates:
            #'title' is constant within a file, so this keeps the first row per requirements text.
            category_df = category_df.drop_duplicates(subset=['title','requirements'])
        df_list.append(category_df)

    #To remove participation bias, all roles should have the same amount of data:
    #cut every role down to the row count of the smallest role.
    #The `df_list` guard avoids an IndexError when no files were given.
    if drop_rows_to_lowest and df_list:
        lowest_len = min(len(df) for df in df_list)
        df_list = [df.iloc[:lowest_len] for df in df_list]

    #Build the parsed column from 'requirements'. assign() returns a new frame,
    #so the truncated slices above are never written to (no SettingWithCopyWarning).
    new_df = []
    for df in df_list:
        parsed = df['requirements'].apply(remove_special_characters)
        parsed = parsed.apply(st.whitespace_nlp_with_sentences)   #NLP tokenizing
        new_df.append(df.assign(parsed=parsed)[['title','parsed']])
    return new_df
    

In [7]:
def get_f_score(pruned_df_list,beta = 1):
    """Build a per-term table of category frequencies and F-scores.

    For every category the F-score of a term combines
      precision = term count in the category / term count over all categories
      frequency = term count in the category / total term count of the category
    as (1 + beta^2) * precision * frequency / (beta^2 * precision + frequency).
    beta < 1 favors precision, beta > 1 favors frequency, beta = 1 weighs both
    equally and beta = 0 makes the F-score equal to the precision.

    Args:
        pruned_df_list (list of pandas.DataFrame): Frames with the columns
            'title' and 'parsed' (from csv_string_list_to_df_list).
        beta (float): Weighting between precision and frequency.
    Returns:
        pandas.DataFrame: One row per term with a '<title> freq' and a
        '<title> f-score' column for every category (roles*2 columns).
    """
    merged_df = pd.concat(pruned_df_list)

    # Unigram corpus: terms are compared word by word, categories come from 'title'.
    # (The corpus is used in full; no AssociationCompactor limit is applied.)
    corpus_builder = st.CorpusFromParsedDocuments(
        merged_df,
        category_col="title",
        parsed_col="parsed",
    )
    unigram_corpus = corpus_builder.build().get_unigram_corpus()

    term_freq_df = unigram_corpus.get_term_freq_df()   # word frequency per category
    result_df = term_freq_df.copy()                    # frequency columns first, f-scores appended
    total_per_term = term_freq_df.sum(axis=1)          # term count over all categories
    beta_squared = beta**2

    for category_df in pruned_df_list:
        category = category_df['title'].iloc[0]
        category_freq = term_freq_df[category + ' freq']
        precision = category_freq/total_per_term
        frequency = category_freq/category_freq.sum()
        result_df[category + ' f-score'] = (1+beta_squared) * precision * frequency / (beta_squared * precision + frequency)

    # NaN comes from 0/0, i.e. when precision and frequency are both 0.
    return result_df.fillna(0)

### 2. Functions for calculating role based on F-score

In [8]:

def calculate_role(text,f_score_list,diminishing_repetition_amount = 0):
    """Score a text against every role by summing squared per-word F-scores.

    The text is cleaned with remove_special_characters; every remaining word
    that occurs in the index of ``f_score_list`` adds its squared F-score to
    each role. Squaring rewards high-scoring/confident words.

    Args:
        text (str): Text to calculate the role for.
        f_score_list (pandas.DataFrame): Table from get_f_score; only its
            columns whose name contains 'f-score' are used.
        diminishing_repetition_amount (float): Penalty for repeated words. The
            k-th occurrence of a word (k = 1, 2, ...) contributes its squared
            F-score divided by 1 + (k-1)*diminishing_repetition_amount, so the
            default 0 counts every repetition in full.
    Returns:
        pandas.DataFrame: One row per '<role> f-score' column with the columns
        'Squared sum' (accumulated score) and 'Percentage' (share of the total
        score, rounded to 2 decimals; 0.0 everywhere if no word matched).
    """
    squared_f_scores = f_score_list.filter(regex='f-score')**2   # no frequency columns
    cleaned_text = remove_special_characters(text)

    occurrence_divisor = {}    # word -> divisor applied to its current occurrence
    role_totals = pd.Series(0.0, index=squared_f_scores.columns)
    for word in cleaned_text.split(' '):
        if word not in squared_f_scores.index:
            continue
        if word in occurrence_divisor:
            # Repeated word: grow the divisor (diminishing repetition).
            occurrence_divisor[word] += diminishing_repetition_amount
        else:
            occurrence_divisor[word] = 1
        role_totals = role_totals + squared_f_scores.loc[word]/occurrence_divisor[word]

    score_df = role_totals.to_frame(name='Squared sum')
    total_score = score_df['Squared sum'].sum()
    if total_score == 0:
        # No scored word in the text: report 0 % instead of the NaN that 0/0 would give.
        score_df['Percentage'] = 0.0
    else:
        score_df['Percentage'] = (100*score_df['Squared sum']/total_score).round(2)

    return score_df

## An example of usage

### Initializing f-score list

In [9]:
#We have 6 different roles we want to analyze
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']

In [10]:
#Start with making dataframes of each role by reading the csv files.

df_list = csv_string_list_to_df_list(csv_list)

In [11]:
#Calculate the frequency and f-score of this df_list
#Lower beta => precision favored, higher beta => frequency favored. beta = 0 => f-score = precision.
#For this data set, beta = 0.05 is good to find category specific words. 
#Supervised learning will optimize this later.
f_score_list = get_f_score(df_list,beta = 0.05)

In [12]:
#What does it look like?
f_score_list

Unnamed: 0_level_0,fullstack freq,data_engineer freq,data_analyst freq,data_scientist freq,ml_engineer freq,devops_engineer freq,fullstack f-score,data_engineer f-score,data_analyst f-score,data_scientist f-score,ml_engineer f-score,devops_engineer f-score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
computer,34,36,28,72,73,25,0.120420,0.125590,0.097323,0.248929,0.257969,0.088372
science,30,39,36,115,54,21,0.097000,0.124371,0.114419,0.363726,0.174243,0.067780
technical,22,27,22,51,14,28,0.123178,0.147617,0.119591,0.274934,0.078108,0.156290
field,11,27,29,59,25,10,0.062638,0.150073,0.160250,0.323277,0.141846,0.056766
relevant,10,11,13,20,7,7,0.120710,0.126227,0.147393,0.222871,0.083854,0.083939
...,...,...,...,...,...,...,...,...,...,...,...,...
acquiring,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.060382
isolate,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.060382
defects,0,0,0,0,0,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.113904
reproduce,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.060382


In [13]:
#Lets order by fullstack f-score
f_score_list.sort_values(by="fullstack f-score",ascending = False)

Unnamed: 0_level_0,fullstack freq,data_engineer freq,data_analyst freq,data_scientist freq,ml_engineer freq,devops_engineer freq,fullstack f-score,data_engineer f-score,data_analyst f-score,data_scientist f-score,ml_engineer f-score,devops_engineer f-score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
react,44,1,0,0,4,1,0.678094,0.014453,0.000000,0.000000,0.061047,0.015281
javascript,49,5,5,4,3,10,0.539511,0.052563,0.051986,0.040934,0.032802,0.109440
node,27,1,0,0,3,6,0.520029,0.017787,0.000000,0.000000,0.057082,0.114348
competent,17,0,0,0,0,1,0.515658,0.000000,0.000000,0.000000,0.000000,0.029834
html5,15,0,0,0,0,0,0.500416,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
extends,0,1,0,1,0,0,0.000000,0.046928,0.000000,0.041836,0.000000,0.000000
groups,0,2,2,0,1,1,0.000000,0.079054,0.075889,0.000000,0.046225,0.046407
persuade,0,1,0,0,0,0,0.000000,0.049233,0.000000,0.000000,0.000000,0.000000
normalization,0,1,1,0,0,0,0.000000,0.046928,0.044715,0.000000,0.000000,0.000000


In [14]:
#If you want to know the f-score of a specific word
f_score_list.loc['git']
#E.g. you know there is a new term that has not become widely known yet

fullstack freq             21.000000
data_engineer freq          5.000000
data_analyst freq           0.000000
data_scientist freq         3.000000
ml_engineer freq           11.000000
devops_engineer freq       35.000000
fullstack f-score           0.233787
data_engineer f-score       0.053120
data_analyst f-score        0.000000
data_scientist f-score      0.031017
ml_engineer f-score         0.121599
devops_engineer f-score     0.387269
Name: git, dtype: float64

### Using the f-score list to make predictions

In [15]:
##With this f_score_list, many calculations can be done
##One example could be to calculate which role a CV fits best.

In [16]:
#Lets say that a consultant has this CV
cv_skills = "Python Java, React.js, SQL, NoSQL, Docker, TensorFlow, Machine Learning, Git"


In [17]:
#Calculate the score
calculate_role(cv_skills,f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
ml_engineer f-score,1.035395,39.6
fullstack f-score,0.60754,23.24
devops_engineer f-score,0.354453,13.56
data_scientist f-score,0.304304,11.64
data_engineer f-score,0.228479,8.74
data_analyst f-score,0.084329,3.23


In [18]:
##This works with any text

In [19]:
#Lets say we have a consultant description (Fullstack cv)
cv_description_fullstack = "Christine is an enthousiastic and curious creator who works well with any team. She keeps her eye on the project as a whole with a focus on UX. With over 10 years experience in design she has the ability to code quick and goodlooking solutions, especially in the frontend. Her knowledge of backend allows her to jump in where needed to bring the product to completion. She is often described as creative, encouraging, capable and kind."
cv_description_data_scientist = "Martin is a Data Scientist with a great passion for learning new things and tackling different problems in tech. His latest role was as a Data Scientist for Nira Dynamics where he had a leading technical role in development of new data-driven products within Predictive Analytics, going from vague ideas all the way to a product. Martin is also often working on his own projects parallel to his work at HiQ, these projects are most often in his favorite fields of A.I and VR. Examples are Generative A.I for music & images and VR worlds. He has previously participated and arranged a lot in events for game development and hackathons. Martin keeps pushing his knowledge further all the time through new certificates, reading technical books and working on different projects. He has been programming since 2008 where he began with physics simulations and mathematical help tools, he also studied a bachelor in applied physics and electrotechnology. Martin is appreciated by his colleagues for his problem solving skills, being both precise and creative, while also being humble and open towards other peoples solutions. He has a very positive and helpful attitude which makes him work well with others."

In [66]:
#Why this works will be shown below with visuals
calculate_role(cv_description_fullstack,f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
fullstack f-score,0.727313,30.84
data_analyst f-score,0.430808,18.26
data_scientist f-score,0.379651,16.1
data_engineer f-score,0.305731,12.96
devops_engineer f-score,0.273835,11.61
ml_engineer f-score,0.241317,10.23


In [21]:
#Lets copy different job posts and try applying the method on it
#The variables are named after what the post looked for. 
#e.g. case_data_engineer was an announcement searching for a data engineer
case_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
case_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
case_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. 
What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to 
cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
case_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. 
product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. 
You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
case_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. 
What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
case_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. 
You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

In [76]:
#Try it yourself by replacing the first argument with another case_* variable name
calculate_role(case_fullstack,f_score_list).sort_values(by='Percentage',ascending = False)

#It seems to find the correct answer for all of these cases.
#However, some are very close.
#case_data_engineer might actually need a data_analyst and a data_scientist.
#This is where the definitions collide. As the web scraped data is from ...
#NOTE(review): the sentence above breaks off in the original notebook; complete it.

Unnamed: 0,Squared sum,Percentage
fullstack f-score,4.765296,31.62
devops_engineer f-score,2.504727,16.62
data_scientist f-score,2.218091,14.72
data_engineer f-score,2.040528,13.54
data_analyst f-score,1.928856,12.8
ml_engineer f-score,1.613045,10.7


### Improve F-score calculation

### Visual improvement

In [23]:
# Adding color to calculate role
def colored(r, g, b, text):
    """Wrap ``text`` in a 24-bit ANSI foreground colour escape sequence.

        Args:
            r (int): red component (expected 0-255, not validated)
            g (int): green component (expected 0-255, not validated)
            b (int): blue component (expected 0-255, not validated)
            text (str): text to colour
        Returns:
            str: the colour escape for (r, g, b), then ``text`` followed by one space,
                 then an escape that switches the foreground to white (255, 255, 255)
    """
    set_colour = "\033[38;2;" + ";".join(str(channel) for channel in (r, g, b)) + "m"
    back_to_white = "\033[38;2;255;255;255m"
    return set_colour + str(text) + " " + back_to_white

#Let's expand on the calculate_role method and add visuals
#You can compare 2 categories
def _score_and_intensity(word, category_scores, brightness):
    """Look up `word` in one category's f-score Series.

        Returns:
            tuple | None: (score, intensity) when the word is in the index with a score > 0,
            where intensity is score*brightness capped at 1; None otherwise.
    """
    if word not in category_scores.index:
        return None
    score = category_scores.loc[word]
    if not score > 0:
        return None
    scaled = score*brightness
    return score, (scaled if scaled <= 1 else 1)


def calculate_role_visual(text,f_score_list,diminishing_repetition = False, category1 = 'none',brightness = 1,category2 = 'none'):
    """Calculate_role but prints the text and colors it depending on relevancy to category1 (and category2 if not 'none')
        Args:
            text (str): Text to calculate role.
            f_score_list (pandas.DataFrame): word-indexed table with one '<category> f-score' column per category
            diminishing_repetition: passed unchanged to calculate_role (see that function for its meaning)
            category1 (str): name of category to be visually shown Red; 'none' disables all visuals
            brightness (int): color sharpness multiplier (score*brightness is capped at 1)
            category2 (str): name of category to be visually shown Blue; 'none' shows category1 only
        Returns:
            pandas.DataFrame: the score table returned by calculate_role, unchanged
        Side effects (only when category1 != 'none'):
            prints the text (stop words kept) with one ANSI color per word and
            displays a table of the words that scored > 0 in the chosen categories.
    """
    score_df = calculate_role(text,f_score_list,diminishing_repetition)

    # Without a primary category there is nothing to visualise.
    if category1 == 'none':
        return score_df

    # Resolve the category columns once instead of on every word.
    compare = category2 != 'none'
    column1 = category1 + ' f-score'
    scores1 = f_score_list[column1]
    if compare:
        column2 = category2 + ' f-score'
        scores2 = f_score_list[column2]

    ##here comes the visual
    # red are words that category1 picks up
    # blue are words that category2 picks up
    # green is when no category is picked up
    processed_text = remove_special_characters(text,remove_stop_words = False)
    colored_list = []
    word_list1 = {}
    word_list2 = {}

    for word in processed_text.split():
        red = 0
        blue = 0
        green = 150
        hit1 = _score_and_intensity(word, scores1, brightness)
        if hit1 is not None:
            word_list1[word], red = hit1
            green = 0
        if compare:
            hit2 = _score_and_intensity(word, scores2, brightness)
            if hit2 is not None:
                word_list2[word], blue = hit2
                green = 0
        colored_list.append(colored(int(red*255),green,int(blue*255),word))
    print("".join(colored_list))

    df1 = pd.DataFrame.from_dict(word_list1, orient='index',columns=[column1])
    if not compare:
        display(df1.transpose().fillna(0))
        return score_df
    df2 = pd.DataFrame.from_dict(word_list2, orient='index',columns=[column2])
    # Words found in only one category get NaN in the other after concat; show those as 0.
    display(pd.concat([df1, df2], axis=1).transpose().fillna(0))
    return score_df

#### With this function, you can now see what words are important

In [24]:
#f score list initialization
#One CSV per role is read into a dataframe, then get_f_score builds f_score_list from those dataframes.
#NOTE(review): beta = 0.05 is handed straight to get_f_score; its exact effect is defined there.
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']
df_list = csv_string_list_to_df_list(csv_list)
f_score_list = get_f_score(df_list,beta = 0.05)

In [75]:
    # red are words that category1 ('ml_engineer') picks up; brightness = 2 multiplies each score by 2 before it is capped
    # green is when no category is picked up
    # interesting to see that 'machine' and 'learning' are both so red, even though f_score_list only scores single words
calculate_role_visual(case_ml_engineer,f_score_list,0,'ml_engineer',2).sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mwhat [38;2;255;255;255m[38;2;0;150;0mif [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;13;0;0mjob [38;2;255;255;255m[38;2;0;150;0mhad [38;2;255;255;255m[38;2;0;150;0man [38;2;255;255;255m[38;2;37;0;0mimpact [38;2;255;255;255m[38;2;0;150;0mon [38;2;255;255;255m[38;2;0;150;0mshaping [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;23;0;0mfuture [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0murban [38;2;255;255;255m[38;2;0;150;0mmobility [38;2;255;255;255m[38;2;0;150;0mimagine [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;23;0;0mexperiments [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;40;0;0manalysis [38;2;255;255;255m[38;2;21;0;0mimproving [38;2;255;255;255m[38;2;0;150;0msustainable [38;2;255;255;255m[38;2;0;150;0mlast [38;2;255;255;255m[38;2;0;150;0mmile [38;2;255;255;255m[38;2;49;0;0mtransportation [38;2;255;255;255m[38;2;0;150;0mfor [38;2;255;255;2

Unnamed: 0,job,impact,future,experiments,analysis,improving,transportation,industry,team,latest,...,predictive,modeling,optimization,techniques,work,days,professional,proficiency,english,required
ml_engineer f-score,0.026845,0.073917,0.046225,0.046225,0.079754,0.042322,0.096918,0.151768,0.112594,0.126966,...,0.098556,0.156803,0.242747,0.177378,0.146407,0.096918,0.153827,0.191072,0.127936,0.054531


Unnamed: 0,Squared sum,Percentage
ml_engineer f-score,4.758132,27.71
data_scientist f-score,4.040579,23.53
data_engineer f-score,2.380703,13.86
data_analyst f-score,2.086215,12.15
devops_engineer f-score,2.020177,11.76
fullstack f-score,1.886101,10.98


#### You can also compare two categories with each other

In [26]:
3

3

In [73]:
    # red are words that category1 ('data_analyst') picks up
    # blue are words that category2 ('data_engineer') picks up
    # words picked up by both categories get a mix of red and blue
    # green is when no category is picked up
# interesting that glue, ec2, lambda, s3, redshift, postgresql are all picked up as mostly data_engineer
calculate_role_visual(case_data_engineer,f_score_list,0.5,'data_analyst',3,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;89;0;116mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;173;0;22mimplement [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;29mextraction [38;2;255;255;255m[38;2;246;0;68mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;26;0;81mexisting [38;2;255;255;255m[38;2;95;0;194msystems [38;2;255;255;255m[38;2;60;0;31mupdate [38;2;255;255;255m[38;2;74;0;95mcurrent [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;68mlake [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;236mstream [38;2;255;255;255m[38;2;89;0;116mdesign [38;2;255;255;255m[38;2;143;0;21mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;98mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;26;0;54mtyp

Unnamed: 0,design,implement,data,process,existing,systems,update,current,making,types,...,stream,handling,streams,interface,glue,ec2,lambda,s3,devops,flows
data_analyst f-score,0.117195,0.226426,0.296161,0.322347,0.034075,0.124621,0.078875,0.096813,0.187533,0.034075,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.152775,0.029174,0.288689,0.089293,0.106038,0.254181,0.041149,0.124187,0.02757,0.070692,...,0.309772,0.128732,0.049233,0.04483,0.171643,0.210341,0.190138,0.265197,0.044647,0.093856


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,1.844311,22.07
data_analyst f-score,1.841247,22.04
devops_engineer f-score,1.642351,19.66
data_scientist f-score,1.467783,17.57
fullstack f-score,0.969005,11.6
ml_engineer f-score,0.590344,7.07


In [28]:
#As you can see, the visuals are very muddled by words that also score in other categories, e.g. 'devops'.
#This is why we should only include words that are distinct to each category;
#this will make the visuals much clearer.

## Improving F-score calculations and visuals


### Distinct F-score calculating with dictionaries to get distinct f-scores

In [29]:


def calculate_role_distinct(text,f_score_dict,diminishing_repetition_amount = 0):
    """Gives each word in text a score based on f_score_dict and
    returns a dataframe with the summed score for each role.
    Different from calculate_role, calculate_role_distinct adds only to its own category.
    It does this by separating each category in a dictionary.
    For example, the top 50 f-scoring words in each category get added to a separate key in a dictionary (f_score_dict).
    If for example 'Data' were to be in the top 50 words in data_scientist, but not in fullstack,
    then 'Data' would only give points to data_scientist.
    Reason to use this: to get clearer visuals (each word is more likely connected to only one category).

    The score is squared to reward high-scoring/confident words.
    Args:
        text (str): Text to calculate role.
        f_score_dict (dict<dataframe>): category name -> word-indexed dataframe that has a '<category> f-score' column
        diminishing_repetition_amount (float): f_score will be divided by 1+n*diminishing_repetition_amount where n = times word appeared before
    Returns:
        pandas.DataFrame: one row per '<category> f-score' with columns 'Squared Sum' and 'Percentage'.
            'Percentage' is each role's share of the total, rounded to 2 decimals,
            or 0.0 for every role when no word scored at all (instead of NaN from 0/0).
    """
    # Stop words are removed here, so they never contribute to any role.
    text = remove_special_characters(text)
    score_dict = {df_name + ' f-score': 0 for df_name in f_score_dict}
    # word -> divisor used for that word's score at its current occurrence
    # (1 the first time, then + diminishing_repetition_amount per repetition)
    word_dict = {}
    for w in text.split(' '):
        if w in word_dict:
            word_dict[w] += diminishing_repetition_amount
        else:
            word_dict[w] = 1
        for df_name, category_df in f_score_dict.items():
            column = df_name + ' f-score'
            if w in category_df.index:
                # Single label-based lookup instead of chained .loc[w][column]
                score = category_df.loc[w, column]**2
                score_dict[column] += score/word_dict[w]
    score_df = pd.DataFrame.from_dict(score_dict,orient='index',columns = ['Squared Sum'])
    total = score_df['Squared Sum'].sum()
    if total == 0:
        # No word matched any category: report 0 % everywhere rather than 0/0.
        score_df['Percentage'] = 0.0
    else:
        score_df['Percentage'] = (100*score_df['Squared Sum']/total).round(2)
    return score_df


In [30]:
def colored(r, g, b, text):
    """Return ``text`` (plus a trailing space) in the 24-bit ANSI foreground colour (r, g, b).

    The returned string ends with an escape that sets the foreground to white (255, 255, 255).
    Same behaviour as the ``colored`` helper defined earlier in this notebook.
    """
    prefix = "\033[38;2;{0};{1};{2}m".format(r, g, b)
    suffix = " \033[38;2;255;255;255m"
    return "{}{}{}".format(prefix, text, suffix)


def calculate_role_distinct_visual(text,f_score_dict,diminishing_repetition_amount = 0, category1 = 'none',brightness = 1,category2 = 'none'):
    """Calculate_role_visual but using calculate_role_distinct method instead.
        Args:
            text (str): Text to calculate role.
            f_score_dict (dict<dataframe>): category name -> word-indexed dataframe that has a '<category> f-score' column
            diminishing_repetition_amount (float): passed to calculate_role_distinct; f_score will be divided by 1+n*diminishing_repetition_amount where n = times word appeared before
            category1 (str): name of category to be visually shown Red; 'none' disables all visuals
            brightness (int): color sharpness (score*brightness is capped at 1)
            category2 (str): name of category to be visually shown Blue and compared with category1; 'none' shows category1 only
        Returns:
            pandas.DataFrame: the score table returned by calculate_role_distinct, unchanged
        Side effects (only when category1 != 'none'):
            prints the text (stop words kept) with one ANSI color per word and
            displays a table of the words found in the chosen categories.
    """
    score_df = calculate_role_distinct(text,f_score_dict,diminishing_repetition_amount)

    # Without a primary category there is nothing to visualise.
    if category1 == 'none':
        return score_df

    # Resolve each category's score column once; the lookups do not depend on the word.
    compare = category2 != 'none'
    column1 = category1 + ' f-score'
    scores1 = f_score_dict[category1][column1]
    if compare:
        column2 = category2 + ' f-score'
        scores2 = f_score_dict[category2][column2]

    ##here comes the visuals
    # red are words that category1 picks up
    # blue are words that category2 picks up
    # green is when no category is picked up
    processed_text = remove_special_characters(text,remove_stop_words = False)
    colored_list = []
    word_list1 = {}
    word_list2 = {}

    for word in processed_text.split():
        red = 0
        blue = 0
        green = 150
        # One check per category per word is enough; nothing here depends on the other categories.
        if word in scores1.index:
            green = 0
            score1 = scores1.loc[word]
            word_list1[word] = score1
            red = score1*brightness if score1*brightness <= 1 else 1
        if compare and word in scores2.index:
            green = 0
            score2 = scores2.loc[word]
            word_list2[word] = score2
            blue = score2*brightness if score2*brightness <= 1 else 1
        colored_list.append(colored(int(red*255),green,int(blue*255),word))
    print("".join(colored_list))

    df1 = pd.DataFrame.from_dict(word_list1, orient='index',columns=[column1])
    if not compare:
        display(df1.transpose())
        return score_df
    df2 = pd.DataFrame.from_dict(word_list2, orient='index',columns=[column2])
    # Words found in only one category get NaN in the other after concat; show those as 0.
    display(pd.concat([df1, df2], axis=1).transpose().fillna(0))
    return score_df

#### Initializing

In [31]:
#Rebuild the F-score table from the role CSVs (same three steps as the earlier initialization cell)
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']
df_list = csv_string_list_to_df_list(csv_list)
f_score_list = get_f_score(df_list,beta = 0.05)

In [32]:
#Let's test by only using the 300 highest scoring words from each category (see max_word_count below).
max_word_count = 300  
chosen_f_score_list = f_score_list

f_score_dict={}
f_score_dict['fullstack'] = chosen_f_score_list.sort_values(by="fullstack f-score",ascending = False)[:max_word_count]
f_score_dict['data_engineer'] = chosen_f_score_list.sort_values(by="data_engineer f-score",ascending = False)[:max_word_count]
f_score_dict['data_analyst'] = chosen_f_score_list.sort_values(by="data_analyst f-score",ascending = False)[:max_word_count]
f_score_dict['data_scientist'] = chosen_f_score_list.sort_values(by="data_scientist f-score",ascending = False)[:max_word_count]
f_score_dict['ml_engineer'] = chosen_f_score_list.sort_values(by="ml_engineer f-score",ascending = False)[:max_word_count]
f_score_dict['devops_engineer'] = chosen_f_score_list.sort_values(by="devops_engineer f-score",ascending = False)[:max_word_count]


##### this initialisation can be shortened

In [33]:
def initialize_f_score_dict(chosen_f_score_list,max_word_count):
    """Build the per-category lookup of top-scoring words.

    Parameters
    ----------
    chosen_f_score_list : pandas.DataFrame
        Score table holding one ``'<category> f-score'`` column per category.
        Columns whose name does not contain ``'f-score'`` are ignored when
        choosing categories but are kept in the returned frames.
    max_word_count : int
        Number of highest-scoring rows to keep for each category.

    Returns
    -------
    dict
        Maps each category name to ``chosen_f_score_list`` sorted descending
        by that category's f-score column and cut to the first
        ``max_word_count`` rows. ``key + ' f-score'`` is the column the entry
        was sorted by.
    """
    suffix = ' f-score'
    f_score_dict = {}
    # sort_values returns a new frame, so the caller's table is never modified
    # and no defensive copy is needed.
    for column_name in chosen_f_score_list.filter(regex='f-score').columns:
        if column_name.endswith(suffix) and len(column_name) > len(suffix):
            # Strip the suffix rather than taking the first space-separated
            # token: category names that contain spaces would otherwise be
            # truncated and overwrite each other.
            category = column_name[:-len(suffix)]
        else:
            # Any other column that merely contains 'f-score' keeps the
            # previous first-token key.
            category = column_name.split(' ')[0]
        f_score_dict[category] = chosen_f_score_list.sort_values(
            by=column_name, ascending=False
        )[:max_word_count]
    return f_score_dict

In [34]:
# Build the dictionary through the helper, keeping the 300 highest-scoring
# words per category.
f_score_dict = initialize_f_score_dict(
    chosen_f_score_list=f_score_list,
    max_word_count=300,
)

#### Calculating

In [35]:
case_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
case_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
case_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. 
What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to 
cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
case_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. 
product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. 
You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
case_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. 
What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
case_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. 
You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

In [36]:
# Score the data-engineer job ad against the top-word dictionary and list the
# result with the highest 'Percentage' first.
# NOTE(review): the parameter names behind the positional 0.5 and 2 are not
# visible in this part of the notebook - confirm against the function signature.
(
    calculate_role_distinct_visual(
        case_data_engineer,
        f_score_dict,
        0.5,
        'data_engineer',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;77;0;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;0;115mimplement [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;0;0;164mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;129;0;0msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mlake [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;157;0;0mstream [38;2;255;255;255m[38;2;77;0;0mdesign [38;2;255;255;255m[38;2;0;0;95mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;65;0;0mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes [38

Unnamed: 0,design,data,systems,stream,handling,implementation,development,skills,cloud,technical,...,support,user,tableau,power,3,personal,analytical,independently,complex,paced
data_engineer f-score,0.152775,0.288689,0.254181,0.309772,0.128732,0.173862,0.177485,0.14221,0.281908,0.147617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_analyst f-score,0.0,0.296161,0.0,0.0,0.0,0.0,0.0,0.242059,0.0,0.0,...,0.255675,0.136474,0.413553,0.209979,0.145547,0.24279,0.354525,0.175467,0.201563,0.159064


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,1.658586,22.93
data_analyst f-score,1.64068,22.68
devops_engineer f-score,1.532254,21.18
data_scientist f-score,1.224687,16.93
fullstack f-score,0.824537,11.4
ml_engineer f-score,0.353134,4.88


#### Compare with the normal (non-distinct) calculation

In [37]:
# Same job ad, scored against the full f-score table instead of the top-word
# dictionary, and again listed with the highest 'Percentage' first.
# NOTE(review): the parameter names behind the positional True and 2 are not
# visible in this part of the notebook - confirm against the function signature.
(
    calculate_role_visual(
        case_data_engineer,
        f_score_list,
        True,
        'data_engineer',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;77;0;59mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;14;0;115mimplement [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;19;0;0mextraction [38;2;255;255;255m[38;2;45;0;164mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;54;0;17mexisting [38;2;255;255;255m[38;2;129;0;63msystems [38;2;255;255;255m[38;2;20;0;40mupdate [38;2;255;255;255m[38;2;63;0;49mcurrent [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;45;0;0mlake [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;157;0;0mstream [38;2;255;255;255m[38;2;77;0;59mdesign [38;2;255;255;255m[38;2;14;0;95mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;65;0;0mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;36;0;17mtypes 

Unnamed: 0,design,implement,data,extraction,process,existing,systems,update,current,lake,...,collaboratively,complex,business,requirements,paced,fluent,english,graphical,mindset,translate
data_engineer f-score,0.152775,0.029174,0.288689,0.038028,0.089293,0.106038,0.254181,0.041149,0.124187,0.089659,...,0.079054,0.092514,0.204626,0.238816,0.060927,0.090146,0.166171,0.0,0.0,0.0
data_analyst f-score,0.117195,0.226426,0.296161,0.0,0.322347,0.034075,0.124621,0.078875,0.096813,0.0,...,0.037945,0.201563,0.323216,0.124586,0.159064,0.058251,0.059478,0.089429,0.032955,0.037945


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,1.708606,22.01
data_analyst f-score,1.697068,21.86
devops_engineer f-score,1.569829,20.22
data_scientist f-score,1.338687,17.24
fullstack f-score,0.903769,11.64
ml_engineer f-score,0.546502,7.04


#### You can now see how much clearer the distinct version is. It avoids words that have little to no relevance to the desired categories.

### Precision showcase - show all different cases

In [38]:
# Reload the per-category CSV files and rebuild the top-300 dictionary.
# NOTE(review): f_score_list is not recomputed from the freshly loaded df_list
# in this cell, so f_score_dict is built from the f_score_list created earlier
# in the notebook - confirm that this is intended.
csv_list = [
    'fullstack.csv',
    'data_engineer.csv',
    'data_analyst.csv',
    'data_scientist.csv',
    'ml_engineer.csv',
    'devops_engineer.csv',
]
df_list = csv_string_list_to_df_list(csv_list)
f_score_dict = initialize_f_score_dict(
    chosen_f_score_list=f_score_list,
    max_word_count=300,
)

### Different cases to compare

In [39]:
cv_description_fullstack = "Christine is an enthousiastic and curious creator who works well with any team. She keeps her eye on the project as a whole with a focus on UX. With over 10 years experience in design she has the ability to code quick and goodlooking solutions, especially in the frontend. Her knowledge of backend allows her to jump in where needed to bring the product to completion. She is often described as creative, encouraging, capable and kind."
cv_description_data_scientist = "Martin is a Data Scientist with a great passion for learning new things and tackling different problems in tech. His latest role was as a Data Scientist for Nira Dynamics where he had a leading technical role in development of new data-driven products within Predictive Analytics, going from vague ideas all the way to a product. Martin is also often working on his own projects parallel to his work at HiQ, these projects are most often in his favorite fields of A.I and VR. Examples are Generative A.I for music & images and VR worlds. He has previously participated and arranged a lot in events for game development and hackathons. Martin keeps pushing his knowledge further all the time through new certificates, reading technical books and working on different projects. He has been programming since 2008 where he began with physics simulations and mathematical help tools, he also studied a bachelor in applied physics and electrotechnology. Martin is appreciated by his colleagues for his problem solving skills, being both precise and creative, while also being humble and open towards other peoples solutions. He has a very positive and helpful attitude which makes him work well with others."
case_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
case_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
case_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. 
What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to 
cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
case_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. 
product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. 
You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
case_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. 
What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
case_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. 
You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."


In [62]:
# Run the role comparison on the fullstack CV text, passing 'fullstack' and
# 'data_analyst' as the role arguments, and show the returned table ordered
# by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        cv_description_fullstack,
        f_score_dict,
        0.3,
        'fullstack',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;0;150;0mchristine [38;2;255;255;255m[38;2;0;150;0mis [38;2;255;255;255m[38;2;0;150;0man [38;2;255;255;255m[38;2;0;150;0menthousiastic [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;150;0mcurious [38;2;255;255;255m[38;2;0;150;0mcreator [38;2;255;255;255m[38;2;0;150;0mwho [38;2;255;255;255m[38;2;0;0;74mworks [38;2;255;255;255m[38;2;0;150;0mwell [38;2;255;255;255m[38;2;0;150;0mwith [38;2;255;255;255m[38;2;0;150;0many [38;2;255;255;255m[38;2;107;0;0mteam [38;2;255;255;255m[38;2;0;150;0mshe [38;2;255;255;255m[38;2;0;150;0mkeeps [38;2;255;255;255m[38;2;0;150;0mher [38;2;255;255;255m[38;2;0;150;0meye [38;2;255;255;255m[38;2;0;150;0mon [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;0;0;101mproject [38;2;255;255;255m[38;2;0;150;0mas [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;0;150;0mwhole [38;2;255;255;255m[38;2;0;150;0mwith [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;0;0;97mfocus 

Unnamed: 0,team,ux,years,experience,design,code,frontend,knowledge,backend,needed,creative,works,project,focus,10,ability,product
fullstack f-score,0.211746,0.384837,0.143509,0.159384,0.293484,0.324989,0.272789,0.124282,0.29058,0.187676,0.158081,0.0,0.0,0.0,0.0,0.0,0.0
data_analyst f-score,0.0,0.0,0.159052,0.138641,0.0,0.0,0.0,0.181101,0.0,0.0,0.0,0.145628,0.199453,0.191439,0.253973,0.229941,0.246459


Unnamed: 0,Squared Sum,Percentage
fullstack f-score,0.665193,33.29
data_analyst f-score,0.353071,17.67
data_scientist f-score,0.309829,15.51
data_engineer f-score,0.267167,13.37
devops_engineer f-score,0.213061,10.66
ml_engineer f-score,0.189822,9.5


In [61]:
# Run the role comparison on the data-scientist CV text, passing
# 'data_scientist' and 'data_analyst' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        cv_description_data_scientist,
        f_score_dict,
        0.3,
        'data_scientist',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;0;150;0mmartin [38;2;255;255;255m[38;2;0;150;0mis [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;134;0;151mdata [38;2;255;255;255m[38;2;102;0;0mscientist [38;2;255;255;255m[38;2;0;150;0mwith [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;0;150;0mgreat [38;2;255;255;255m[38;2;73;0;0mpassion [38;2;255;255;255m[38;2;0;150;0mfor [38;2;255;255;255m[38;2;139;0;0mlearning [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mthings [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;150;0mtackling [38;2;255;255;255m[38;2;0;150;0mdifferent [38;2;255;255;255m[38;2;132;0;0mproblems [38;2;255;255;255m[38;2;0;150;0min [38;2;255;255;255m[38;2;0;150;0mtech [38;2;255;255;255m[38;2;0;150;0mhis [38;2;255;255;255m[38;2;0;150;0mlatest [38;2;255;255;255m[38;2;0;0;81mrole [38;2;255;255;255m[38;2;0;150;0mwas [38;2;255;255;255m[38;2;0;150;0mas [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;134;0;

Unnamed: 0,data,scientist,passion,learning,problems,technical,predictive,analytics,product,working,...,bachelor,applied,problem,solving,skills,solutions,role,driven,products,reading
data_scientist f-score,0.263723,0.200751,0.1433,0.273951,0.260358,0.274934,0.384243,0.406654,0.138335,0.192361,...,0.134984,0.313526,0.163058,0.154985,0.201764,0.171211,0.0,0.0,0.0,0.0
data_analyst f-score,0.296161,0.0,0.0,0.0,0.0,0.0,0.0,0.247226,0.246459,0.137601,...,0.168793,0.0,0.214668,0.212668,0.242059,0.0,0.159064,0.156956,0.248883,0.189724


Unnamed: 0,Squared Sum,Percentage
data_scientist f-score,1.751899,32.92
data_analyst f-score,0.915484,17.2
ml_engineer f-score,0.853522,16.04
data_engineer f-score,0.832118,15.64
fullstack f-score,0.644949,12.12
devops_engineer f-score,0.323839,6.09


In [55]:
# Run the role comparison on the fullstack job-posting text, passing
# 'fullstack' and 'devops_engineer' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        case_fullstack,
        f_score_dict,
        0.3,
        'fullstack',
        2,
        'devops_engineer',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;78;0;0mjava [38;2;255;255;255m[38;2;0;150;0mfull [38;2;255;255;255m[38;2;160;0;79mstack [38;2;255;255;255m[38;2;0;0;133mdeveloper [38;2;255;255;255m[38;2;109;0;0m100 [38;2;255;255;255m[38;2;191;0;75mremote [38;2;255;255;255m[38;2;0;150;0m6 [38;2;255;255;255m[38;2;0;150;0mmonths [38;2;255;255;255m[38;2;0;150;0mcontract [38;2;255;255;255m[38;2;0;150;0mto [38;2;255;255;255m[38;2;0;150;0mhire [38;2;255;255;255m[38;2;0;150;0mcan [38;2;255;255;255m[38;2;0;150;0mspons [38;2;255;255;255m[38;2;0;150;0mjd [38;2;255;255;255m[38;2;0;150;0mmandatory [38;2;255;255;255m[38;2;78;0;0mjava [38;2;255;255;255m[38;2;255;0;0mjavascript [38;2;255;255;255m[38;2;255;0;0mreact [38;2;255;255;255m[38;2;247;0;0mangular [38;2;255;255;255m[38;2;0;150;0mwhat [38;2;255;255;255m[38;2;0;150;0myou [38;2;255;255;255m[38;2;0;150;0mwill [38;2;255;255;255m[38;2;0;150;0mneed [38;2;255;255;255m[38;2;65;0;0mcandidate [38;2;255;255;255m[38;2;0;150;0mpossess [38;2;255;255;25

Unnamed: 0,java,stack,100,remote,javascript,react,angular,candidate,experience,years,...,aws,fast,communication,skills,automation,industry,management,background,processes,tools
fullstack f-score,0.154795,0.314202,0.214439,0.375468,0.539511,0.678094,0.485325,0.129147,0.159384,0.143509,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
devops_engineer f-score,0.0,0.155419,0.0,0.148144,0.0,0.0,0.0,0.0,0.160984,0.15854,...,0.479043,0.168096,0.130805,0.122255,0.497308,0.123411,0.21571,0.19679,0.132338,0.195009


Unnamed: 0,Squared Sum,Percentage
fullstack f-score,4.045242,33.09
devops_engineer f-score,2.027184,16.58
data_scientist f-score,1.751327,14.33
data_engineer f-score,1.612291,13.19
data_analyst f-score,1.520965,12.44
ml_engineer f-score,1.267798,10.37


In [56]:
# Run the role comparison on the data-engineer job-posting text, passing
# 'data_engineer' and 'data_analyst' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        case_data_engineer,
        f_score_dict,
        0.3,
        'data_engineer',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;77;0;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;0;115mimplement [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;0;0;164mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;129;0;0msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;0;150;0mlake [38;2;255;255;255m[38;2;147;0;151mdata [38;2;255;255;255m[38;2;157;0;0mstream [38;2;255;255;255m[38;2;77;0;0mdesign [38;2;255;255;255m[38;2;0;0;95mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;65;0;0mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes [38

Unnamed: 0,design,data,systems,stream,handling,implementation,development,skills,cloud,technical,...,support,user,tableau,power,3,personal,analytical,independently,complex,paced
data_engineer f-score,0.152775,0.288689,0.254181,0.309772,0.128732,0.173862,0.177485,0.14221,0.281908,0.147617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_analyst f-score,0.0,0.296161,0.0,0.0,0.0,0.0,0.0,0.242059,0.0,0.0,...,0.255675,0.136474,0.413553,0.209979,0.145547,0.24279,0.354525,0.175467,0.201563,0.159064


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,1.762838,23.07
data_analyst f-score,1.747195,22.87
devops_engineer f-score,1.577746,20.65
data_scientist f-score,1.32143,17.29
fullstack f-score,0.861162,11.27
ml_engineer f-score,0.370853,4.85


In [57]:
# Run the role comparison on the data-analyst job-posting text, passing
# 'data_analyst' and 'data_scientist' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        case_data_analyst,
        f_score_dict,
        0.3,
        'data_analyst',
        2,
        'data_scientist',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;0;150;0min [38;2;255;255;255m[38;2;0;150;0mthis [38;2;255;255;255m[38;2;0;0;180mposition [38;2;255;255;255m[38;2;0;150;0myou [38;2;255;255;255m[38;2;130;0;0msupport [38;2;255;255;255m[38;2;0;150;0mpleo [38;2;255;255;255m[38;2;0;150;0ms [38;2;255;255;255m[38;2;89;0;0mmarket [38;2;255;255;255m[38;2;0;150;0mexpansion [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;150;0msmb [38;2;255;255;255m[38;2;0;150;0macquisition [38;2;255;255;255m[38;2;75;0;0mgrowth [38;2;255;255;255m[38;2;0;150;0mdomains [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;150;0myou [38;2;255;255;255m[38;2;0;150;0mare [38;2;255;255;255m[38;2;0;150;0ma [38;2;255;255;255m[38;2;0;150;0mmember [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;151;0;134mdata [38;2;255;255;255m[38;2;126;0;207manalytics [38;2;255;255;255m[38;2;0;150;0mcompetence [38;2;255;255;255m[38;2;0;0;77mgroup [38;2;255;255;255m

Unnamed: 0,support,market,growth,data,analytics,customers,website,product,fast,cross,...,stakeholders,learning,variety,predictive,solve,problems,level,regression,experienced,enthusiasm
data_analyst f-score,0.255675,0.176372,0.148812,0.296161,0.247226,0.230884,0.319064,0.246459,0.156596,0.182568,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_scientist f-score,0.0,0.0,0.0,0.263723,0.406654,0.0,0.0,0.138335,0.15324,0.157371,...,0.17449,0.273951,0.149428,0.384243,0.255035,0.260358,0.165754,0.319249,0.16052,0.139381


Unnamed: 0,Squared Sum,Percentage
data_analyst f-score,5.245707,30.43
data_scientist f-score,4.015857,23.3
fullstack f-score,2.395894,13.9
data_engineer f-score,2.335766,13.55
ml_engineer f-score,1.664419,9.66
devops_engineer f-score,1.578596,9.16


In [63]:
# Run the role comparison on the data-scientist job-posting text, passing
# 'data_scientist' and 'data_analyst' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        case_data_scientist,
        f_score_dict,
        0.3,
        'data_scientist',
        2,
        'data_analyst',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;0;150;0mthe [38;2;255;255;255m[38;2;0;150;0mplatform [38;2;255;255;255m[38;2;0;150;0mteam [38;2;255;255;255m[38;2;0;150;0mcreates [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;74;0;0mtechnology [38;2;255;255;255m[38;2;0;150;0mthat [38;2;255;255;255m[38;2;0;150;0menables [38;2;255;255;255m[38;2;0;150;0mspotify [38;2;255;255;255m[38;2;0;150;0mto [38;2;255;255;255m[38;2;0;0;85mlearn [38;2;255;255;255m[38;2;0;150;0mquickly [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;126;0;0mscale [38;2;255;255;255m[38;2;0;150;0measily [38;2;255;255;255m[38;2;0;150;0menabling [38;2;255;255;255m[38;2;0;150;0mrapid [38;2;255;255;255m[38;2;0;0;75mgrowth [38;2;255;255;255m[38;2;0;150;0min [38;2;255;255;255m[38;2;0;150;0mour [38;2;255;255;255m[38;2;0;150;0musers [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;150;0mour [38;2;255;255;255m[38;2;114;0;164mbusiness [38;2;255;255;255m[38;2;0;150;0maround [38;2;255;255;

Unnamed: 0,technology,scale,business,work,tools,experience,deliver,people,data,scientist,...,thinking,personal,power,potential,driven,service,clients,guidelines,order,parties
data_scientist f-score,0.146966,0.247786,0.22428,0.222272,0.158868,0.191087,0.137699,0.167188,0.263723,0.200751,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_analyst f-score,0.0,0.0,0.323216,0.168499,0.181034,0.138641,0.0,0.0,0.296161,0.0,...,0.191244,0.24279,0.209979,0.15775,0.156956,0.162802,0.192683,0.15775,0.136302,0.255251


Unnamed: 0,Squared Sum,Percentage
data_scientist f-score,4.853755,26.25
data_analyst f-score,4.465341,24.15
data_engineer f-score,2.741993,14.83
fullstack f-score,2.386945,12.91
devops_engineer f-score,2.033026,11.0
ml_engineer f-score,2.008967,10.87


In [64]:
# Run the role comparison on the ML-engineer job-posting text, passing
# 'ml_engineer' and 'data_scientist' as the role arguments, and show the
# returned table ordered by 'Percentage' (largest first).
# NOTE(review): 0.3 and 2 are positional settings; their meaning comes from
# the definition of calculate_role_distinct_visual earlier in the notebook.
(
    calculate_role_distinct_visual(
        case_ml_engineer,
        f_score_dict,
        0.3,
        'ml_engineer',
        2,
        'data_scientist',
    )
    .sort_values(by='Percentage', ascending=False)
)

[38;2;0;150;0mwhat [38;2;255;255;255m[38;2;0;150;0mif [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;0;0;88mjob [38;2;255;255;255m[38;2;0;150;0mhad [38;2;255;255;255m[38;2;0;150;0man [38;2;255;255;255m[38;2;0;150;0mimpact [38;2;255;255;255m[38;2;0;150;0mon [38;2;255;255;255m[38;2;0;150;0mshaping [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;0;150;0mfuture [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0murban [38;2;255;255;255m[38;2;0;150;0mmobility [38;2;255;255;255m[38;2;0;150;0mimagine [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;0;0;91mexperiments [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;0;0;183manalysis [38;2;255;255;255m[38;2;0;150;0mimproving [38;2;255;255;255m[38;2;0;150;0msustainable [38;2;255;255;255m[38;2;0;150;0mlast [38;2;255;255;255m[38;2;0;150;0mmile [38;2;255;255;255m[38;2;0;150;0mtransportation [38;2;255;255;255m[38;2;0;150;0mfor [38;2;255;

Unnamed: 0,industry,latest,products,development,experienced,machine,learning,engineer,excellent,experience,...,business,developing,3,technical,fast,organization,complex,data,predictive,required
ml_engineer f-score,0.151768,0.126966,0.180022,0.125445,0.160629,0.56362,0.531887,0.179469,0.243822,0.161526,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_scientist f-score,0.143262,0.0,0.0,0.0,0.16052,0.290238,0.273951,0.0,0.173654,0.191087,...,0.22428,0.146048,0.200547,0.274934,0.15324,0.236056,0.289107,0.263723,0.384243,0.200556


Unnamed: 0,Squared Sum,Percentage
ml_engineer f-score,3.439604,27.84
data_scientist f-score,3.162726,25.6
data_engineer f-score,1.790213,14.49
data_analyst f-score,1.428648,11.56
devops_engineer f-score,1.33372,10.79
fullstack f-score,1.201098,9.72


## Above is just a snippet of the whole lecture. 
## Web scraping, further optimizations (for example, adding bias and using different stop words), and machine learning optimization are skipped.
## However, even without optimization, most of the results show a clear message: some cases ask for more than what the role defines. For example, a post may ask for a data engineer when it actually also needs a data analyst. (Some cases do need better optimization, though.)
