## Imports and initializing

In [49]:
#Important pip installs (Once per hardware is generally enough) 
#Preferred to be done in an external terminal in a specific virtual environment.
#However, if not possible, run this cell to download it.
import sys
#!{sys.executable} -m pip install --upgrade pip
#!{sys.executable} -m pip install numpy
#!{sys.executable} -m pip install pandas

## if error (e.g. c++ error) open jupyter notebook through anaconda prompt
## installataion https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html
#!{sys.executable} -m pip install scattertext  


#!{sys.executable} -m pip install nltk
#!{sys.executable} -m pip install regex
#!{sys.executable} -m pip install requests

In [1]:
#Typical data science imports
import numpy as np
import pandas as pd

In [43]:
#Specific NLP imports

import scattertext as st


#Stopwords to remove from data to improve NLP
import requests     
stopwords_list = requests.get("https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt").content
global stopwords 
stopwords = set(stopwords_list.decode().splitlines()) 
special_words = ['ml','ci','cd']
for word in special_words:
    stopwords.remove(word)


In [44]:
# Stemmers (optional)  https://www.geeksforgeeks.org/python-stemming-words-with-nltk/
# In short: stemmers change words to their base form. (In ideal cases)
# program  :  program
# programs  :  program
# programmer  :  program
# programming  :  program
# programmers  :  program
# However, stemmers could make faulty decisions and ruin the data
from nltk.stem import PorterStemmer
ps = PorterStemmer()

from nltk.stem.snowball import SnowballStemmer
ss = SnowballStemmer('english')

from nltk.stem import LancasterStemmer
ls=LancasterStemmer()


In [45]:
#Regex for text processing
import re
def remove_special_characters(word_list,remove_stop_words = True):
    global stopwords
    res = re.findall('(?!_)\w+(?<!_)', word_list.lower())
    if remove_stop_words == True:
        no_stop_words = [word for word in res if word not in stopwords]
        res = " ".join(no_stop_words)
    else:
        with_stop_words = [word for word in res]
        res = " ".join(with_stop_words)
    return res

In [46]:
#Extra for specific situations
from scipy.stats import hmean
def compare_df_category_count(word_df_list):
    all_df = pd.concat(word_df_list)
    grouped_df = all_df.groupby(all_df.index.tolist(),as_index=False).size().sort_values(by='size', ascending = False)
    return grouped_df

## Functions

### 1. Functions for reading csv files to get the F-score

In [47]:
def csv_string_list_to_df_list(csv_list,drop_duplicates=True,drop_rows_to_lowest = True):
    """Reads csv files and converts to a list of data frame for each file(role).
    Args:
        text (str): .csv file names including .csv   (e.g. data_scientist.csv)
    Returns:
        list of pandas.DataFrame: representing each category. 
    """
    
    #Read csv file and prune it. Repeat for all csv files.
    df_list = []
    for csv in csv_list:
        category_df = pd.read_csv(csv)         #Reads csv file
        category_name = csv.split('.')[0]      #The category gets named based on file name. fullstack.csv => fullstack
        category_df['title'] = [category_name]*len(category_df) #continuation of above line
        category_df.dropna(subset=['requirements'],inplace=True)          #drop rows with null
        if drop_duplicates == True:
            category_df =  category_df[category_df.groupby(['title','requirements']).cumcount().le(0)] #remove duplicates
            #category_df.drop_duplicates(subset=['requirements'],inplace=True) 
        df_list.append(category_df)  
    
    #To remove participation bias, all roles should have the same amount of data.
    #Cut down rows so that all roles have the same rows as the role with the lowest amount of rows.
    if drop_rows_to_lowest == True:
        lowest_len = len(df_list[0])         
        for df in df_list:                   
            if lowest_len > len(df):
                lowest_len = len(df)
        df_list = list(map(lambda x: x[:lowest_len],df_list))    
    
    
    #Add the column to be parsed. requirements column will be used for the model.
    new_df =[]
    for df in df_list:                 
        df['parsed'] =df['requirements'].apply(remove_special_characters)     
        #df['parsed'] =df['parsed'].apply(ss.stem)       #stemming could be added
        df['parsed'] =df['parsed'].apply(st.whitespace_nlp_with_sentences)   #NLP Tokenizing 
        df = df[['title','parsed']]      #Drop every other column to get correct format.
        new_df.append(df)
    return new_df
    

In [48]:
def get_f_score(pruned_df_list,beta = 1):
    """Compares each df in df_list and returns word frequency and F-Score
    Args:
        list of pandas.DataFrame: (from method: csv_string_list_to_df_list)
    Returns:
        pandas.DataFrame: with frequency and F-score (roles*2 columns)
    """
    
    # Calculates frequency and f-score from df_list.
    all_categories_df = pd.concat(pruned_df_list) 
    corpus = (
        st.CorpusFromParsedDocuments(
            all_categories_df,                             #df of interest with parsed column
            category_col="title",                          #target column for comparison (x)
            parsed_col="parsed")                           #target column for parsed data (y)
            .build()                                       #instantiate build
            .get_unigram_corpus()                          #comparing single word by word. 
            #.compact(st.AssociationCompactor(2000))       #limiting only 2000 items to the corpus
    )
    # Makes a Dataframe out of the calculated data.
    #F-score = Harmonic mean of precision and frequency with beta 
    #beta < 1 => precision favored, vice verca.   beta = 0 => f-score = precision  vice verca
    #beta = 1 => freq and precision equally favored.

    freq_list = corpus.get_term_freq_df()   #Word frequency data frame
    f_score_list = freq_list.copy()      # to get the column names correct, temp1 and temp2 is needed
    for df in pruned_df_list:                #Adds F-score for each category
        precision = freq_list[df['title'].iloc[0] + ' freq']/freq_list.sum(axis=1)
        frequency = freq_list[df['title'].iloc[0] + ' freq']/freq_list[df['title'].iloc[0] + ' freq'].sum()
        f_score_list[df['title'].iloc[0] + ' f-score'] = (1+beta**2) * precision * frequency / (beta**2 * precision + frequency)
        
        
        
    return f_score_list.fillna(0) #Null occurs when 0-division happens, which is when presicion and freq = 0

### 2. Functions for calculating role based on F-score

In [13]:

def calculate_role(text,f_score_list,diminishing_repetition = False):
    """Gives each word in text score based on the f_score_list and 
    returns a data frame with scores for each role. 
    The score is squared to reward high-scoring/confident words.
    If diminishing_repetition is True, then repeating words are less valued (ratio: 1/n)
    Args:
        text (str): Text to calculate role. 
        f_score_list (dataframe): (from previous method: get_f_score)
        diminishing_repetition (bool): If True, f_score will be divided by 1/n where n = times word appeared
    Returns:
        pandas.DataFrame: n long dataframe Each roles column name and respective F-score and percentage
    """
    
    #Goes through text word by word, giving points to each role respectively.
    word_dict = {} 
    text = remove_special_characters(text)
    f_scores = f_score_list.filter(regex='f-score')**2 # Only squared f-scores (no frequency columns)
    score_array = [0 for x in range(len(f_scores.columns))]  # creates [0,0,0,...,n] for n roles.
    score_df = pd.DataFrame(score_array, index=f_scores.columns ,columns=['Squared sum']).transpose()
    
    for w in text.split(' '):
        if w in f_scores.index:                             # if word is in f_score_list
            if w in word_dict:                              # if word has appeared before(diminishing repetion)
                word_dict[w] += 1
            else:
                word_dict[w] = 1
            score_df+= (f_scores.loc[w])/word_dict[w] if diminishing_repetition == True else f_scores.loc[w]
            
    score_df = score_df.transpose()
    score_df['Percentage'] = score_df.apply(lambda x: round(100*x/x.sum(),2))
                
    return score_df                                                     

## An example of usage

### Initializing f-score list

In [14]:
#We have 6 different roles we want to analyze
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']


In [15]:
#Start with making dataframes of each role by reading the csv files.
df_list = csv_string_list_to_df_list(csv_list)

In [16]:
#Calculate the frequency and f-score of this df_list
#Lower beta => precision favored, vice verca.   beta = 0 => f-score = precision  vice verca
#For this data set, beta = 0.05 is good to find category specific words. 
#Supervised learning will optimize this later.
f_score_list = get_f_score(df_list,beta = 0.05)

In [17]:
#What does it look like?
f_score_list

Unnamed: 0_level_0,fullstack freq,data_engineer freq,data_analyst freq,data_scientist freq,ml_engineer freq,devops_engineer freq,fullstack f-score,data_engineer f-score,data_analyst f-score,data_scientist f-score,ml_engineer f-score,devops_engineer f-score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
computer,34,36,28,72,73,25,0.120414,0.125574,0.097322,0.248909,0.257866,0.088338
science,30,39,36,115,54,21,0.096996,0.124356,0.114417,0.363700,0.174180,0.067756
technical,22,27,22,51,14,28,0.123168,0.147587,0.119588,0.274901,0.078059,0.156196
field,11,27,29,59,25,10,0.062633,0.150042,0.160245,0.323237,0.141756,0.056731
relevant,10,11,13,20,7,7,0.120689,0.126173,0.147385,0.222815,0.083741,0.083831
...,...,...,...,...,...,...,...,...,...,...,...,...
acquiring,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.059994
isolate,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.059994
defects,0,0,0,0,0,2,0.000000,0.000000,0.000000,0.000000,0.000000,0.113213
reproduce,0,0,0,0,0,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.059994


In [18]:
#Lets order by fullstack f-score
f_score_list.sort_values(by="fullstack f-score",ascending = False)

Unnamed: 0_level_0,fullstack freq,data_engineer freq,data_analyst freq,data_scientist freq,ml_engineer freq,devops_engineer freq,fullstack f-score,data_engineer f-score,data_analyst f-score,data_scientist f-score,ml_engineer f-score,devops_engineer f-score
term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
react,44,1,0,0,4,1,0.677937,0.014445,0.000000,0.000000,0.060942,0.015256
javascript,49,5,5,4,3,10,0.539422,0.052542,0.051983,0.040925,0.032761,0.109312
node,27,1,0,0,3,6,0.519879,0.017775,0.000000,0.000000,0.056960,0.114115
competent,17,0,0,0,0,1,0.515424,0.000000,0.000000,0.000000,0.000000,0.029739
html5,15,0,0,0,0,0,0.500166,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
geographies,0,1,0,0,0,0,0.000000,0.049142,0.000000,0.000000,0.000000,0.000000
extends,0,1,0,1,0,0,0.000000,0.046846,0.000000,0.041797,0.000000,0.000000
groups,0,2,2,0,1,1,0.000000,0.078937,0.075875,0.000000,0.045986,0.046177
persuade,0,1,0,0,0,0,0.000000,0.049142,0.000000,0.000000,0.000000,0.000000


In [19]:
#If you want to know the f-score of a speicific word
f_score_list.loc['git']
#E.g. you know there is a new term that has not become widely known yet

fullstack freq             21.000000
data_engineer freq          5.000000
data_analyst freq           0.000000
data_scientist freq         3.000000
ml_engineer freq           11.000000
devops_engineer freq       35.000000
fullstack f-score           0.233748
data_engineer f-score       0.053099
data_analyst f-score        0.000000
data_scientist f-score      0.031010
ml_engineer f-score         0.121448
devops_engineer f-score     0.386810
Name: git, dtype: float64

### Using the f-score list to make predictions

In [20]:
##With this f_score_list, many calculations can be done
##One example could be to calcualte what role a CV best fits.

In [21]:
#Lets say that a consultant has this CV
cv_skills = "Python Java, React.js, SQL, NoSQL, Docker, TensorFlow, Machine Learning, Git"


In [41]:
#Calculate the score
calculate_role(cv_skills,f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
ml_engineer f-score,1.033664,39.58
fullstack f-score,0.607276,23.26
devops_engineer f-score,0.353532,13.54
data_scientist f-score,0.304234,11.65
data_engineer f-score,0.228319,8.74
data_analyst f-score,0.084325,3.23


In [20]:
##This works with any text

In [25]:
#Lets say we have a consultant description (Fullstack cv)
cv_description = "Christine is an enthousiastic and curious creator who works well with any team. She keeps her eye on the project as a whole with a focus on UX. With over 10 years experience in design she has the ability to code quick and goodlooking solutions, especially in the frontend. Her knowledge of backend allows her to jump in where needed to bring the product to completion. She is often described as creative, encouraging, capable and kind."

In [42]:
calculate_role(cv_description,f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
fullstack f-score,0.726795,30.84
data_analyst f-score,0.430747,18.28
data_scientist f-score,0.379495,16.11
data_engineer f-score,0.305552,12.97
devops_engineer f-score,0.273051,11.59
ml_engineer f-score,0.240664,10.21


In [32]:
#Lets copy different job posts and try applying the method on it
#The variables are named after what the post looked for. 
#e.g case_data_engineer was a announcement searching for a data engineer
test_case = "Searching for an intro-level data scientist eager to gain exposure to solving real-world problems with data-driven solutions. This applicant is expected to be familiar standard data science approaches such as data ingest, ETL, preprocessing, model building, model deploying, and monitoring models once deployed. More specifically, they will demonstrate experience working with raw data by processing and ingesting into a database, cleaning it to be fed into a model, and deploying it for some application. Along with this, the applicant can explain findings clearly to a general audience. Job Description Troubleshoot problems with data ingesting and processing Contribute to research and identification most accurate model architecture for current problem. Collaborate in data science meetings by presenting on theoretical or applied data science topics Demonstrate the ability to troubleshoot in cloud compute environments such as AWS, Azure, and Google Cloud Develop visualizations to convey meaningful insights Present insights from models to non-technical crowd to convey findings Suggest meaningful approaches, models, and services to addressing problems. Stay informed about trends and recent capabilities within data science. Requirements Undergraduate degree in or completing a degree data science, statistics, computer science, or a relative quantitative field. Moderate experience in Python or R is preferred. Other equivalent languages are considered. Understanding of relational database querying language such as SQL Experienced in machine learning and also, neural network techniques is preferred Ability to explain insights in laymen’s terms in order inform business decisions Exposure to one more neural network framework – Tensorflow, Torch/ PyTorch, ONNX, etc Previously worked with distributed compute environments such as Hadoop."
case_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
case_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
case_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
case_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
case_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
case_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

In [34]:
#Try it yourself by replacing the first argument with the variable name
calculate_role(case_data_scientist,f_score_list).sort_values(by='Percentage',ascending = False)

#It seems to be able to find correct answer for all of these cases
#However, some are very close
#cases_data_engineer might actually need a data_analyst and a data_scientist
#This is where the definitions collide. As the web scraped data is from 

Unnamed: 0,Squared sum,Percentage
data_scientist f-score,6.60893,24.17
data_analyst f-score,6.511693,23.82
data_engineer f-score,4.412741,16.14
fullstack f-score,3.717925,13.6
devops_engineer f-score,3.097872,11.33
ml_engineer f-score,2.990895,10.94


### Improve F-score calculation

### Visual improvement

In [50]:
# Adding color to calculate role
def colored(r, g, b, text):
    return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)

#Lets expand on the calcluate role method and add visuals
#You can compare 2 categories
def calculate_role_visual(text,f_score_list,diminishing_repetition = False, category1 = 'none',brightness = 1,category2 = 'none'):
    """Calculate_role but with visuals
        Args:
            text (str): Text to calculate role. 
            f_score_dict (dict<dataframe>): By separating f_score_list by category and adding each category in one dictionary
            diminishing_repetition (bool): If True, f_score will be divided by 1/n where n = times word appeared
            category1 (str): name of category to be visually shown Red
            brightness (int): color sharpness
            category1 (str): name of category to be visually shown Blue and compared with category1 
    """
    score_df = calculate_role(text,f_score_list,diminishing_repetition)
    
    ##here comes the visual
    # red are words that category1 picks up
    # blue are words that category2 picks up
    # green is when no category is picked up
    if category1 != 'none':
        processed_text = remove_special_characters(text,remove_stop_words = False)
        colored_list = []
        word_list1 = {}
        word_list2 = {}

        for word in processed_text.split():
            color1 = 0
            color2 = 0
            color3 = 150
            if word in f_score_list[category1 + ' f-score'].index:
                if f_score_list[category1 + ' f-score'].loc[word] > 0:
                    word_list1[word] = f_score_list[category1 + ' f-score'].loc[word]
                    score1 = f_score_list[category1 + ' f-score'].loc[word]
                    color1 = score1*brightness if score1*brightness <= 1 else 1
                    color3 = 0
            if category2 != 'none':
                if word in f_score_list[category2 + ' f-score'].index:   
                    if f_score_list[category2 + ' f-score'].loc[word] > 0:
                        word_list2[word] = f_score_list[category2 + ' f-score'].loc[word]
                        score2 = f_score_list[category2 + ' f-score'].loc[word]
                        color2 = score2*brightness if score2*brightness <= 1 else 1
                        color3 = 0
            colored_list.append(colored(int(color1*255),color3,int(color2*255),word))
        print("".join(colored_list))

        df1 = pd.DataFrame.from_dict(word_list1, orient='index',columns=[category1 + ' f-score'])
        if category2 == 'none':
            display(df1.transpose().fillna(0))
            return   score_df
        df2 = pd.DataFrame.from_dict(word_list2, orient='index',columns=[category2 + ' f-score'])
        display(pd.concat([df1, df2], axis=1).transpose().fillna(0))
    return   score_df

In [51]:
#f score list initialization
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']
df_list = csv_string_list_to_df_list(csv_list)
f_score_list = get_f_score(df_list,beta = 0.05)

In [55]:
    # red are words that category1 picks up
    # green is when no category is picked up
    # interesting to see that machine learning are so red together, even though f_score_list only reads single words
calculate_role_visual(uppdrag_ml_engineer,f_score_list,False,'ml_engineer',2).sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mwhat [38;2;255;255;255m[38;2;0;150;0mif [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;13;0;0mjob [38;2;255;255;255m[38;2;0;150;0mhad [38;2;255;255;255m[38;2;0;150;0man [38;2;255;255;255m[38;2;37;0;0mimpact [38;2;255;255;255m[38;2;0;150;0mon [38;2;255;255;255m[38;2;0;150;0mshaping [38;2;255;255;255m[38;2;0;150;0mthe [38;2;255;255;255m[38;2;23;0;0mfuture [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0murban [38;2;255;255;255m[38;2;0;150;0mmobility [38;2;255;255;255m[38;2;0;150;0mimagine [38;2;255;255;255m[38;2;0;150;0myour [38;2;255;255;255m[38;2;23;0;0mexperiments [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;40;0;0manalysis [38;2;255;255;255m[38;2;21;0;0mimproving [38;2;255;255;255m[38;2;0;150;0msustainable [38;2;255;255;255m[38;2;0;150;0mlast [38;2;255;255;255m[38;2;0;150;0mmile [38;2;255;255;255m[38;2;49;0;0mtransportation [38;2;255;255;255m[38;2;0;150;0mfor [38;2;255;255;2

Unnamed: 0,job,impact,future,experiments,analysis,improving,transportation,industry,team,latest,...,predictive,modeling,optimization,techniques,work,days,professional,proficiency,english,required
ml_engineer f-score,0.026805,0.073713,0.045986,0.045986,0.079712,0.042122,0.096394,0.151607,0.112535,0.126366,...,0.098284,0.156641,0.24224,0.17721,0.146368,0.096394,0.153586,0.190857,0.127707,0.054484


Unnamed: 0,Squared sum,Percentage
ml_engineer f-score,5.386756,30.22
data_scientist f-score,4.064853,22.81
data_engineer f-score,2.38691,13.39
data_analyst f-score,2.087298,11.71
devops_engineer f-score,2.012819,11.29
fullstack f-score,1.885243,10.58


In [None]:
3

In [40]:
    # red are words that category1 picks up
    # blue are words that category2 picks up
    # green is when no category is picked up
# interesting that glue, ec2, lambda, s3, redshift, postgresql are all picked up as mostly data_engineer
calculate_role_visual(uppdrag_data_engineer,f_score_list,True,'data_analyst',2,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;59;0;77mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;115;0;14mimplement [38;2;255;255;255m[38;2;151;0;147mdata [38;2;255;255;255m[38;2;0;0;19mextraction [38;2;255;255;255m[38;2;164;0;45mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;151;0;147mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;17;0;54mexisting [38;2;255;255;255m[38;2;63;0;129msystems [38;2;255;255;255m[38;2;40;0;20mupdate [38;2;255;255;255m[38;2;49;0;63mcurrent [38;2;255;255;255m[38;2;151;0;147mdata [38;2;255;255;255m[38;2;0;0;45mlake [38;2;255;255;255m[38;2;151;0;147mdata [38;2;255;255;255m[38;2;0;0;157mstream [38;2;255;255;255m[38;2;59;0;77mdesign [38;2;255;255;255m[38;2;95;0;14mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;65mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;17;0;36mtypes 

Unnamed: 0,design,implement,data,process,existing,systems,update,current,making,types,implementation,support,development,graphical,user,presentation,core,skills,cloud,technical,experience,aws,services,redshift,postgresql,architecture,preferably,tableau,power,azure,python,3,document,personal,analytical,mindset,work,independently,collaboratively,complex,translate,business,requirements,paced,fluent,english,extraction,lake,stream,handling,streams,interface,glue,ec2,lambda,s3,devops,flows
data_analyst f-score,0.117191,0.226394,0.29616,0.322324,0.03407,0.124619,0.078859,0.096801,0.187508,0.03407,0.0242,0.255658,0.060065,0.089409,0.136461,0.097472,0.09452,0.242057,0.016486,0.119588,0.138641,0.005682,0.373585,0.044139,0.025428,0.028071,0.131352,0.413527,0.209948,0.031612,0.103102,0.145542,0.095704,0.242763,0.354511,0.03295,0.168497,0.17545,0.037938,0.201554,0.037938,0.323209,0.124577,0.159048,0.058243,0.059473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.152735,0.029142,0.288678,0.089243,0.105898,0.254144,0.041086,0.124072,0.027541,0.070599,0.1737,0.103618,0.177463,0.0,0.059673,0.039782,0.04843,0.142199,0.28185,0.147587,0.180937,0.217189,0.208131,0.474155,0.052214,0.156641,0.11479,0.08621,0.061883,0.176723,0.173535,0.114644,0.131908,0.090315,0.073116,0.0,0.152254,0.059673,0.078937,0.092482,0.0,0.204592,0.23869,0.060881,0.090045,0.166077,0.037973,0.089509,0.309414,0.128526,0.049142,0.044754,0.171368,0.210105,0.189867,0.264978,0.044622,0.093692


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,1.706747,22.01
data_analyst f-score,1.696887,21.88
devops_engineer f-score,1.565356,20.18
data_scientist f-score,1.338064,17.25
fullstack f-score,0.903436,11.65
ml_engineer f-score,0.545087,7.03


In [41]:
#As you can see, the visuals are very muddled with words that are from other categories. e.g 'devops'
#Which is why we need to only include words that are distinct to that category to
#this will make the visuals much clearer

## Improving F-score calculations and visuals


### Distinct F-score calculating with dictionaries to get distinct f-scores

In [42]:


def calculate_role_distinct(text,f_score_dict,diminishing_repetition = False):
    """Gives each word in text score based on the f_score_list and 
    returns a array(score_array) with scores for each role.
    Different from calculate_role, calculate_role_distinct adds only to its own category.
    It does this by separating each category in a dictionary. 
    For example, the top 50 f-scoring words in each category gets added to its seperate key in a dictionary(f_score_dict)
    If for example 'Data' were to be top 50 words in data_scientist, but not in fullstack, 
    then 'Data' would obly give points to data_scientist.
    Reason to use this: to get more clear visuals(each word is more prone connected to only one category).
    
    If diminishing_repetition is True, then repeating words are less valued (ratio: 1/n)
    The score is squared to reward high-scoring/confident words.
    Args:
        text (str): Text to calculate role. 
        f_score_dict (dict<dataframe>): By separating f_score_list by category and adding each category in one dictionary
        diminishing_repetition (bool): If True, f_score will be divided by 1/n where n = times word appeared
    Returns:
        pandas.DataFrame: n long dataframe Each roles column name and respective F-score and percentage
    """
    
    #Goes through text word by word, giving points to each role respectively.
    #OBS. negative i because f-scores in f_score_list are formatted to be the last columns 
    text = remove_special_characters(text)
    score_dict = {}
    word_dict = {} 
    for df_name in f_score_dict:
        score_dict[df_name+' f-score'] = 0  # creates [0,0,0,...,n] for n roles.
    for w in text.split(' '):
        if w in word_dict:
            word_dict[w] += 1
        else:
            word_dict[w] = 1
        for df_name in f_score_dict:
            if w in f_score_dict[df_name].index:    # if word is in f_score_list
                score = f_score_dict[df_name].loc[w][df_name + ' f-score']**2
                score_dict[df_name+' f-score'] += score/word_dict[w] if diminishing_repetition == True else score
    score_df = pd.DataFrame.from_dict(score_dict,orient='index',columns = ['Squared Sum'])
    score_df['Percentage'] = score_df.apply(lambda x: round(100*x/x.sum(),2))
    return score_df 


In [43]:
def colored(r, g, b, text):
    return "\033[38;2;{};{};{}m{} \033[38;2;255;255;255m".format(r, g, b, text)


def calculate_role_distinct_visual(text,f_score_dict,diminishing_repetition = False, category1 = 'none',brightness = 1,category2 = 'none'):
    """Calculate_role_distinct but with visuals
        Args:
            text (str): Text to calculate role. 
            f_score_dict (dict<dataframe>): By separating f_score_list by category and adding each category in one dictionary
            diminishing_repetition (bool): If True, f_score will be divided by 1/n where n = times word appeared
            category1 (str): name of category to be visually shown Red
            brightness (int): color sharpness
            category1 (str): name of category to be visually shown Blue and compared with category1
    """
    score_df = calculate_role_distinct(text,f_score_dict,diminishing_repetition)
    
    ##here comes the visuals
    # red are words that category1 picks up
    # blue are words that category2 picks up
    # green is when no category is picked up
    if category1 != 'none':
        processed_text = remove_special_characters(text,remove_stop_words = False)
        colored_list = []
        word_list1 = {}
        word_list2 = {}

        for word in processed_text.split():
            color1 = 0
            color2 = 0
            color3 = 150
            for df_name in f_score_dict:
                if word in f_score_dict[category1].index:
                    color3 = 0
                    word_list1[word] = f_score_dict[category1].loc[word][category1 + ' f-score']
                    score1 = f_score_dict[category1].loc[word][category1 + ' f-score']
                    color1 = score1*brightness if score1*brightness <= 1 else 1
                if category2 != 'none':
                    if word in f_score_dict[category2].index and category2 != 'none':    
                        color3 = 0
                        word_list2[word] = f_score_dict[category2].loc[word][category2 + ' f-score']
                        score2 = f_score_dict[category2].loc[word][category2 + ' f-score']
                        color2 = score2*brightness if score2*brightness <= 1 else 1
            colored_list.append(colored(int(color1*255),color3,int(color2*255),word))
        print("".join(colored_list))

        df1 = pd.DataFrame.from_dict(word_list1, orient='index',columns=[category1 + ' f-score'])
        if category2 == 'none':
            display(df1.transpose())
            return   score_df
        df2 = pd.DataFrame.from_dict(word_list2, orient='index',columns=[category2 + ' f-score'])
        display(pd.concat([df1, df2], axis=1).transpose().fillna(0))
    return   score_df

#### Initializing

In [44]:
csv_list = ['fullstack.csv','data_engineer.csv','data_analyst.csv','data_scientist.csv','ml_engineer.csv','devops_engineer.csv']
df_list = csv_string_list_to_df_list(csv_list)
f_score_list = get_f_score(df_list,beta = 0.05)

In [45]:

max_word_count = 200  
chosen_f_score_list = f_score_list

f_score_dict={}
f_score_dict['fullstack'] = chosen_f_score_list.sort_values(by="fullstack f-score",ascending = False)[:max_word_count]
f_score_dict['data_engineer'] = chosen_f_score_list.sort_values(by="data_engineer f-score",ascending = False)[:max_word_count]
f_score_dict['data_analyst'] = chosen_f_score_list.sort_values(by="data_analyst f-score",ascending = False)[:max_word_count]
f_score_dict['data_scientist'] = chosen_f_score_list.sort_values(by="data_scientist f-score",ascending = False)[:max_word_count]
f_score_dict['ml_engineer'] = chosen_f_score_list.sort_values(by="ml_engineer f-score",ascending = False)[:max_word_count]
f_score_dict['devops_engineer'] = chosen_f_score_list.sort_values(by="devops_engineer f-score",ascending = False)[:max_word_count]


##### this initialisation can be shortened

In [46]:
def initialize_f_score_dict(chosen_f_score_list,max_word_count):
    chosen_f_score_list =chosen_f_score_list.copy()
    f_score_dict={}
    categories = chosen_f_score_list.filter(regex = 'f-score')
    for category_name in categories:
        f_score_dict[category_name.split(' ')[0]] = chosen_f_score_list.sort_values(by=category_name,ascending = False)[:max_word_count]
    return f_score_dict

In [47]:
f_score_dict = initialize_f_score_dict(f_score_list,200)

#### Calculating

In [48]:
test_uppdrag = "- Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
uppdrag_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
uppdrag_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
uppdrag_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
uppdrag_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
uppdrag_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
uppdrag_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

In [49]:
calculate_role_distinct_visual(uppdrag_fullstack,f_score_dict,True,'devops_engineer',3,'fullstack').sort_values(by='Percentage',ascending = False)

[38;2;0;0;118mjava [38;2;255;255;255m[38;2;0;150;0mfull [38;2;255;255;255m[38;2;0;0;240mstack [38;2;255;255;255m[38;2;199;0;0mdeveloper [38;2;255;255;255m[38;2;0;0;163m100 [38;2;255;255;255m[38;2;0;0;255mremote [38;2;255;255;255m[38;2;0;150;0m6 [38;2;255;255;255m[38;2;0;150;0mmonths [38;2;255;255;255m[38;2;0;150;0mcontract [38;2;255;255;255m[38;2;0;150;0mto [38;2;255;255;255m[38;2;0;150;0mhire [38;2;255;255;255m[38;2;0;150;0mcan [38;2;255;255;255m[38;2;0;150;0mspons [38;2;255;255;255m[38;2;0;150;0mjd [38;2;255;255;255m[38;2;0;150;0mmandatory [38;2;255;255;255m[38;2;0;0;118mjava [38;2;255;255;255m[38;2;0;0;255mjavascript [38;2;255;255;255m[38;2;0;0;255mreact [38;2;255;255;255m[38;2;0;0;255mangular [38;2;255;255;255m[38;2;0;150;0mwhat [38;2;255;255;255m[38;2;0;150;0myou [38;2;255;255;255m[38;2;0;150;0mwill [38;2;255;255;255m[38;2;0;150;0mneed [38;2;255;255;255m[38;2;0;150;0mcandidate [38;2;255;255;255m[38;2;0;150;0mpossess [38;2;255;255;2

Unnamed: 0,developer,experience,web,customer,working,distributed,devops,aws,integration,fast,environment,understanding,testing,automation,code,technology,management,background,software,tools,jira,java,stack,100,remote,javascript,react,angular,modern,css,html,vue,frameworks,libraries,github,technologies,patterns,microservices,proficient,driven,building
devops_engineer f-score,0.261153,0.160975,0.212658,0.207281,0.169206,0.161473,0.582208,0.478743,0.21761,0.167821,0.238545,0.1691,0.187143,0.496611,0.188457,0.173617,0.215579,0.196546,0.163612,0.194934,0.195865,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fullstack f-score,0.0,0.159383,0.412597,0.0,0.0,0.0,0.0,0.0,0.274413,0.0,0.0,0.0,0.329983,0.0,0.324945,0.0,0.0,0.0,0.208425,0.0,0.200067,0.154776,0.314109,0.214324,0.375328,0.539422,0.677937,0.485105,0.327704,0.394148,0.225897,0.210333,0.235732,0.153974,0.156323,0.245796,0.282286,0.258168,0.16443,0.286207,0.198806


Unnamed: 0,Squared Sum,Percentage
fullstack f-score,3.385886,34.15
devops_engineer f-score,1.611574,16.25
data_engineer f-score,1.401997,14.14
data_scientist f-score,1.394478,14.06
data_analyst f-score,1.16974,11.8
ml_engineer f-score,0.95163,9.6


#### Compare with normal calculate

In [50]:
max_word_count = 100
chosen_f_score_list = f_score_list

fullstack_words = chosen_f_score_list.sort_values(by="fullstack f-score",ascending = False)[:max_word_count]
data_engineer_words = chosen_f_score_list.sort_values(by="data_engineer f-score",ascending = False)[:max_word_count]
data_analyst_words = chosen_f_score_list.sort_values(by="data_analyst f-score",ascending = False)[:max_word_count]
data_scientist_words = chosen_f_score_list.sort_values(by="data_scientist f-score",ascending = False)[:max_word_count]
ml_engineer_words = chosen_f_score_list.sort_values(by="ml_engineer f-score",ascending = False)[:max_word_count]
devops_engineer_words = chosen_f_score_list.sort_values(by="devops_engineer f-score",ascending = False)[:max_word_count]

all_df =pd.concat([fullstack_words,data_engineer_words,data_analyst_words,data_scientist_words,ml_engineer_words,devops_engineer_words])
all_df = all_df[all_df.groupby('term').cumcount().le(0)]


##### this can be shortened

In [51]:
def initialize_f_score_list_cut(chosen_f_score_list,max_word_count):
    if 'size' in chosen_f_score_list.columns: #to not get duplicate sizes
        chosen_f_score_list = chosen_f_score_list.drop('size',axis=1)
    f_score_list_cut=[]
    categories = chosen_f_score_list.filter(regex = 'f-score')
    for category_name in categories:
        f_score_list_cut.append(chosen_f_score_list.sort_values(by=category_name,ascending = False)[:max_word_count])
    all_df = pd.concat(f_score_list_cut)
    
    category_count_df = compare_df_category_count(f_score_list_cut)
    f_score_list_cut_size = category_count_df.merge(all_df,left_on='index',right_on='term').set_index('index')
    f_score_list_cut_size.index.name = 'term'
    
    # delete duplicates since .drop_duplicates does not work correctly
    f_score_list_cut_size = f_score_list_cut_size[f_score_list_cut_size.groupby('term').cumcount().le(0)] 
    return  f_score_list_cut_size

In [52]:
f_score_list_cut = initialize_f_score_list_cut(f_score_list,100)

##### comparing

In [53]:
calculate_role_visual(uppdrag_data_engineer,f_score_list_cut,True,'data_scientist',3,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;73;0;116mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;82;0;22mimplement [38;2;255;255;255m[38;2;201;0;220mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;98;0;68mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;201;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;91;0;194msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;201;0;220mdata [38;2;255;255;255m[38;2;0;150;0mlake [38;2;255;255;255m[38;2;201;0;220mdata [38;2;255;255;255m[38;2;21;0;236mstream [38;2;255;255;255m[38;2;73;0;116mdesign [38;2;255;255;255m[38;2;0;150;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mty

Unnamed: 0,design,implement,data,process,systems,stream,support,development,presentation,skills,cloud,technical,aws,services,architecture,tableau,azure,python,3,personal,analytical,work,complex,business,requirements,ec2,s3,redshift,postgresql,devops
data_scientist f-score,0.095486,0.108422,0.263718,0.12892,0.119997,0.028655,0.125336,0.089632,0.340568,0.201754,0.059937,0.274901,0.04506,0.039876,0.027471,0.21486,0.04628,0.228303,0.200519,0.021337,0.238719,0.22226,0.289048,0.224258,0.176151,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.152735,0.029142,0.288678,0.089243,0.254144,0.309414,0.103618,0.177463,0.039782,0.142199,0.28185,0.147587,0.217189,0.208131,0.156641,0.08621,0.176723,0.173535,0.114644,0.090315,0.073116,0.152254,0.092482,0.204592,0.23869,0.210105,0.264978,0.474155,0.052214,0.044622


Unnamed: 0,Squared sum,Percentage
data_analyst f-score,1.426353,22.82
data_engineer f-score,1.380635,22.09
devops_engineer f-score,1.340761,21.45
data_scientist f-score,1.040955,16.65
fullstack f-score,0.693507,11.1
ml_engineer f-score,0.368076,5.89


In [54]:
#As you can see, above is not very clear. 
#some words that should not be picked up are picked up because some other category has that word
#For example, 'devops' being colored is very misleading, since it is actually a more of a devops_engineer word.
f_score_list.loc['devops']


fullstack freq              7.000000
data_engineer freq          3.000000
data_analyst freq           0.000000
data_scientist freq         0.000000
ml_engineer freq            1.000000
devops_engineer freq       37.000000
fullstack f-score           0.111274
data_engineer f-score       0.044622
data_analyst f-score        0.000000
data_scientist f-score      0.000000
ml_engineer f-score         0.015713
devops_engineer f-score     0.582208
Name: devops, dtype: float64

In [55]:
#However, with below when doing distinct, the colors and words are much more clear.
#If the word is colored, it is one of the top 200 words of that category

In [56]:
calculate_role_distinct_visual(uppdrag_data_engineer,f_score_dict,True,'data_analyst',3,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;0;116mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;173;0;0mimplement [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;246;0;0mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;0;0;194msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mlake [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;236mstream [38;2;255;255;255m[38;2;0;0;116mdesign [38;2;255;255;255m[38;2;143;0;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes 

Unnamed: 0,implement,data,process,making,support,skills,services,tableau,power,personal,analytical,work,independently,complex,business,design,systems,stream,implementation,development,cloud,experience,aws,glue,ec2,lambda,s3,redshift,architecture,azure,python,requirements,english
data_analyst f-score,0.226394,0.29616,0.322324,0.187508,0.255658,0.242057,0.373585,0.413527,0.209948,0.242763,0.354511,0.168497,0.17545,0.201554,0.323209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.0,0.288678,0.0,0.0,0.0,0.0,0.208131,0.0,0.0,0.0,0.0,0.152254,0.0,0.0,0.204592,0.152735,0.254144,0.309414,0.1737,0.177463,0.28185,0.180937,0.217189,0.171368,0.210105,0.189867,0.264978,0.474155,0.156641,0.176723,0.173535,0.23869,0.166077


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,1.414357,23.34
data_analyst f-score,1.406683,23.22
devops_engineer f-score,1.207791,19.93
data_scientist f-score,1.101773,18.18
fullstack f-score,0.660548,10.9
ml_engineer f-score,0.26757,4.42


### Adding score to influence f-score

#### Reason: to lessen the impact of high frequency words like 'data'

In [57]:
def compare_df_category_count(word_df_list):
    all_df = pd.concat(word_df_list)
    grouped_df = all_df.groupby(all_df.index.tolist(),as_index=False).size().sort_values(by='size', ascending = False)
    return grouped_df

In [58]:
def add_score_to_highest_f_score_with_size(f_score_list_cut,amount):
    """Adds score to the first, second, and third with descending amount. Requires f_score_list_cut to include size"""
    f_score_list_cut = f_score_list_cut.copy()
    highest_value_in_row = f_score_list_cut.filter(regex='f-score').idxmax(axis=1)
    
    for row_name in f_score_list_cut[f_score_list_cut['size']==1].index:
        f_score_list_cut[highest_value_in_row[row_name]][row_name] += amount
    for row_name in f_score_list_cut[f_score_list_cut['size']==2].index:
        f_score_list_cut[highest_value_in_row[row_name]][row_name] += amount/2
    for row_name in f_score_list_cut[f_score_list_cut['size']==3].index:
        f_score_list_cut[highest_value_in_row[row_name]][row_name] += amount/4
    return f_score_list_cut

def add_score_disperse(f_score_list,amount):
    """Returns f_score list after adding amount multiplied by category_f_score/total_f_score"""
    f_score_list  = f_score_list.copy()
    sum_in_row = f_score_list.filter(regex='f-score').sum(axis=1)
    for row_name in f_score_list.index:
        for category in f_score_list.filter(regex='f-score'):
            ratio = f_score_list[category][row_name]/sum_in_row[row_name]
            #print(ratio)
            f_score_list[category][row_name] += (amount * ratio)
    return f_score_list

#### Initialize the f-score list with added score

In [59]:
max_word_count = 1000  #Amount of words in each category to be in the adding process, Chosen by testing
#will be optimised with machine learning
chosen_f_score_list = f_score_list

#Take the n highest f-score words of each category
f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)

#Add score to highest, second, third with diminishing amount
added_f_score_list = add_score_to_highest_f_score_with_size(f_score_list_cut,0.5)


#### use this added-f-score list instead

In [60]:
max_word_count = 500  #Amount of words in each category, chosen by testing, will be optimised with ml
chosen_f_score_list = added_f_score_list

f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)
f_score_dict= initialize_f_score_dict(chosen_f_score_list,max_word_count)

In [61]:
calculate_role_visual(uppdrag_data_engineer,f_score_list_cut,True,'data_analyst',3,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;255;0;22mimplement [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;29mextraction [38;2;255;255;255m[38;2;246;0;68mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;95;0;194msystems [38;2;255;255;255m[38;2;255;0;31mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mlake [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;239;0;21mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;255mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;26;0;54mty

Unnamed: 0,implement,data,process,systems,update,making,types,support,graphical,presentation,skills,cloud,technical,experience,aws,services,redshift,tableau,power,python,3,document,personal,analytical,work,collaboratively,complex,translate,business,requirements,paced,extraction,lake,stream,handling,glue,ec2,lambda,s3,devops,flows
data_analyst f-score,0.351394,0.29616,0.322324,0.124619,0.578859,0.312508,0.03407,0.255658,0.589409,0.097472,0.242057,0.016486,0.119588,0.138641,0.005682,0.373585,0.044139,0.538527,0.334948,0.103102,0.145542,0.095704,0.367763,0.354511,0.168497,0.037938,0.201554,0.037938,0.323209,0.124577,0.159048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.029142,0.288678,0.089243,0.254144,0.041086,0.027541,0.070599,0.103618,0.0,0.039782,0.142199,0.28185,0.147587,0.180937,0.217189,0.208131,0.974155,0.08621,0.061883,0.173535,0.114644,0.256908,0.090315,0.073116,0.152254,0.578937,0.092482,0.0,0.204592,0.23869,0.060881,0.037973,0.589509,0.809414,0.628526,0.671368,0.460105,0.689867,0.389978,0.044622,0.593692


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,5.259917,39.87
data_analyst f-score,2.631125,19.94
data_scientist f-score,2.420754,18.35
devops_engineer f-score,1.862489,14.12
ml_engineer f-score,0.706666,5.36
fullstack f-score,0.312947,2.37


In [62]:
calculate_role_distinct_visual(uppdrag_data_engineer,f_score_dict,True,'data_analyst',3,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;255;0;0mimplement [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;246;0;0mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;0;0;194msystems [38;2;255;255;255m[38;2;255;0;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mlake [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;239;0;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;255mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes 

Unnamed: 0,implement,data,process,update,making,support,graphical,skills,services,tableau,power,personal,analytical,business,systems,lake,stream,handling,cloud,aws,glue,ec2,lambda,s3,redshift,document,flows,collaboratively,requirements
data_analyst f-score,0.351394,0.29616,0.322324,0.578859,0.312508,0.255658,0.589409,0.242057,0.373585,0.538527,0.334948,0.367763,0.354511,0.323209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.0,0.288678,0.0,0.0,0.0,0.0,0.0,0.0,0.208131,0.0,0.0,0.0,0.0,0.204592,0.254144,0.589509,0.809414,0.628526,0.28185,0.217189,0.671368,0.460105,0.689867,0.389978,0.974155,0.256908,0.593692,0.578937,0.23869


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,4.983451,45.03
data_analyst f-score,2.387526,21.57
data_scientist f-score,2.314067,20.91
devops_engineer f-score,1.006164,9.09
ml_engineer f-score,0.375328,3.39
fullstack f-score,0.0,0.0


### Disperse score by ratio

#### Initialize added_f_score_list with disperse method

In [63]:
max_word_count = 1000
chosen_f_score_list = f_score_list

f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)
added_f_score_list = add_score_disperse(f_score_list_cut,1)

#### Initialize dictionary and list for both calculate methods

In [64]:
max_word_count = 500
chosen_f_score_list = added_f_score_list

f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)
f_score_dict= initialize_f_score_dict(chosen_f_score_list,max_word_count)

In [65]:
calculate_role_visual(uppdrag_data_engineer,f_score_list_cut,True,'data_analyst',1,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;192;0;24mimplement [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;0;47mextraction [38;2;255;255;255m[38;2;197;0;54mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;65;0;134msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;0;182mlake [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;150;0;22mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;228mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;36;0;75mtyp

Unnamed: 0,implement,data,process,systems,making,types,implementation,support,development,graphical,presentation,skills,cloud,technical,aws,services,redshift,tableau,power,azure,python,document,personal,analytical,mindset,complex,translate,business,requirements,paced,extraction,lake,stream,handling,glue,ec2,lambda,s3,devops,flows
data_analyst f-score,0.753815,0.597388,0.773443,0.258426,0.589917,0.142251,0.069082,0.593306,0.123633,1.089409,0.258192,0.493323,0.034687,0.252537,0.011963,0.791656,0.121856,0.962515,0.743464,0.075744,0.212686,0.359107,0.661887,0.775436,0.126253,0.446506,0.212724,0.676467,0.294666,0.419484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.097034,0.582297,0.214147,0.527027,0.086647,0.294769,0.49584,0.240465,0.365278,0.0,0.105377,0.289808,0.593008,0.311664,0.457269,0.441047,1.309022,0.20066,0.219138,0.423437,0.357981,0.494954,0.246242,0.159929,0.0,0.204876,0.0,0.428206,0.564582,0.16057,0.186961,0.716176,1.06667,0.897654,1.171368,0.680082,0.869312,0.705405,0.103816,1.093692


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,12.913005,28.84
data_analyst f-score,9.428894,21.06
data_scientist f-score,8.976409,20.05
devops_engineer f-score,8.519023,19.03
ml_engineer f-score,2.953616,6.6
fullstack f-score,1.986363,4.44


In [66]:
calculate_role_distinct_visual(uppdrag_data_engineer,f_score_dict,True,'data_analyst',1,'data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;192;0;0mimplement [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;197;0;0mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;0;0;134msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;0;182mlake [38;2;255;255;255m[38;2;152;0;148mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;150;0;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;228mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes 

Unnamed: 0,implement,data,process,making,support,graphical,skills,services,tableau,power,personal,analytical,business,systems,lake,stream,handling,implementation,cloud,glue,ec2,lambda,s3,redshift,document,flows,requirements
data_analyst f-score,0.753815,0.597388,0.773443,0.589917,0.593306,1.089409,0.493323,0.791656,0.962515,0.743464,0.661887,0.775436,0.676467,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.0,0.582297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527027,0.716176,1.06667,0.897654,0.49584,0.593008,1.171368,0.680082,0.869312,0.705405,1.309022,0.494954,1.093692,0.564582


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,10.852312,32.05
data_analyst f-score,8.411039,24.84
data_scientist f-score,7.507557,22.17
devops_engineer f-score,5.086622,15.02
ml_engineer f-score,1.237969,3.66
fullstack f-score,0.76323,2.25


### Comparison and conclusions

In [67]:
test_uppdrag = "Searching for an intro-level data scientist eager to gain exposure to solving real-world problems with data-driven solutions. This applicant is expected to be familiar standard data science approaches such as data ingest, ETL, preprocessing, model building, model deploying, and monitoring models once deployed. More specifically, they will demonstrate experience working with raw data by processing and ingesting into a database, cleaning it to be fed into a model, and deploying it for some application. Along with this, the applicant can explain findings clearly to a general audience. Job Description Troubleshoot problems with data ingesting and processing Contribute to research and identification most accurate model architecture for current problem. Collaborate in data science meetings by presenting on theoretical or applied data science topics Demonstrate the ability to troubleshoot in cloud compute environments such as AWS, Azure, and Google Cloud Develop visualizations to convey meaningful insights Present insights from models to non-technical crowd to convey findings Suggest meaningful approaches, models, and services to addressing problems. Stay informed about trends and recent capabilities within data science. Requirements Undergraduate degree in or completing a degree data science, statistics, computer science, or a relative quantitative field. Moderate experience in Python or R is preferred. Other equivalent languages are considered. Understanding of relational database querying language such as SQL Experienced in machine learning and also, neural network techniques is preferred Ability to explain insights in laymen’s terms in order inform business decisions Exposure to one more neural network framework – Tensorflow, Torch/ PyTorch, ONNX, etc Previously worked with distributed compute environments such as Hadoop."
uppdrag_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
uppdrag_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
uppdrag_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
uppdrag_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
uppdrag_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
uppdrag_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

victor_desc = "Java, .NET 5, .NET Core, .NET MVC, .NET Framework, .NET, .NET Web API, Agile Development, Agile, ASP.NET, ASP.NET MVC 5, Backend, BitBucket, Git, GitHub, C#, Spring Boot, Spring Security, JavaScript, Vue.js, Visual Studio Code, IntelliJ IDEA, SQL, NoSQL, MongoDB, MariaDB, HTML, CSS, Rest API, JWT, Node.js, React.js, Docker Compose, Docker"
victor = "Victor is a Java / .Net and web developer who is eager to learn multiple languages ​​and frameworks. As a person, he is a team player who is always willing to help others. Victor has been described as a responsible, knowledgeable, dependable and someone who sees alot of pride in his work. Outside of work, Victor plays football and cook alot of different food from different cuisines, favorites are the Japanese and Italian cuisine. Furthermore Victor also have an interest in computer components and building computers."

In [68]:
#### 1. increase a words f-score dispersively

added_f_score_list = add_score_disperse(f_score_list,0.2)  

calculate_role(uppdrag_data_analyst,added_f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
data_analyst f-score,16.229385,29.29
data_scientist f-score,12.789605,23.08
data_engineer f-score,8.987659,16.22
fullstack f-score,7.001706,12.64
devops_engineer f-score,5.431268,9.8
ml_engineer f-score,4.974301,8.98


In [69]:
#### 2. Diminishing value when word is repeated
#Notice True at the end of calculate_role
calculate_role(uppdrag_data_analyst,f_score_list,True).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
data_analyst f-score,5.134841,27.64
data_scientist f-score,3.827942,20.61
fullstack f-score,2.798261,15.06
data_engineer f-score,2.633039,14.17
ml_engineer f-score,2.108334,11.35
devops_engineer f-score,2.074608,11.17


In [70]:
#### 3. Only keeping the n highest f-score words in each category.
max_word_count = 300
chosen_f_score_list = f_score_list
#### 1. increase a words f-score if it is the category with the highest score (only for specific cases)

f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list,max_word_count)

calculate_role(uppdrag_data_analyst,f_score_list_cut).sort_values(by='Percentage',ascending = False)



Unnamed: 0,Squared sum,Percentage
data_analyst f-score,8.127217,28.81
data_scientist f-score,7.144302,25.33
data_engineer f-score,4.663468,16.53
fullstack f-score,3.269696,11.59
ml_engineer f-score,2.560459,9.08
devops_engineer f-score,2.444318,8.66


#### Combine all the methods compare

In [71]:
#to compare
calculate_role(uppdrag_data_analyst,f_score_list).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
data_analyst f-score,8.503251,28.63
data_scientist f-score,7.3857,24.87
data_engineer f-score,4.978756,16.76
fullstack f-score,3.512615,11.83
ml_engineer f-score,2.703691,9.1
devops_engineer f-score,2.614912,8.8


### cut the f-score

In [72]:
max_word_count = 1000  #Amount of words in each category to be in the adding process
chosen_f_score_list = f_score_list

#Take the n highest f-score words of each category
f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)

### Test all the good combinations

In [73]:
#Add 0.4 score to highest, only 500 highest, diminishing repetition, not distinct

#1. Add score. 0.4 to highest, 0.4/2 to second, 0.4/4 to third
added_f_score_list = add_score_to_highest_f_score_with_size(f_score_list_cut,0.4)

#2. Only take highest 500 f-scoring words for each category after adding
max_word_count = 500  
chosen_f_score_list = added_f_score_list

f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)

# 3. Diminishing repetition True
calculate_role(uppdrag_data_analyst,f_score_list_cut,True).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
data_analyst f-score,12.405506,38.47
data_scientist f-score,5.908655,18.32
data_engineer f-score,4.653679,14.43
fullstack f-score,3.260625,10.11
ml_engineer f-score,3.169721,9.83
devops_engineer f-score,2.848656,8.83


In [74]:
#Add 1 score dispersively, only 500 highest, diminishing repetition, not distinct

#1. Add score dispersively
added_f_score_list = add_score_disperse(f_score_list_cut,1)

#2. Only take highest 500 f-scoring words for each category after adding
max_word_count = 500
chosen_f_score_list = added_f_score_list
f_score_list_cut = initialize_f_score_list_cut(chosen_f_score_list, max_word_count)

# Diminishing repetition True
calculate_role(uppdrag_data_analyst,f_score_list_cut,True).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared sum,Percentage
data_analyst f-score,84.201201,37.82
data_scientist f-score,35.312368,15.86
data_engineer f-score,33.463778,15.03
fullstack f-score,24.784415,11.13
devops_engineer f-score,22.560481,10.13
ml_engineer f-score,22.309557,10.02


In [75]:
#Add 0.4 score to highest, only 500 highest, diminishing repetition, distinct

#1. Add score. 0.4 to highest, 0.4/2 to second, 0.4/4 to third
added_f_score_list = add_score_to_highest_f_score_with_size(f_score_list_cut,0.4)

#2. Only take highest 500 f-scoring words for each category in a dictionary
max_word_count = 500
chosen_f_score_list = added_f_score_list
f_score_dict= initialize_f_score_dict(chosen_f_score_list,max_word_count)

# Diminishing repetition True Distinct
calculate_role_distinct(uppdrag_data_analyst,f_score_dict,True).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared Sum,Percentage
data_analyst f-score,140.81674,40.04
data_scientist f-score,59.038059,16.79
data_engineer f-score,53.755297,15.29
fullstack f-score,36.387432,10.35
devops_engineer f-score,31.843899,9.06
ml_engineer f-score,29.808041,8.48


In [76]:
#Add 1 score disperse, only 500 highest, diminishing repetition, distinct
#1. Add score Disperse
added_f_score_list = add_score_disperse(f_score_list_cut,1)

#2. Only take highest 500 f-scoring words for each category in a dictionary
max_word_count = 500
chosen_f_score_list = added_f_score_list
f_score_dict= initialize_f_score_dict(chosen_f_score_list,max_word_count)

# Diminishing repetition True Distinct
calculate_role_distinct(uppdrag_data_analyst,f_score_dict,True).sort_values(by='Percentage',ascending = False)

Unnamed: 0,Squared Sum,Percentage
data_analyst f-score,212.860931,39.32
data_scientist f-score,83.591447,15.44
data_engineer f-score,82.067731,15.16
fullstack f-score,60.317638,11.14
devops_engineer f-score,53.593776,9.9
ml_engineer f-score,48.963931,9.04


## Other interesting info

In [77]:
def compare_df_category_count(word_df_list):
    """Takes in a word list returns a dataframe with duplicate amount as 'size'"""
    all_df = pd.concat(word_df_list)
    grouped_df = all_df.groupby(all_df.index.tolist(),as_index=False).size().sort_values(by='size', ascending = False)
    return grouped_df

In [78]:
#Lets say we want to know if any new words appeared or disappeard after 1 month
f_score_list_new = f_score_list.drop('computer')

category_count_df = compare_df_category_count([f_score_list,f_score_list_new])

category_count_df

Unnamed: 0,index,size
0,0,2
3383,pharmaceuitical,2
3390,phishing,2
3389,philosophy,2
3388,philosophies,2
...,...,...
1691,examine,2
1690,exam,2
1689,evolving,2
1688,evolution,2


In [79]:
#If count = 1, then that means that it only exists in one of the DF, which means that it either appeared or disapeared
category_count_df[category_count_df['size'] == 1]


Unnamed: 0,index,size
952,computer,1


#### This function could also be used to compare category words

In [80]:
max_word_count = 50
chosen_f_score_list = f_score_list

fullstack_words = chosen_f_score_list.sort_values(by="fullstack f-score",ascending = False)[:max_word_count]
data_engineer_words = chosen_f_score_list.sort_values(by="data_engineer f-score",ascending = False)[:max_word_count]
data_analyst_words = chosen_f_score_list.sort_values(by="data_analyst f-score",ascending = False)[:max_word_count]
data_scientist_words = chosen_f_score_list.sort_values(by="data_scientist f-score",ascending = False)[:max_word_count]
ml_engineer_words = chosen_f_score_list.sort_values(by="ml_engineer f-score",ascending = False)[:max_word_count]
devops_engineer_words = chosen_f_score_list.sort_values(by="devops_engineer f-score",ascending = False)[:max_word_count]

category_count_df = compare_df_category_count([fullstack_words,data_engineer_words,
                                               data_analyst_words,data_scientist_words
                                               ,ml_engineer_words,devops_engineer_words])

#We can see here that data is a very common word(top 50) in 3 different categories
display(category_count_df)

Unnamed: 0,index,size
71,data,3
143,machine,2
65,continuous,2
243,spark,2
238,security,2
...,...,...
101,equal,1
102,equivalent,1
103,etl,1
104,excel,1


### Extra functions

In [81]:
def calculate(text,f_score,diminishing_repetition = False,category1='none',brightness = 1,category2 = 'none'):
    """calculate_role_visual and calculate_role_distinct_visual in one function"""
    if type(f_score) is pd.DataFrame:
        score_df = calculate_role_visual(text,f_score,diminishing_repetition,category1,brightness,category2)
    if type(f_score) is dict:
        score_df = calculate_role_distinct_visual(text,f_score,diminishing_repetition,category1,brightness,category2)
    return score_df

## Test on your own

#### test with one of the better f_score methods

In [82]:
max_word_count_adding = 600  #Amount of words in each category to be in the adding process
max_word_count = 300  # how many words in each category

#Take the n highest f-score words of each category to be in the adding process
f_score_list_cut = initialize_f_score_list_cut(f_score_list, max_word_count_adding)

#1. Add to highest
added_f_score_list = add_score_to_highest_f_score_with_size(f_score_list_cut,0.4)

#1. or add disperse
#added_f_score_list = add_score_disperse(all_df,1)

f_score_dict= initialize_f_score_dict(added_f_score_list,max_word_count)


#### now just replace the first argument in calculate_role_distinct_visual

In [83]:
uppdrag_fullstack = "Java Full stack developer 100 Remote 6 months Contract to Hire Can Spons JD Mandatory Java, JavaScript, React, Angular What You will Need Candidate possess a bachelorrsquos degree in Computer Science, or related field, or equivalent experience 7+ years of experience creating modern web experiences across devices Expert in creating customer experiences web portal using JavaScript, CSS and HTML Project experience using React, Angular, Vue.JS or similar frameworks andor libraries Working experience with distributed SCM (GitHub a plus), DevOps, AWS Experience integrating with lightweight middleware technologies, integration patterns, microservices Proficient demonstration of SQL knowledge Track record of taking ownership and driving results in a data-driven, fast-paced environment Excellent interpersonal and communication skills, strong analytical skills, and ability to deal with ambiguity in a rapidly evolving business environment Bonus Understanding of testing automation, including the building of scrappy codingautomation for testing code faster Experience in Retail, E-Commerce or Technology Industry, Merchandising Vendor Management focus a plus Strong engineering background building Productsprototypes that are E-Commerce scale Experience in Software Quality Processes and Atlassian Tools (Confluence, JIRA, etc.) Feel free to reach out to me for questions or clarifications."
uppdrag_data_engineer = "Design and implement data extraction process of data from existing systems - Update current data lake data stream design making them capable of handling new types of data streams - with subsequent implementation - Support development of graphical user presentation interface that uses the new data streams Description Core skills: Cloud technical development skills - Experience with AWS services like Glue, EC2, Lambda, S3, Redshift, PostgreSQL etc. - IT architecture (preferably AWS) - Experience with data harmonisation and data entity identification - Experience with data contextualization and visualisation – Preferably Tableau, Power BI - Azure DevOps -Python 3 - Document the technical architecture and the process flows Personal skills: Analytical mindset Can work independently but also collaboratively Visualisation of complex scenarios Can translate between business needs and technical requirements Self-paced Fluent in English"
uppdrag_data_analyst = "In this position you support Pleo’s Market Expansion and SMB Acquisition & Growth domains, and you are a member of the Data & Analytics competence group. SMB Acquisition & Growth Domain We contribute 70% of Pleo’s revenue, and we do that by wow-ing customers at every touchpoint, from our ads, to our website, to our sign up journey, to our renewals, to our engagement comms, in-product nudges, etc. We design for the end users, apply consumer approach to growth tactics, build for scalability, make the product accessible & fast, and push for organic growth. We are a super-cross-functional domain, with marketing, sales, customer success, product, etc. all housed under the same “virtual” roof, but a huge focus on experimentation and customer obsession. Market Expansion Domain Pleo will become the go-to spending solution for companies in the SMB segment across Europe, empowering employees by enabling a healthy spending culture. We are launching 15 new markets in 15 months as a way to launch simultaneous bets to supercharge Pleo’s hyper-growth. So What's Data & Analytics At Pleo Like? Join our community of 20+ talented data professionals working from more than 10 different locations worldwide with backgrounds in data analytics, analytical engineering and data engineering. Work with a modern data & analytics toolkit including Kafka, BigQuery, dbt, Looker, Fivetran, Segment, & Amplitude. A shared vision to stop the guessing game and unlock growth for Pleo. A lot of attention on data culture, strategy and empowerment of our stakeholders. Dedicated learning & development guidance and support for data professionals. Deal with a huge variety of data and insights spanning the entire business, from the detailed inner workings of the Pleo platform to web & app metrics, CRM details, financial and banking records, human resource information, and everything else needed to fuel Pleo’s decision making processes in a data inspired manner. What Great Looks Like In This Role You and your team are company-wide trusted experts for insights related to Market Expansion and SMB Acquisition & Growth domains. You successfully nurture a culture of data inspired decision making in alignment with the other Data & Analytics leaders. You are an ambassador for excellence within the Data & Analytics community, you lead by example and inspire others. You and your team empowered the Market Expansion and SMB Acquisition & Growth domains to derive descriptive and diagnostic insights autonomously, while you focus on predictive and prescriptive insights. You and your team are highly satisfied in terms of career development and your sense of belonging to the Market Expansion and SMB Acquisition & Growth domains and the Data & Analytics competence group. You solve problems through collaboration rather than control. Your Responsibilities Be the Data Analytics Competence Lead for up to 4 Data Analysts of Market Expansion and SMB Acquisition & Growth domains, providing career development and coaching Ensure Data & Analytics best practices and standards are identified, cultivated and followed in your competence team and domains, own the quality of work and provide ways of solving problems Act as multiplier for Data & Analytics inside the domains and facilitate the feedback loop to other Data & Analytics functions at Pleo Be accountable for your team’s delivery and quality of metrics and dashboards Drive a culture of experimentation and data-informed decision making across the entire customer lifecycle Facilitate activities to increase the level of data literacy within the domains Your Colleagues Say You Have excellent hands-on knowledge of SQL and experience with more advanced analytics topics such as cohort and regression analysis Are an expert in product analytics Are experienced with data visualisation tools such as Tableau, Looker or similar Communicate with clarity and empathy on insights and recommendations to cross-functional stakeholders for decision making Are an authentic leader and a role model for radical candour Have an eye for details and quality is more important for you than speed Show Me The Benefits Get your own Pleo card, which means full autonomy and no out-of-pocket spending Ability to work remotely (anywhere between the east coast of the Americas to European time zones)... ...or onsite if you want to (Copenhagen, London, Berlin, Stockholm, Madrid, Lisbon) Catered lunch in our offices or daily budget if you work remotely 25 days of annual holidays, on top of the standardised festive and bank-related ones, of course 2500€ per year as flex benefit (maybe you want to buy additional holidays, pay the gym, book a professional coach, pay part of your MBA, or finally get that pet you always dreamed of) Great parental leave: 100% paid, 24 weeks for primary caretakers & 8 weeks for secondary Loads of weird and wonderful niche communities to join in the company Trips abroad for team camps and fun Wild enthusiasm and encouragement from us if you want to host MeetUps, events, etc - we'll help (venue, food etc)"
uppdrag_data_scientist = "The Platform team creates the technology that enables Spotify to learn quickly and scale easily, enabling rapid growth in our users and our business around the globe. Spanning many disciplines, we work to make the business work; creating the frameworks, capabilities and tools needed to welcome a billion customers. Join us and help to amplify productivity, quality and innovation across Spotify. At Client Platform, a part of the Platform Mission, we are passionate about amplifying productivity, quality and innovation across all client developers at Spotify. Client Platform strives to bring a great experience for client developers at Spotify, and through this deliver stable and reliable products for people to enjoy. We are looking for a Lead Data Scientist that will study the behavior of client developers at Spotify, help evolve our product strategy, drive and own work for our critical initiatives, and bring data and insights into high impact collaborations. Spotify is a fast paced company that believes that every decision should be data-informed and every feature be fueled with data. As a Lead Data Scientist you will be working independently in one of three Product Areas, and together with other Data Science Leads, you will be part of a team bridging insights work across our Tribe. What You’ll Do Lead insights work and establish collaboration with cross-functional roles, e.g. product managers, engineers, designers Partner with Product, Design and Tech leads to determine goals and priorities, as well as empowering them with data through the decision making process Define metrics, build dashboards, create reports and key datasets to empower data-informed product development Communicate insights and recommendations to key partners, helping activate data best practices in the Client Platform teams Perform exploratory analysis to understand who our users are, how they get value out of our offering and where we can further develop our product to bring greater value Who You Are 5+ years of working experience, with a degree in statistics, mathematics, computer science, engineering, economics or any other quantitative field A communicative person who values building strong relationships with colleagues and partners, you also enjoy mentoring and guiding others Able to navigate loosely defined problems, as well as coming up with impactful and actionable insights Have understanding of how to instrument products to accurately collect user and system behaviors through data, thus offering a wide variety of insights and product development use cases Skilled in advanced analytics, and you possess statistical competence (such as regression modeling and significance testing) Hands-on experience synthesizing insights from data using tools such as Python, R, BigQuery, SQL, Tableau Preferably have some level of leadership and management experience, as well as strong project management skills Where you'll be For this role, it can be within the EMEA region in which we have a work location and is within working hours. Prefer an office to work from home instead? Not a problem! We have plenty of options for your working preferences. Find more information about our Work From Anywhere options here . Spotify is an equal opportunity employer. You are welcome at Spotify for who you are, no matter where you come from, what you look like, or what’s playing in your headphones. Our platform is for everyone, and so is our workplace. The more voices we have represented and amplified in our business, the more we will all thrive, contribute, and be forward-thinking! So bring us your personal experience, your perspectives, and your background. It’s in our differences that we will find the power to keep revolutionizing the way the world listens. Spotify transformed music listening forever when we launched in 2008. Our mission is to unlock the potential of human creativity by giving a million creative artists the opportunity to live off their art and billions of fans the chance to enjoy and be passionate about these creators. Everything we do is driven by our love for music and podcasting. Today, we are the world’s most popular audio streaming subscription service. Global COVID and Vaccination Disclosure Spotify is committed to safety and well-being of our employees, vendors and clients. We are following regional guidelines mandating vaccination and testing requirements, including those requiring vaccinations and testing for in-person roles and event attendance. For the US, we have mandated that all employees and contractors be fully vaccinated in order to work in our offices and externally with any third-parties. For all other locations, we strongly encourage our employees to get vaccinated and also follow local COVID and safety protocols."
uppdrag_ml_engineer = "What if your job had an impact on shaping the future of urban mobility? Imagine your experiments and analysis improving sustainable last-mile transportation for cities all over Europe. Imagine changing an industry with your team's latest products. YOUR MISSION AT VOI At Voi, we are committed to make the 15 minute cities a reality and we do it by setting our riders and cities first in all aspects of development. We are looking for an experienced Machine Learning Engineer to join our Operations team, where we ensure our Vois are ready to meet the mobility needs in our cities. Here you'll be able to impact fleet efficiency and enable more users to have an excellent ride experience. Our team identifies algorithmically the mobility needs in the field and suggests appropriate actions, such as: Anticipating battery levels and keeping our scooters ready for the next ride Ensuring that users will have Vois available close by whenever they'll need one Identifying parking that is less than ideal and deciding incentives YOUR TEAM AT VOI You’ll be joining a small, skilled and motivated team with a high degree of autonomy. You will have the opportunity to be part of forming the team and influencing its core culture. Voi is a hybrid workplace that operates on trust and freedom, but we also love to hang out in person! We are innovative and curious team members, keen on learning and growing, which is why we make sure to dedicate time to it every sprint. We welcome diverse ideas and continuously aim for a workplace that feels like home. What You’ll Be Working On Driving ML projects end to end - starting from a practical goal, creating a prototype, implementing it into production and measuring the business impact Developing production grade ML ensuring reliability and scalability Building tailor made machine learning models to improve fleet utilization, uptime and user experience Make improvements to all modules in our ongoing projects What You’ll Need To Embark Have 3+ years of experience in working with Machine Learning Have a solid technical and academic background with a MSc or similar Proven experience in building end-to-end machine learning in production Previous experience in a fast moving organization, approaching complex problems with iterative pragmatic solutions Eagerness to learn and contribute to the team's perpetual technical growth Knowledge of tooling related to: cloud solutions (we use GCP), orchestration (e.g. we use Airflow and Prefect) and data processing (e.g. dbt and Snowflake) is meriting Previous experience in forecasting, predictive modeling, optimization and geospatial techniques are meriting Are able to work from Stockholm a few days per quarter Professional working proficiency in English, Swedish is not required"
uppdrag_devops_engineer = "Our ambition is to amplify all business processes in H&M using AI and Advanced Analytics by 2025. This means we need to improve our existing platforms or build novel platforms to efficiently and cost-effectively prepare and serve hundreds of millions of AI modeling features train, manage and serve tens of thousands of AI models run thousands of AB tests deploy hundreds of exploration/dev/test/prod environments integrate with key IT systems supporting critical business processes As a Senior Devops Engineer in the AI Foundation, your mission is to contribute to these challenging, yet rewarding causes which will enable us to roll out scalable and production-ready AI and Advanced Analytics software. You will be part of an agile team that not only takes the responsibility of delivering such platforms but also ensures the success of the stakeholders through ensuring the best experience. In addition, you will be part of guilds that establish and promote best practices throughout the organization by adopting inner source culture that champions collaborations, reusable codes, and knowledge sharing. Qualifications Who you are? You feel that our AI and Advanced Analytics ambitions are very exciting. You have at least 5+ years of experience in a similar role. Automation is a part of your DNA, you are passionate about building zero touch systems. You are experienced in managing infrastructure with public cloud providers (Azure or Google cloud is preferred). Working with CI/CD pipelines is your specialty. You have a good understanding of Infrastructure as Code tools, such as Terraform, ARM, Ansible. You have experiences in setting up systems for better observability, such as logging, monitoring, alerting, etc. You have experiences with Docker, Kubernetes and micro service management systems. You believe in knowledge sharing and upskilling. You are a role model for the team not just for your technical skills, but also for adopting agile ways of working. You bear a team-first mentality and truly believe in striving in diversity. You have good communication skills, preferably in English, both verbal and written. You are not shy in presenting your work."

In [84]:
#Feel free to change anything. Check docstring or past usage for the function if any confusion

calculate_role_distinct_visual( text = uppdrag_data_engineer,
                               f_score_dict = f_score_dict,
                               diminishing_repetition = True,
                               category1='data_analyst',
                               brightness=3,
                               category2='data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;255;0;0mimplement [38;2;255;255;255m[38;2;0;150;0mdata [38;2;255;255;255m[38;2;0;150;0mextraction [38;2;255;255;255m[38;2;246;0;0mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;150;0mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;0;150;0msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;0;150;0mdata [38;2;255;255;255m[38;2;0;0;255mlake [38;2;255;255;255m[38;2;0;150;0mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;255mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;0;150;0mtypes [38;2;2

Unnamed: 0,implement,process,graphical,services,tableau,power,personal,analytical,business,lake,stream,handling,glue,ec2,lambda,s3,redshift,document,flows
data_analyst f-score,0.426394,0.322324,0.489409,0.473585,0.513527,0.409948,0.342763,0.454511,0.323209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.489509,0.709414,0.528526,0.571368,0.410105,0.589867,0.364978,0.874155,0.331908,0.493692


Unnamed: 0,Squared Sum,Percentage
data_engineer f-score,3.116067,38.06
data_analyst f-score,1.661752,20.3
data_scientist f-score,1.568955,19.17
devops_engineer f-score,1.516011,18.52
ml_engineer f-score,0.323598,3.95
fullstack f-score,0.0,0.0


### Test not-distinct

In [85]:
max_word_count_adding = 600  #Amount of words in each category to be in the adding process
max_word_count = 300  # how many words in each category

#Take the n highest f-score words of each category to be in the adding process
f_score_list_cut = initialize_f_score_list_cut(f_score_list, max_word_count_adding)

#1. Add to highest
added_f_score_list = add_score_to_highest_f_score_with_size(f_score_list_cut,0.4)

#1. or add disperse
#added_f_score_list = add_score_disperse(all_df,1)

f_score_list_cut= initialize_f_score_list_cut(added_f_score_list,max_word_count)

In [86]:
#Feel free to change anything. Check docstring or past usage for the function if any confusion

calculate_role_visual( text = uppdrag_data_engineer,
                               f_score_list = f_score_list_cut,
                               diminishing_repetition = True,
                               category1='data_analyst',
                               brightness=3,
                               category2='data_engineer').sort_values(by='Percentage',ascending = False)

[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mand [38;2;255;255;255m[38;2;255;0;22mimplement [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;29mextraction [38;2;255;255;255m[38;2;246;0;68mprocess [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;150;0mfrom [38;2;255;255;255m[38;2;0;150;0mexisting [38;2;255;255;255m[38;2;0;150;0msystems [38;2;255;255;255m[38;2;0;150;0mupdate [38;2;255;255;255m[38;2;0;150;0mcurrent [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mlake [38;2;255;255;255m[38;2;226;0;220mdata [38;2;255;255;255m[38;2;0;0;255mstream [38;2;255;255;255m[38;2;0;150;0mdesign [38;2;255;255;255m[38;2;0;150;0mmaking [38;2;255;255;255m[38;2;0;150;0mthem [38;2;255;255;255m[38;2;0;150;0mcapable [38;2;255;255;255m[38;2;0;150;0mof [38;2;255;255;255m[38;2;0;0;255mhandling [38;2;255;255;255m[38;2;0;150;0mnew [38;2;255;255;255m[38;2;26;0;54mtypes

Unnamed: 0,implement,data,process,types,graphical,presentation,technical,aws,services,redshift,tableau,power,azure,python,document,personal,analytical,mindset,complex,translate,business,paced,extraction,lake,stream,handling,glue,ec2,lambda,s3,devops,flows
data_analyst f-score,0.426394,0.29616,0.322324,0.03407,0.489409,0.097472,0.119588,0.005682,0.473585,0.044139,0.513527,0.409948,0.031612,0.103102,0.095704,0.342763,0.454511,0.03295,0.201554,0.037938,0.323209,0.159048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
data_engineer f-score,0.029142,0.288678,0.089243,0.070599,0.0,0.039782,0.147587,0.217189,0.208131,0.874155,0.08621,0.061883,0.176723,0.173535,0.331908,0.090315,0.073116,0.0,0.092482,0.0,0.204592,0.060881,0.037973,0.489509,0.709414,0.528526,0.571368,0.410105,0.589867,0.364978,0.044622,0.493692


Unnamed: 0,Squared sum,Percentage
data_engineer f-score,3.668851,37.19
data_analyst f-score,2.037993,20.66
data_scientist f-score,1.847552,18.73
devops_engineer f-score,1.725743,17.5
ml_engineer f-score,0.443058,4.49
fullstack f-score,0.140983,1.43


In [94]:
import copy

In [88]:
import tensorflow as tf

In [89]:
tf.keras

<module 'keras.api._v2.keras' from 'C:\\Users\\Tony\\anaconda4\\lib\\site-packages\\keras\\api\\_v2\\keras\\__init__.py'>

In [91]:
class Neuron:
    def __init__(self,nbrOfInputs):
        self.w = [];
        self.inputs = [];
        self.errorGrad = 0;
        for i in range(0,nbrOfInputs):
            self.w.append(np.random.uniform(-1,1));
        self.b = np.random.uniform(-1,1);
    def updateWeights(self,ts):
        for data in ts:
            error = data[1] - self.calcOutput(data[0]);
            #print("\tWeights = {}".format(self.w));
            #print("\tBias = %f" % self.b);
            #print("\tERROR = %f" % error);
            for i in range(0,len(self.w)):
                self.w[i] = data[0][i] * error + self.w[i];
            self.b += error;
    def calcOutput(self,x,actFunc = 0):
        self.inputs = x;
        if not isinstance(self.inputs, (list,)):
            self.inputs =[self.inputs]
        #return 1 if self.dotBias(x) > 0 else 0;
        return sigmoid(self.dotBias(x)) if actFunc == 0 else lReLU(self.dotBias(x));
    def dotBias(self,x):
        return np.dot(x,self.w)+self.b;
    def predict(self,x):
        return self.calcOutput(x);

    
class Layer:
    def __init__(self,numInputs,numNeurons,hiddenLayer=0):
        self.neuronList = [];
        self.numNeurons = numNeurons;
        self.outputs = [];
        self.hiddenLayer = hiddenLayer;
        for i in range(0,numNeurons):
            self.neuronList.append(Neuron(numInputs));
    def calcOutputs(self,inputs):
        self.outputs = [];
        for i in range(0,len(self.neuronList)):
            self.outputs.append(self.neuronList[i].calcOutput(inputs,self.hiddenLayer));
        return self.outputs;
    
    
class NeuralNetwork: # With bias node, different activationfunc, adaptive alpha
    def __init__(self,numInputNeurons,numHiddenLayers,numNeuronsPerHiddenLayer,numOutputNeurons, alpha):
        self.layers = [];
        self.numHiddenLayers = numHiddenLayers;
        self.alpha = alpha;
        if(numHiddenLayers > 0):
            self.layers.append(Layer(numInputNeurons+1,numNeuronsPerHiddenLayer,1));
            for i in range(0,numHiddenLayers-1):
                self.layers.append(Layer(self.layers[i].numNeurons+1,numNeuronsPerHiddenLayer,1));
            self.layers.append(Layer(numNeuronsPerHiddenLayer+1,numOutputNeurons,0));
        else:
            self.layers.append(Layer(numInputNeurons+1,numOutputNeurons,0));
    def calcOutputs(self,inputs):
        copyOfInputs = copy.copy(inputs);
        if not isinstance(copyOfInputs, (list,)):
            #print("CPI: {}".format(copyOfInputs))
            copyOfInputs = [copyOfInputs];
        copyOfInputs.append(1);
        #print(copyOfInputs)
        #print("layer 0: {}".format(
        self.layers[0].calcOutputs(copyOfInputs);
        #print(self.layers[0].outputs)
        for layerNbr in range(1,self.numHiddenLayers+1):
            biasedInputs = copy.copy(self.layers[layerNbr-1].outputs);
            biasedInputs.append(1);
            #print(biasedInputs)
            #print("layer {}: {}".format(layerNbr,
            self.layers[layerNbr].calcOutputs(biasedInputs);
        return self.layers[len(self.layers)-1].outputs;
    def updateWeights(self,ts,printResult = False):
        prevSSE = 9999;
        prevLayers = copy.copy(self.layers);
        ts =[ts]
        for data in ts:
            SSE = 0.0;
            #print("data: {}".format(data))
            if not isinstance(data[0], (list,)):
                data[0] =[data[0]]
            if not isinstance(data[1], (list,)):
                data[1] =[data[1]]
            calculatedOutputs = self.calcOutputs(data[0]);############## Calc OUTPUT              
            desiredOutputs = data[1];
            
            ############################################################ Error Backpropagation
            for layerNbr in range(self.numHiddenLayers,-1,-1):
                #if printResult == True: print("layer {}".format(layerNbr));
                for neuronNbr in range(0,len(self.layers[layerNbr].neuronList)):
                    if layerNbr == self.numHiddenLayers:
                        error = desiredOutputs[neuronNbr] - calculatedOutputs[neuronNbr];
                        SSE = SSE + error**2
                        if printResult == True: print("\tError: %f" %error)
                        self.layers[layerNbr].neuronList[neuronNbr].errorGrad = dSigmoid(calculatedOutputs[neuronNbr])*error; #Derivata av sigmoid funktionen
                    else:
                        self.layers[layerNbr].neuronList[neuronNbr].errorGrad = dSigmoid(self.layers[layerNbr].outputs[neuronNbr]);
                        errorGradSum = 0;
                        for neuronNext in range(0,len(self.layers[layerNbr+1].neuronList)):
                            errorGradSum += self.layers[layerNbr+1].neuronList[neuronNext].w[neuronNbr] * self.layers[layerNbr+1].neuronList[neuronNext].errorGrad;
                        self.layers[layerNbr].neuronList[neuronNbr].errorGrad *= errorGradSum;
                    #    if printResult == True: print("\tNeuron {}: EGS: {}".format(neuronNbr,errorGradSum));
                    #if printResult == True: print("\t\tinputs: {}".format(self.layers[layerNbr].neuronList[neuronNbr].inputs))
                    #if printResult == True: print("\t\tWeights: {}".format(self.layers[layerNbr].neuronList[neuronNbr].w))
                    for inputNbr in range(0,len(self.layers[layerNbr].neuronList[neuronNbr].inputs)):
                        if layerNbr == self.numHiddenLayers:
                            #error = self.layers[layerNbr].neuronList[neuronNbr].errorGrad
                            error = desiredOutputs[neuronNbr] - calculatedOutputs[neuronNbr];
                            self.layers[layerNbr].neuronList[neuronNbr].w[inputNbr] += self.alpha * self.layers[layerNbr].neuronList[neuronNbr].inputs[inputNbr] * error 
                        else: 
                            #print(self.layers[layerNbr].neuronList[neuronNbr].inputs[inputNbr])
                            self.layers[layerNbr].neuronList[neuronNbr].w[inputNbr] += self.alpha * self.layers[layerNbr].neuronList[neuronNbr].inputs[inputNbr] * self.layers[layerNbr].neuronList[neuronNbr].errorGrad;
                        
                    self.layers[layerNbr].neuronList[neuronNbr].b += self.alpha * self.layers[layerNbr].neuronList[neuronNbr].errorGrad;
            
            
            #print(self.alpha) #ADAPTIVE ALPHA
            #if SSE > prevSSE:
                
             #   self.layers = copy.copy(prevLayers);
             #   if(self.alpha> 0.01):
             #       self.alpha -= 0.001
            #else:
             #   prevLayers = copy.copy(self.layers);
              #  prevSSE = copy.copy(SSE);
              #  if(self.alpha < 0.95):
              #      self.alpha += 0.001
        #print("\tSSE: %f" %SSE)
        return SSE;
    def predict(self,inputs):
        return self.calcOutputs(inputs);
    

def sigmoid(value):
    return (1/ (1 + np.exp(-value)));
def dSigmoid(value):
    return value * (1-value);
def lReLU(x):   # YOU CANT DO BACKWARDS PROPAGATION (ERRORGRAD) ON RELU; SINCE ITS A LINE so use dsigmoid instead for backpropagation(yes you can use that one even though you used this one)
    return x if x>0 else 0.01*x;
def dlReLU(x):
    return 1 if x>0 else 0.01;
def tanh(x):
    return (2/(1+np.exp(-2*x)) -1)

def train(NN, ts,epochs):
    prevSSE = 9999;
    for i in range(0,epochs):
        print("Epoch: %d"% (i+1));
        NN.updateWeights(ts,False)
    print("Epoch: %d" % (epochs+1));
    NN.updateWeights(ts,True);
    
def printNN(nn):
    layerNbr = 0;
    neuronNbr = 0;
    weightNbr = 0;
    for layer in nn.layers:
        #print("layer %d:" % (layerNbr) + "{}".format(layer.outputs));
        for neuron in layer.neuronList:
            #print("\tneuron %d[ErrorGrad:%f]:" % (neuronNbr,neuron.errorGrad));
            for weight in neuron.w:
                #print("\t\tweight %d:" % weightNbr);
                print("\t\t\t%f" % weight);
                weightNbr+=1;
            weightNbr=0;
            neuronNbr+=1;
        neuronNbr = 0;
        layerNbr +=1;

In [92]:
NN = NeuralNetwork(2,1,2,1,0.8)

In [103]:
NN.predict([1000,100])

[1.0]

### References

In [90]:
https://github.com/JasonKessler/scattertext/blob/master/README.md


SyntaxError: invalid syntax (Temp/ipykernel_1144/3973760709.py, line 1)