In [1]:
import pandas as pd
import numpy as np
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import gensim.downloader as api


[nltk_data] Downloading package stopwords to /Users/alex/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/alex/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
dataset = pd.read_csv('data/jobs_energy.csv', delimiter=';', names=['job_id', 'company_name', 'job_title', 'job_location', 'job_description_text', 'language', 'job_url'], dtype={'job_id':int})

# Some job descriptions are in languages other than englisg, we will remove them
dataset = dataset[dataset['language'] == 'en']
print("Dataset size after removing non-english jobs: ", dataset.shape)
print("Dataset columns: ", dataset.columns)
# Keep only needed columns
dataset = dataset[['company_name', 'job_title', 'job_description_text', 'job_url', 'job_location']]

# Remove duplicates
dataset = dataset.drop_duplicates(subset=['job_description_text'])
print("Dataset size after removing duplicates: ", dataset.shape)

# Drop rows with less than 100 characters
dataset = dataset[dataset['job_description_text'].str.len() > 100]
print("Dataset size after removing short job descriptions: ", dataset.shape)

# Remove rows whose description does not contain the word energy or a related word
# dataset = dataset[dataset['job_description_text'].str.contains(r'(nerg|renewable)', flags=re.IGNORECASE, regex=True, na=False)]
# print("Dataset size after removing non-energy jobs: ", dataset.shape)


Dataset size after removing non-english jobs:  (2877, 7)
Dataset columns:  Index(['job_id', 'company_name', 'job_title', 'job_location',
       'job_description_text', 'language', 'job_url'],
      dtype='object')
Dataset size after removing duplicates:  (2626, 5)
Dataset size after removing short job descriptions:  (2622, 5)


In [3]:
def preprocess_text(text, lemmatize=True):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', ' ', text)
    
    # Some pair of words are written together, we will split them
    text = re.sub(r'([a-z]+)([A-Z])', r'\1 \2', text)
    
    # Lowercase
    text = text.lower()

    # Remove stop words and words with less than 3 characters
    stop_words = set(stopwords.words('english'))
    extra = {'experience', 'work', 'working', 'show', 'skills', 'skilled', 'role', 'technical', 'support', 'client', 'clients'
            'less', 'team', 'ability', 'knowledge', 'new', 'company', 'within', 'solution', 'solutions'
            'including', 'year', 'years', 'looking', 'opportunity', 'including','part', 'join',
            'people', 'tool', 'job', 'jobs', 'worked', 'requirement', 'long', 'term'
            'related', 'global', 'sexual', 'orientation', 'equal', 'employer', 'gender', 'identity'}
    stop_words = stop_words.union(extra)
    text = ' '.join([word for word in text.split() if word not in (stop_words)])

    # Lemmatize
    if lemmatize:
        lemmatizer = WordNetLemmatizer()
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

    return text

def find_total_words(text):
    return len(text.split())

In [4]:
df = dataset.copy()
unlemmatized_corpus = dataset['job_description_text'].apply(preprocess_text, lemmatize=False)
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)
print("Jobs before removing empty descriptions: ", df.shape)

# Remove empty descriptions
df = df.dropna(subset=['job_description_text'])
unlemmatized_corpus = unlemmatized_corpus.dropna()
print("Jobs after removing empty descriptions: ", df.shape)

Jobs before removing empty descriptions:  (2622, 5)
Jobs after removing empty descriptions:  (2622, 5)


In [5]:
# Analyze job descriptions
corpus = " ".join(df['job_description_text'])

# Total number of words
total_words = find_total_words(corpus)
print("Total number of words in the corpus: ", total_words)

# Total number of unique words
unique_words = len(set(corpus.split()))
print("Total number of unique words in the corpus: ", unique_words)

tokens = word_tokenize(corpus)
fdist = nltk.FreqDist(tokens)
print("Most common words in the corpus: ", fdist.most_common(30))

bgs = nltk.bigrams(tokens)
fdist = nltk.FreqDist(bgs)
print("Most common bigrams in the corpus: ", fdist.most_common(30))

Total number of words in the corpus:  788873
Total number of unique words in the corpus:  24025
Most common words in the corpus:  [('data', 9320), ('energy', 7714), ('business', 6022), ('project', 5962), ('development', 3710), ('market', 3577), ('engineering', 3541), ('management', 3490), ('system', 3313), ('process', 3235), ('analysis', 3192), ('service', 2923), ('customer', 2824), ('le', 2739), ('environment', 2719), ('solution', 2691), ('technology', 2679), ('u', 2550), ('product', 2548), ('strong', 2516), ('industry', 2352), ('requirement', 2277), ('power', 2228), ('application', 2226), ('design', 2195), ('analyst', 2129), ('position', 2103), ('engineer', 2055), ('responsibility', 2042), ('degree', 1957)]
Most common bigrams in the corpus:  [(('renewable', 'energy'), 708), (('data', 'analyst'), 661), (('e', 'g'), 659), (('data', 'analysis'), 642), (('bachelor', 'degree'), 606), (('problem', 'solving'), 574), (('master', 'degree'), 491), (('business', 'analyst'), 485), (('project', 

In [6]:
# see what words are most common in the job descriptions
vectorizer = TfidfVectorizer(use_idf=True)
tf_idf_matrix = vectorizer.fit_transform(df['job_description_text'])
print(tf_idf_matrix.shape)

(2622, 23985)


In [7]:
# total words vs unique words
total = sum(df['job_description_text'].apply(find_total_words))
unique = len(vectorizer.get_feature_names_out())

print("Total words: ", total)
print("Unique words: ", unique)
print("Ratio of unique words to total words: ", unique / total)


Total words:  788873
Unique words:  23985
Ratio of unique words to total words:  0.03040413349170272


In [8]:
from IPython.display import Markdown, display

def job_recommendation(job_description, corpus, n=5):
    job_description = preprocess_text(job_description)
    print(job_description)
    # Create a tf-idf matrix for the job description
    job_description_matrix = vectorizer.transform([job_description])
    
    # Calculate cosine similarity between the job description and the corpus
    cosine_similarities = cosine_similarity(job_description_matrix, corpus).flatten()
    
    # Get the indices of the n most similar job descriptions
    most_similar = cosine_similarities.argsort()[-n:][::-1]

    display(Markdown("### Most similar jobs:"))
    for i in most_similar:
        display(Markdown("#### " + dataset.iloc[i]['job_title'] + " at " + dataset.iloc[i]['company_name'] + ", " + dataset.iloc[i]['job_location'] + ", " + dataset.iloc[i]['job_url'] + " (Cosine similarity: " + str(cosine_similarities[i]) + ")" ))
        display(Markdown(dataset.iloc[i]['job_description_text']))
        display(Markdown("___"))

    return most_similar


In [9]:
job_recommendation("We are looking for an energy market analyst with experience in Hydro power and monte carlo simulation", tf_idf_matrix, 5)

energy market analyst hydro power monte carlo simulation


### Most similar jobs:

#### Investor Relations Officer at Norsk Hydro, Norway, https://www.linkedin.com/jobs/view/3828185205 (Cosine similarity: 0.22214218232237803)

Investor Relations (IR) is responsible for Hydro's contact with analysts, investors and stock exchanges, with the aim to build and maintain Hydroâ€™s strong credibility in the financial markets. Based in Oslo, the IR team is a part of the CFO organization and consists of three people working closely with Hydro's senior management.We are looking for a proactive person with business and financial understanding, strong attention to detail, and an ability to thrive and contribute efficiently in periods of high workload. We need you to support on key IR analyses and deliveries, while also supporting on our teamâ€™s logistics and administration on a day to day basis.We are offering an exciting opportunity to gain a broad understanding of Hydroâ€™s total value chain and strategy, through close cooperation with all the business areas in Hydro, as well as the perspectives of the financial markets. The position will work closely with the members of the Corporate Management Board. For the right candidate, this position offers an attractive platform for personal growth and development in areas such as finance, communication, strategy and M&A. Hydroâ€™s strategy toward 2030 focuses on pioneering the green aluminium transition, powered by renewable energy.Areas of responsibility Analyze the financial market's view of the company and peers and convey this view to internal stakeholders.Interact, run analyses and prepare input in response to investor and analyst questionnaires.Maintain and improve internal IR financial models and tools.Contribute on presentations towards investors, analysts and internally, including presentations for quarterly releases and Capital Markets Day Ensure compliance with relevant exchanges and agencies, owning relationship to stakeholders in these organizations.Support the development of press releases and stock exchange announcements.Contribute to investor targeting and company outreach.Manage or support Hydroâ€™s calendar of investor conferences and roadshows and coordinate participation with corporate management.Manage or support IR team calendar of key meetings and deliveries.Manage logistics for main IR events, such as quarterly releases and Capital Markets Day.Qualifications Masterâ€™s degree, preferably within economics or finance 1-3 years of relevant post-university work experience, ideally in a structured analytical or financial role. We will consider both newly graduated and more experienced candidates.Experience with modelling and financial valuation, investment banking, investor relations, consulting, or equity analyses is seen as an advantage.Strong understanding of and interest in capital markets and finance/accounting Strong analytical and problem-solving skills Strong administrative and organizational skills, and the ability to coordinate multiple deliveries simultaneously.Strong communication and storytelling skills, comfortable presenting analytical findings and strategic topics Advanced in Microsoft Office package, primarily Power Point and Excel You work independently and thrive in a role that requires both financial analysis and organizational skills.You provide accurate and timely information and service to a wide range of internal and external stakeholders. You can develop a network internally and externally, are curious, service minded, collaborative, and have the drive and willingness to learn.You are motivated to work in a hectic and demanding environment.High level of written and oral English is required.Additional Information Some travel is expected.Please enclose a cover letter and copies of relevant academic transcripts, diplomas, certificates etc. Only applications received through our online system will be considered, not via e-mail.As part of our recruitment process, we use Semac for background check.Apply before: 03.03.2024If you have any questions, please contact:Hiring manager Martine Ramboel Hagen Email: Martine.Rambol.Hagen@hydro.com Phone: (Norway +47) 91708918Recruiter Birgitta Sanden Email:Birgitta.Sanden@hydro.com Phone:(Norway +47) 97031556                    Show more                          Show less

___

#### Advisory Energy Forecasting Consultant at Inceed, United States, https://www.linkedin.com/jobs/view/3597724975 (Cosine similarity: 0.15853867461467955)

Compensation: $120,000-$150,000Location: Remote, TexasAdvisory Energy Forecasting Consultant :Inceed has partnered with a global leader in the electrical power grids and software industry to help find a skilled Advisory Energy Forecasting Consultant to join their team!We are seeking an experienced Advisory Energy Forecasting Consultant with expertise in production cost modeling within the NYISO and ISO New England markets. The successful candidate will provide expert guidance to clients on the most effective energy strategies to optimize their operations and reduce costs.Responsibilities:Develop and execute production cost models for clients operating within the NYISO and ISO New England markets Analyze market trends and provide insights into energy price movements Develop and recommend energy strategies to clients based on market conditions and analysis Monitor and evaluate the effectiveness of energy strategies implemented by clients Conduct research and analysis of energy markets, technologies and regulatory policies Build and maintain relationships with clients to ensure continued satisfaction with services provided Provide training and support to clients on energy market fundamentals and production cost modeling techniques Required Qualifications & Experience:Bachelor's degree in Engineering, Economics, Business or related field 5+ years of experience in energy consulting with a focus on production cost modeling and capacity expansion planning required. Strong knowledge of NYISO and ISO New England energy markets Proficient in the use of production cost modeling tools, such as PROMOD, PLEXOS, Monte Carlo, GE Maps, or similar tool is required. Excellent analytical, problem-solving and decision-making skills Strong communication and interpersonal skills with the ability to build and maintain relationships with clients Ability to work independently and in a team environment Must have experience with either Zonal or Nodal modeling/forecasting is required. Flexibility to travel as needed Nice to Have Skills & Experience:Graduate degree and/or PhD preferred, not required Experience as an economics or financial modeling consultant is a plus. Prior experience as an Electrical Power Systems Engineer or Software Engineer is nice to have. Perks & Benefits: Position is fully remote! Long term, stable career opportunity, with plans for global growth across the company! Competitive base, bonus, and benefits plans! If you are interested in learning more about the Advisory Energy Forecasting Consultant opportunity, please submit your resume for consideration.We are Inceed, a staffing and direct placement firm who believes in the possibility of something better. Our mission is simple: Weâ€™re here to help every person, whether client, candidate, or employee, find and secure whatâ€™s better for them. Inceed is an equal opportunity employer. Inceed prohibits discrimination and harassment of any type and affords equal employment opportunities to employees and applicants without regard to race, color, religion, sex, sexual orientation, gender identity or expression, pregnancy, age, national origin, disability status, genetic information, protected veteran status, or any other characteristic protected by law.           Show more                     Show less

___

#### Chief of Staff - Renewable Energy Software at HYDROGRID, Austria, https://www.linkedin.com/jobs/view/3815511739 (Cosine similarity: 0.15156992552842882)

About us  We are a fast-growing B2B SaaS company dedicated to empowering Hydro Operators to maximise both their revenue and total (renewable) energy production - making hydro a power for the future and the backbone of the zero carbon economy. Join us in our mission to make hydro a power for the future!  Your mission  Strategic Oversight:  Collaborate closely with the CEO and executive team to develop and execute strategic initiatives. Conduct research and analysis to inform decision-making.   Project Management:  Lead and manage key projects from inception to completion, ensuring deadlines are met and objectives are achieved. Streamline processes and workflows to optimize operational efficiency.   Communication and Coordination:.  	 Coordinate and facilitate cross-functional meetings, ensuring key decisions are documented and implemented.   Problem Solving:  Anticipate challenges and proactively address them, finding creative solutions to complex issues.   Leadership Support:  Provide direct support to the CEO and executive team in day-to-day activities. Assist in preparing materials for presentations, meetings, and reports.   Data Analysis:  Utilize data-driven insights to inform decision-making and identify opportunities for improvement. Track and analyze key performance indicators to assess the success of strategic initiatives.   Culture and Team Building:  Assist the executive team with recruitment and Team Building. Collaborate with the executive team to implement initiatives that enhance team cohesion and employee satisfaction.     Your profile  Proven Experience:  Minimum of 4 years experience, with at least 2 years in a similar role, preferably in a startup or other fast paced environment. Demonstrated success in project management and strategic planning.   Strategic Thinking:  Strong analytical and problem-solving skills with the ability to think strategically and act tactically.   Communication Skills:  Excellent written and verbal communication skills in both English andGerman, with the ability to articulate complex ideas clearly and concisely.   Leadership:  Proven ability to lead and motivate teams, fostering a collaborative and results-driven environment.   Adaptability:  Comfortable working in a fast-paced, evolving startup environment. Ability to adapt to changing priorities and navigate ambiguity.   Initiative:  Self-starter with a proactive approach to identifying and addressing challenges. Willingness to take on new responsibilities and go above and beyond to achieve goals.   Tech-Savvy:  Proficiency in using various software and tools for project management, communication, and data analysis.   Team Player:  Ability to work collaboratively with diverse teams and stakeholders. Demonstrated commitment to fostering a positive and inclusive workplace culture.     Why us?  A clear vision: work on the transformation of renewables into smart hydro power plants to enable grid balancing with purely green energy A strong ambition: we are at the beginning of our exciting scale-up story to contribute globally to the energy transformation A great spirit:the ability to work on a green future is what unites us and motives each of us every day. Latest technology: our DNA is R&D driven allowing us to not only using the latest technology but also developing our own methodology beyond the state of the art to stay one step ahead of competition Room for career development and personal growth:As a growing company, we continuously need to expand our skills, team and leadership â€“ we can offer rapid career development opportunities. A flexible work environment Competitive compensation           Show more                     Show less

___

#### Quantitative Analyst at Randstad Hellas, Athens Metropolitan Area, https://www.linkedin.com/jobs/view/3823157226 (Cosine similarity: 0.1434863147709692)

Would you like to make your next career step as Quantitative Analyst? Are you willing to be member of the 1st provider of integrated energy in Greece?Randstad, on behalf of Î–enith, is looking for an experienced Quantitative Analyst!The Quantitative Analyst position is full-time and permanent, based in Thessaloniki or Athens! If you're the type of person we are looking for, then apply now!ResponsibilitiesAs a Quantitative Analyst, your responsibilities will be following:Coordinate and execute the quantitative analysis for the development, maintenance and enhancement of the models used in portfolio management, pricing and forecasting activities pertinent to the energy market (power, gas, oil, green certificates)Work with large datasets. Produce comprehensible data structuresConfigure and optimize the utilization of internal software applications (Energy Trading and Risk Management / ETRM, BI, machine learning models, etc.)Continuous reporting to the upper managementDefine the requirements, develop, test, review and enhance the calculation models for:forecasting, hedging, portfolio optimization strategies in the energy sectorpricing / costing activities including the valuation of the risks associated with the purchase or sale of commodities (price risk, volume risk, shaping risk, proxy hedging risk, etc.)machine learning, optimization-based models, historical simulation, Monte-Carlo simulation, bootstrapping, parametric modellingprice and volume scenario derivation to represent stochasticity (e.g. consumption scenarios, wind / solar production scenarios, power / gas price scenarios)risk metric calculation for the energy portfolio (e.g., value at risk / VAR, conditional value at risk / CVAR, profit at risk / PAR, Variance, Cumulative Distribution Function / CDF, etc.)QualificationsIn order for your profile to match the Quantitative Analyst position, you need to have:Experience in quantitative analysis, models and processes to support risk management activities (stochastic processes, econometric methods, optimization) and in the functional areas portfolio management and pricingExperience with programming languages such as Python/PySpark, Matlab, optimization software (e.g. GAMS), VBA, C# (.NET)Experience with scenario generation (e.g. ARIMA models), scenario reduction techniquesFamiliarity with stochastic and deterministic programming (e.g., linear programming, mixed integer linear programming)Familiarity with machine learning algorithmsData visualization / BI tools skills will be considered an assetKnowledge of database structure languages, such as MySQL, is a plusBachelorâ€™s or Diploma degree in Mathematics, Finance, Economics, Statistics, Engineering, Informatics, Physics, or related field with a focus on Statistics or Quantitative FinanceMasterâ€™s or PhD degree in Data / Quantitative Analysis, Risk Management, Economics or related field will be highly appreciatedExperience3+ years of experience in companies operating in the electricity and natural gas, banking or financial sectorsSkillsExcellent analytical and problem solving skillsAbility to work under high pressureResults oriented and self-motivatedAttention to detailsStrong time management, project management and priorities management skillsCommand of the Greek and English language, both oral and writtenBenefits The company offers the following benefits to the Quantitative Analyst position:Attractive remuneration packageMonthly meal ticketsPrivate medical insuranceFresh & friendly work environmentEnrichment of technical skills with new technology stackWorking in the 1st provider of integrated energy in Greece, with natural gas and electricity (!!!)Further career opportunitiesInformationIn case you want to discuss further details regarding the Quantitative Analyst position, feel free to reach Michalis Isaakidis at 6952361070 for further details!Please note that for transparency and equity reasons, only those applications made online via our site will be assessed. After the screening of all the CVs received, we will only contact the candidates who meet the requirements of the job to arrange an interview. All applications are considered strictly confidential.           Show more                     Show less

___

#### Transportation Engineer Modeler at Energy Jobline, VA, https://www.linkedin.com/jobs/view/3771427779 (Cosine similarity: 0.1430202516538447)

DescriptionJoin a forward-thinking team committed to innovating the transportation sector! At Leidos, we support the Federal Highway Administrationâ€™s (FHWA) Saxton Transportation Operations Laboratory (STOL), focusing on enhancing transportation operations, safety, mobility, and reducing environmental impacts. STOL champions the integration of emerging technologies such as cooperative driving automation (CDA) and Vehicle-to-Everything (V2X) to revolutionize transportation.Learn more about STOL here!Role OverviewWe are seeking a dedicated Transportation Engineer/Modeler to join our McLean, Virginia group. You will engage in dynamic projects involving Operations, Safety, Freight, and Access Management and in the growing and rapidly advancing fields of Intelligent Transportation Systems (ITS), CDA, V2X, and Connected Vehicle (CV) technologies.Key Responsibilities, But Are Not Limited ToConduct research on a variety of transportation engineering projects, which encompass traffic simulation, CDA, V2X, CV, safety, and Intelligent Transportation Systems (ITS).Execute traffic simulation projects across multiple scales, ranging from microscopic to macroscopic levels.Develop, refine, and maintain simulation networks, manage comprehensive data analysis, and implement cutting-edge algorithms in an open-source environment.Perform cross-disciplinary CDA simulations, including automated driving simulation, vehicle dynamics simulation, V2X communication simulation, and sensor simulation.Participate in the creation of digital twins and high-definition (HD) mapsUtilize data mining techniques and algorithms to analyze Connected and Automated Vehicle (CAV) data.Create guidance, training, and outreach materials aimed at technical audiences, practitioners, and various stakeholders, focusing on diverse aspects of transportation.Develop documentation, reports, manuals, and presentations to effectively communicate project results to intended audiences, ensuring clarity and practical applicability in both oral and written forms.Contribute to the development, support, and review of technical proposals.Author white papers, proposals, and briefings that demonstrate technical and thought leadership, catering to the needs of Leidos leadership or clients.Required QualificationsAdvanced degree (Masterâ€™s or higher) in Transportation Engineering, Systems Engineering, Computer Science, or related fields.Extensive experience with at least one of microscopic traffic simulation tools (e.g., SUMO, Vissim, Aimsun, Transmolder, or similar).Ability to obtain and maintain a Public Trust clearance (which includes three years of immediate residency in the US).Proficiency in programming for implementing developed algorithms into microscopic traffic simulation tools (C++, Java, Python, VBA, or Matlab).Demonstrated ability to author technical research papers and reports.Expertise in advanced data analytics for transportation.Knowledge of CAV, Intelligent Transportation Systems, and traffic operation and management.Clear communication skills, both verbal and written.Preferred ExperienceProficient in autonomous driving and sensor simulation tools, with direct hands-on experience (e.g., CARLA, LGSVL, CARMAKER, or similar).Experience with developing digital twin and 3D simulation scenes for autonomous driving and sensor simulation tools by using professional tools (e.g., Roadrunner).Experience with creating high-definition (HD) maps.Familiarity with concepts of co-simulation or co-simulation frameworks (FMI/HLA/MOSAIC)Solid understanding of Vehicle-to-Everything (V2X) communication protocols and experience with communication simulation tools such as NS-3 and OMNET++.Experience with vehicle dynamics simulators (e.g., CARSIM and TruckSIM).Hands-on experience or familiarity with CARMA ecosystem (e.g., CARMA platform, CARMA streets, CARMA messenger, and CARMA Cloud)Experience with implementing artificial intelligence (AI) and machine learning in transportationFamiliar with augmented reality (AR) and virtual reality (VR) technologies.Demonstrated project management skills, with a track record of successfully overseeing projects from inception to completion.Capable of developing Requests for Information (RFI), Requests for Proposal (RFP), and managing subcontractor agreements.Proven experience in managing subcontractors, ensuring adherence to project specifications and timelines.Experience with leading or assisting in the development of technical proposals, showcasing the ability to convey complex ideas effectively.Competent in developing cost models, basis of estimates (BOEs), and Bill of Materials (BOM) for diverse projects.Agile methodology proficiency, with experience in applying agile practices in development environments.Team DynamicsStrong team collaboration and ability to engage in constructive feedback.Clear communication skills, both verbal and written.Adherence to schedules with a focused work ethic.Receptive to professional growth and learning.High quality standards and work pride.Enthusiasm for the field of transportation simulation and technology.Anticipated pay range for this position: $80,000-$100,000Pay RangePay Range $65,000.00 - $117,500.00The Leidos pay range for this job level is a general guideline only and not a guarantee of compensation or salary. Additional factors considered in extending an offer include (but are not limited to) responsibilities of the job, education, experience, knowledge, skills, and abilities, as well as internal equity, alignment with market data, applicable bargaining agreement (if any), or other law.Original Posting Date11/20/2023While subject to change based on business needs, Leidos reasonably anticipates that this job requisition will remain open for at least 3 days with an anticipated close date of no earlier than 3 days after the original posting date as listed above.                 Show more                     Show less

___

array([2532,  955, 1297, 1456,  505])

## TF-IDF Weighted Word2Vec

In [10]:
print(list(api.info()['models'].keys()))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [21]:
try:
    print("Loading model from file...")
    google_model = joblib.load('joblib/google_model.pickle')
except:
    print("Downloading model...")
    google_model = api.load('word2vec-google-news-300')
    joblib.dump(google_model, 'joblib/google_model.pickle')

Loading model from file...


In [12]:
def build_tfw2v(corpus, vectors, embeddings_size, vectorizer=None):
    """
    Build TF-IDF weighted Word2Vec embeddings for a given corpus.

    Args:
        corpus (list): List of text descriptions.
        vectors (gensim.models.KeyedVectors): Pre-trained Word2Vec vectors.
        embeddings_size (int): Size of the Word2Vec embeddings.
        vectorizer (sklearn.feature_extraction.text.TfidfVectorizer, optional): TF-IDF vectorizer. 
            If not provided, a new vectorizer will be created.

    Returns:
        numpy.ndarray: TF-IDF weighted Word2Vec embeddings for the corpus.
    """

    if vectorizer is None:
        vectorizer = TfidfVectorizer(use_idf=True)
        tf_idf = vectorizer.fit_transform(corpus).toarray()
    else:
        tf_idf = vectorizer.transform(corpus).toarray()

    vector_vocabulary = vectors.index_to_key
    tf_idf_vocabulary = vectorizer.vocabulary_
    w2v_corpus = np.zeros((len(corpus), embeddings_size))
    
    for index, description in enumerate(corpus):
        words = description.split()

        # For every word that exists in the vocabulary, multiply its vector by the TF-IDF of the word
        numerator = [vectors[word]*tf_idf[index][tf_idf_vocabulary[word]] for word in words if word in vector_vocabulary and word in tf_idf_vocabulary]

        # Matrix addition
        numerator = np.array(np.sum(numerator, axis=0))
        
        # Sum of all words' TF-IDF, existing in the vocabulary 
        denominator = np.sum([tf_idf[index][tf_idf_vocabulary[word]] for word in words if word in vector_vocabulary and word in tf_idf_vocabulary])

        w2v_desc = np.array(numerator/denominator)
        w2v_corpus[index] = w2v_desc

    return w2v_corpus

def job_recommendation_w2v(candidate, corpus, vectors, vectorizer, embeddings_size, n=5, verbose=0):
    """
    Recommends similar jobs based on a candidate's job description using weighted word2vec representation.

    Args:
        candidate (str): The candidate's job description.
        corpus (array-like): The corpus of job descriptions to compare against.
        vectors (array-like): The pre-trained word vectors.
        vectorizer (object): The vectorizer used to transform text into vectors.
        embeddings_size (int): The size of the word embeddings.
        n (int, optional): The number of most similar jobs to recommend. Defaults to 5.
        verbose (bool, optional): Whether to display detailed information about the recommended jobs. Defaults to False.

    Returns:
        array-like: The indices of the n most similar job descriptions.

    """
    candidate_description = preprocess_text(candidate, lemmatize=False)
    print(candidate_description)

    # Calculate the weighted word2vec representation of the job description
    w2v_job_description = build_tfw2v([candidate_description], vectors, embeddings_size, vectorizer)

    # Calculate cosine similarity between the job description and the corpus
    cosine_similarities = cosine_similarity(w2v_job_description, corpus).flatten()

    # Get the indices of the n most similar job descriptions
    most_similar = cosine_similarities.argsort()[-n:][::-1]

    display(Markdown("### Most similar jobs:"))
    if verbose == 2:
        for i in most_similar:
            display(Markdown("#### " + dataset.iloc[i]['job_title'] + " at " + dataset.iloc[i]['company_name'] +  ", " + dataset.iloc[i]['job_location'] +  ", " + dataset.iloc[i]['job_url'] + " (Cosine similarity: " + str(cosine_similarities[i]) + ")" ))
            display(Markdown(dataset.iloc[i]['job_description_text']))
            display(Markdown("___"))
    elif verbose == 1:
        for i in most_similar:
            display(Markdown(dataset.iloc[i]['job_title'] + " at " + dataset.iloc[i]['company_name'] +  ", " + dataset.iloc[i]['job_location'] +  ", " + dataset.iloc[i]['job_url'] + " (Cosine similarity: " + str(cosine_similarities[i]) + ")" ))
            display(Markdown("___"))
    return most_similar, cosine_similarities
    
def job_recommendation_from_file(candidate, job_file, vectors, vectorizer, embeddings_size, n=5, verbose=0):
    jobs = pd.read_csv(job_file, delimiter=';', names=['job_id', 'company_name', 'job_title', 'job_location', 'job_description_text', 'language', 'job_url'])
    jobs = jobs[jobs['language'] == 'en']
    jobs = jobs.drop_duplicates(subset=['job_description_text'])
    jobs = jobs[jobs['job_description_text'].str.len() > 100]


    corpus = jobs['job_description_text'].apply(preprocess_text, lemmatize=False)
    corpus = build_tfw2v(corpus, vectors, embeddings_size, vectorizer)

    shape = corpus.shape
    # Delete empty rows
    jobs = jobs.dropna(subset=['job_description_text'])

    if shape[0] != jobs.shape[0]:
        print(shape[0] - jobs.shape[0], "empty rows deleted")

    indices, cosine_similarities = job_recommendation_w2v(candidate, corpus, vectors, vectorizer, embeddings_size, n, 0)

    if verbose == 2:
        for i in indices:
            display(Markdown("#### " + jobs.iloc[i]['job_title'] + " at " + jobs.iloc[i]['company_name'] +  ", " + jobs.iloc[i]['job_location'] +  ", " + jobs.iloc[i]['job_url'] + " (Cosine similarity: " + str(cosine_similarities[i]) + ")" ))
            display(Markdown(jobs.iloc[i]['job_description_text']))
            display(Markdown("___"))
    elif verbose == 1:
        for i in indices:
            display(Markdown(jobs.iloc[i]['job_title'] + " at " + jobs.iloc[i]['company_name'] +  ", " + jobs.iloc[i]['job_location'] +  ", " + jobs.iloc[i]['job_url'] + " (Cosine similarity: " + str(cosine_similarities[i]) + ")" ))
            display(Markdown("___"))
    
    return indices, cosine_similarities, jobs



In [13]:
try:
    corpora_google = joblib.load('joblib/corpora_google_energy.pkl')
except:
    print("Building google corpus")
    corpora_google = build_tfw2v(unlemmatized_corpus, google_model, 300)
    print("Corpus built")
    joblib.dump(corpora_google, 'joblib/corpora_google_energy.pkl')  # Add this line to assign a value to 'y'


In [14]:
job_recommendation_w2v("We are looking for an entry-level junior energy market analyst with experience in Hydro power and monte carlo simulation. With masters degree in energy power systems and experience as a renewable energy sources engineer. Skilled in MATLAB.", corpora_google, google_model, vectorizer, 300, 5, verbose=True)

entry level junior energy market analyst hydro power monte carlo simulation masters degree energy power systems renewable energy sources engineer matlab


### Most similar jobs:

Energy Market Modeller (Plexos) at Turner Lovell, Germany, https://www.linkedin.com/jobs/view/3811037342 (Cosine similarity: 0.8363361207517792)

___

Head Energy is looking for a skilled/experienced System Engineer. at Head Energy AS, Norway, https://www.linkedin.com/jobs/view/3820587503 (Cosine similarity: 0.8361680537094076)

___

Public Affairs Officer Transport & Energy at Virya Energy, Belgium, https://www.linkedin.com/jobs/view/3798655943 (Cosine similarity: 0.8350234011814053)

___

Energy Consultant at PT Renewables ðŸŒ³, United Kingdom, https://www.linkedin.com/jobs/view/3811010351 (Cosine similarity: 0.8251357946271937)

___

Sustainability Engineer - Relocate to Saudi Arabia, Permanent Expat Family Relocation Package at aramco, United Kingdom, https://www.linkedin.com/jobs/view/3796128399 (Cosine similarity: 0.821883175779792)

___

(array([  79, 1061,  601,  880, 1421]),
 array([0.55224747, 0.73394574, 0.39969957, ..., 0.48999678, 0.5550241 ,
        0.3780957 ]))

In [15]:
indices, similarities, jobs = job_recommendation_from_file("We are looking for an entry-level junior energy market analyst with experience in Hydro power and monte carlo simulation. With masters degree in energy power systems and experience as a renewable energy sources engineer. Skilled in MATLAB.", 'data/today_jobs_energy.csv', google_model, vectorizer, 300, 5, verbose=True)

entry level junior energy market analyst hydro power monte carlo simulation masters degree energy power systems renewable energy sources engineer matlab


### Most similar jobs:

Senior Energy Modeler at Southern Lights, Sweden, https://www.linkedin.com/jobs/view/3784551535 (Cosine similarity: 0.8180115991092731)

___

Energy Systems Modeler at Oil and Gas Job Search Ltd, Saudi Arabia, https://www.linkedin.com/jobs/view/3765078103 (Cosine similarity: 0.8158899985009389)

___

Energy Systems Modeler at Energy Jobline, Saudi Arabia, https://www.linkedin.com/jobs/view/3773719111 (Cosine similarity: 0.8155490354008554)

___

Energy Manager at Energy Sciences, Detroit Metropolitan Area, https://www.linkedin.com/jobs/view/3825515440 (Cosine similarity: 0.8109612808439912)

___

Energy Consultant at Able Power Management, TX, https://www.linkedin.com/jobs/view/3813094891 (Cosine similarity: 0.8024482966001687)

___

In [18]:
f = open("data/links_for_energy.txt", "w")

for job in similarities.argsort()[:][::-1]:
    print(jobs.iloc[job]['job_title'], jobs.iloc[job]['company_name'], jobs.iloc[job]['job_location'], jobs.iloc[job]['job_url'], similarities[job])
    f.write(jobs.iloc[job]['job_title'] + ", " + jobs.iloc[job]['company_name'] +  ", " + jobs.iloc[job]['job_location'] +  ", " + jobs.iloc[job]['job_url'] + " (Cosine similarity: " + str(similarities[job]) + ")" + "\n")
    if similarities[job] < 0.75:
        break
f.close()

Senior Energy Modeler Southern Lights Sweden https://www.linkedin.com/jobs/view/3784551535 0.8180115991092731
Energy Systems Modeler Oil and Gas Job Search Ltd Saudi Arabia https://www.linkedin.com/jobs/view/3765078103 0.8158899985009389
Energy Systems Modeler Energy Jobline Saudi Arabia https://www.linkedin.com/jobs/view/3773719111 0.8155490354008554
Energy Manager Energy Sciences Detroit Metropolitan Area https://www.linkedin.com/jobs/view/3825515440 0.8109612808439912
Energy Consultant Able Power Management TX https://www.linkedin.com/jobs/view/3813094891 0.8024482966001687
Sourcing energy Analyst - Sustainability business Schneider Electric Spain https://www.linkedin.com/jobs/view/3810373686 0.8011112357732018
Energy Analysts (PAL5/PAL6) International Energy Agency (IEA) France https://www.linkedin.com/jobs/view/3828129366 0.7994009749601834
Energy Manager NYC Health + Hospitals NY https://www.linkedin.com/jobs/view/3803235680 0.792357800897258
Energy Market Analyst Flower Sweden h

In [19]:
indices, similarities = job_recommendation_w2v("We are looking for an entry-level junior energy market analyst with experience in Hydro power and monte carlo simulation. With masters degree in energy power systems and experience as a renewable energy sources engineer. Skilled in MATLAB.", corpora_google, google_model, vectorizer, 300, 5, verbose=1)

entry level junior energy market analyst hydro power monte carlo simulation masters degree energy power systems renewable energy sources engineer matlab


### Most similar jobs:

Energy Market Modeller (Plexos) at Turner Lovell, Germany, https://www.linkedin.com/jobs/view/3811037342 (Cosine similarity: 0.8363361207517792)

___

Head Energy is looking for a skilled/experienced System Engineer. at Head Energy AS, Norway, https://www.linkedin.com/jobs/view/3820587503 (Cosine similarity: 0.8361680537094076)

___

Public Affairs Officer Transport & Energy at Virya Energy, Belgium, https://www.linkedin.com/jobs/view/3798655943 (Cosine similarity: 0.8350234011814053)

___

Energy Consultant at PT Renewables ðŸŒ³, United Kingdom, https://www.linkedin.com/jobs/view/3811010351 (Cosine similarity: 0.8251357946271937)

___

Sustainability Engineer - Relocate to Saudi Arabia, Permanent Expat Family Relocation Package at aramco, United Kingdom, https://www.linkedin.com/jobs/view/3796128399 (Cosine similarity: 0.821883175779792)

___

In [20]:
f = open("data/links_for_energy.txt", "w")

for job in similarities.argsort()[:][::-1]:
    print(dataset.iloc[job]['job_title'], dataset.iloc[job]['company_name'], dataset.iloc[job]['job_location'], dataset.iloc[job]['job_url'], similarities[job])
    f.write(dataset.iloc[job]['job_title'] + ", " + dataset.iloc[job]['company_name'] +  ", " + dataset.iloc[job]['job_location'] +  ", " + dataset.iloc[job]['job_url'] + " (Cosine similarity: " + str(similarities[job]) + ")" + "\n")
    if similarities[job] < 0.75:
        break

Energy Market Modeller (Plexos) Turner Lovell Germany https://www.linkedin.com/jobs/view/3811037342 0.8363361207517792
Head Energy is looking for a skilled/experienced System Engineer. Head Energy AS Norway https://www.linkedin.com/jobs/view/3820587503 0.8361680537094076
Public Affairs Officer Transport & Energy Virya Energy Belgium https://www.linkedin.com/jobs/view/3798655943 0.8350234011814053
Energy Consultant PT Renewables ðŸŒ³ United Kingdom https://www.linkedin.com/jobs/view/3811010351 0.8251357946271937
Sustainability Engineer - Relocate to Saudi Arabia, Permanent Expat Family Relocation Package aramco United Kingdom https://www.linkedin.com/jobs/view/3796128399 0.821883175779792
Energy Analyst Smart Energy LAB Portugal https://www.linkedin.com/jobs/view/3711178713 0.8208331558973876
New Request - Anti-Abuse Data & Operations Analyst - Fraud Operations emagine Greater Stockholm Metropolitan Area https://www.linkedin.com/jobs/view/3819590336 0.819145695103594
Energy Resource Ana