In [29]:
import pandas as pd
import csv
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from gensim.models import Phrases


nltk.download('stopwords')
lemmatizer = nltk.stem.WordNetLemmatizer()
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zeidsmac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zeidsmac/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zeidsmac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/zeidsmac/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Data Examination
We start off my a visual examination of the data. We can see that we have 16 projects, with various statuses and start dates. The description of all these projects is available in the last column

In [30]:
df = pd.read_csv("/Users/zeidsmac/Desktop/entropic_case/data/ispt_projects.csv.csv")
df

Unnamed: 0,Project Title,Link,Code,Status,Start Date,Description
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,HP-50-02,Ongoing,"November 13, 2023",Electrification of heat-driven processes is ne...
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,UH-30-05,Ongoing,,Supporting the development and success of an i...
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,UH-30-04,Ongoing,"November 1, 2018",To further develop the compact electrically dr...
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,I40-20-06,Ongoing,,R-ACES is an international project promoted by...
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,UH-30-02,Ongoing,"January 1, 2019",The FLEXSTEAM project aims to improve the way ...
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,UH-30-08,Ongoing,"January 1, 2020",The SPOT project aims to significantly reduce ...
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,DR-50-15,Ongoing,"May 1, 2020",A large part of powdered formulations within t...
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,HP-50-01,Ongoing,"September 1, 2021",The Heat is On aims to make processes for dewa...
8,COMPRESORP,https://ispt.eu/projects/compresorp/,UH-20-10,Completed,"April 1, 2015",Upgrading low temperature waste water streams ...
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,WP-20-11,Completed,"January 1, 2017",This project focuses on requiering better insi...


# Data Cleaning
Several steps need to be taken first to clean the data. For starters, there is an irrelevant section obtained in every link, where the text goes "You might also be interested". So the first step is to remove what comes after.

In [31]:
def clean_description(description):
    # Find the part where "You might also be interested" starts and remove it along with everything that follows
    cutoff_phrase = "You might also be interested"
    if cutoff_phrase in description:
        description = description.split(cutoff_phrase)[0].strip()  # Keep only the part before the phrase
    return description

In [32]:
df["Description"] = df["Description"].apply(clean_description)
df["Description"][0]

'Electrification of heat-driven processes is needed for a substantial reduction of CO 2  emissions. However, the challenge is to efficiently move to full electrification while coping with the high variability of the availability and price of electricity. Additionally, there is the co-existence of natural gas boilers and grid connection limitations. This project will develop potential routes based on available technologies through two virtual public simulation cases. These simulationas will enable partners to share and promote their expertise without having to disclose confidential information.\xa0 \n In short: \n \n This project will develop innovative process designs solving the efficient flexible electrification challenge \n The designs are simulated through two distinct virtual cases: agro-food and chemicals, allowing for open innovation\xa0\xa0 \n The industry, technology suppliers and technology experts participate by sharing their knowledge and expertise\xa0 \n Increased efficien

# Data Cleaning
Here I do some data cleaning, so removing stop words, capital letters, and tokenizing into both unigrams and bigrams

In [33]:
stop_words = stopwords.words('english') #this loads the pre-fabricated stop words list from NLTK


def clean_text(x):

    # remove punctuation
    x = x.translate(str.maketrans(' ', ' ', string.punctuation))

    # remove numbers
    # x = x.translate(str.maketrans(' ', ' ', string.digits))

    # lowercase
    x = x.lower()

    # remove — and ’
    x = re.sub("—", " ", x)
    x = re.sub("’", " ", x)
    x = re.sub("–", " ", x)
    x = re.sub("…", " ", x)
    x = re.sub("“", " ", x)
    x = re.sub("”", " ", x)
    #x = re.sub("\\x", " ", x)

    # strip excessive whitespaces
    x = x.strip()
    
    # tokenize
    x = nltk.word_tokenize(x)

    # remove stopwords
    x = [token for token in x if not token in stop_words]

    # lemmatization and pass string back
    x = ' '.join([lemmatizer.lemmatize(w) for w in x])

    return x

In [34]:
df['description_processed'] = df['Description'].apply(lambda x: clean_text(x))

In [35]:
df['tokenized_description'] = df['description_processed'].apply(word_tokenize)


In [36]:
#I also chose to construct bigrams, because that could help us more accurately discern the topics

# Train the bigram model
bigram_model = Phrases(df['tokenized_description'].tolist(), min_count=20)

# Create a function to apply bigram transformation to a document
def apply_bigrams(doc):
    # Apply the bigram model
    bigram_doc = bigram_model[doc]
    # Combine the original tokens with the bigrams
    return doc + [token for token in bigram_doc if '_' in token]

# Apply the function to each document to create bigrams
df["bigram_descriptions"] = df["tokenized_description"].apply(apply_bigrams)

In [37]:
df

Unnamed: 0,Project Title,Link,Code,Status,Start Date,Description,description_processed,tokenized_description,bigram_descriptions
0,Flexible Efficient Electrification of Industri...,https://ispt.eu/projects/flexible-efficient-el...,HP-50-02,Ongoing,"November 13, 2023",Electrification of heat-driven processes is ne...,electrification heatdriven process needed subs...,"[electrification, heatdriven, process, needed,...","[electrification, heatdriven, process, needed,..."
1,FUSE – FUll ScalE Industrial Heat Pump Using N...,https://ispt.eu/projects/fuse/,UH-30-05,Ongoing,,Supporting the development and success of an i...,supporting development success industrial heat...,"[supporting, development, success, industrial,...","[supporting, development, success, industrial,..."
2,COMTA – COMpact modular Thermo Acoustic heat pump,https://ispt.eu/projects/comta/,UH-30-04,Ongoing,"November 1, 2018",To further develop the compact electrically dr...,develop compact electrically driven thermoacou...,"[develop, compact, electrically, driven, therm...","[develop, compact, electrically, driven, therm..."
3,R-ACES – FRamework for Actual Cooperation on E...,https://ispt.eu/projects/r-aces/,I40-20-06,Ongoing,,R-ACES is an international project promoted by...,race international project promoted 8 partner ...,"[race, international, project, promoted, 8, pa...","[race, international, project, promoted, 8, pa..."
4,FLEXSTEAM – Development of heat storage for in...,https://ispt.eu/projects/steam-storing/,UH-30-02,Ongoing,"January 1, 2019",The FLEXSTEAM project aims to improve the way ...,flexsteam project aim improve way store reuse ...,"[flexsteam, project, aim, improve, way, store,...","[flexsteam, project, aim, improve, way, store,..."
5,SPOT: Sustainable PrOcess heaTing,https://ispt.eu/projects/spot-sustainable-proc...,UH-30-08,Ongoing,"January 1, 2020",The SPOT project aims to significantly reduce ...,spot project aim significantly reduce use foss...,"[spot, project, aim, significantly, reduce, us...","[spot, project, aim, significantly, reduce, us..."
6,StAgglop: Reducing energy use and material los...,https://ispt.eu/projects/stagglop/,DR-50-15,Ongoing,"May 1, 2020",A large part of powdered formulations within t...,large part powdered formulation within food in...,"[large, part, powdered, formulation, within, f...","[large, part, powdered, formulation, within, f..."
7,The Heat Is On,https://ispt.eu/projects/the-heat-is-on/,HP-50-01,Ongoing,"September 1, 2021",The Heat is On aims to make processes for dewa...,heat aim make process dewatering drying heat i...,"[heat, aim, make, process, dewatering, drying,...","[heat, aim, make, process, dewatering, drying,..."
8,COMPRESORP,https://ispt.eu/projects/compresorp/,UH-20-10,Completed,"April 1, 2015",Upgrading low temperature waste water streams ...,upgrading low temperature waste water stream s...,"[upgrading, low, temperature, waste, water, st...","[upgrading, low, temperature, waste, water, st..."
9,Steam and condensate quality,https://ispt.eu/projects/condensate-quality/,WP-20-11,Completed,"January 1, 2017",This project focuses on requiering better insi...,project focus requiering better insight distri...,"[project, focus, requiering, better, insight, ...","[project, focus, requiering, better, insight, ..."


In [38]:
df.to_csv("/Users/zeidsmac/Desktop/entropic_case/data/ispt_processed.csv")

# Using tf-idf to assign topic
The approach I decided to use was build a tf-idf matrix for the descriptions, and then assign some descriptor words that can help us decide if the project falls under the topic "decarbonization of industrial heat"

In [39]:
df['Unigrams and Bigrams'] = df['tokenized_description'].apply(lambda x: ' '.join(x)) + ' ' + df['bigram_descriptions'].apply(lambda x: ' '.join(x))

# Initialize TfidfVectorizer to handle both unigrams and bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')

# Fit and transform the "Unigrams and Bigrams" column
tfidf_matrix = vectorizer.fit_transform(df['Unigrams and Bigrams'])

# Get feature names (unigrams and bigrams)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix into a DataFrame for inspection
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the TF-IDF scores for each project
tfidf_df.head()


Unnamed: 0,0915,0915 pj,10,10 ecoregions,10 energy,10 example,10 industry,10 le,10 mw,10 time,...,year different,year emission,year status,yield,yield series,zeroemission,zeroemission integration,zone,zone agglomeration,zone working
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.02948,0.02948,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.019616,0.0,0.0,0.0,0.0,0.0,0.026277,0.0,...,0.0,0.030173,0.030173,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.063851,0.032738,0.0,0.032738,0.032738,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
keywords = ['decarbonization', 'carbon', 'emissions', 'industrial heat', 'renewable', 'capture', 
            'electrification', 'reduction', 'emission', 'industrial', 'sustainability', 'co2', 'waste',
            'heat recovery', 'waste heat', 'thermal efficiency', 'heat pump']

def check_keyword_relevance(tfidf_df, keyword):
    if keyword in tfidf_df.columns:
        relevant_projects = tfidf_df[tfidf_df[keyword] > 0].index.tolist()
        return relevant_projects
    else:
        return []

# Check relevance of decarbonization keywords
for keyword in keywords:
    projects_with_keyword = check_keyword_relevance(tfidf_df, keyword)
    print(f"Keyword '{keyword}' appears in the following project descriptions: {projects_with_keyword}")

Keyword 'decarbonization' appears in the following project descriptions: []
Keyword 'carbon' appears in the following project descriptions: [2, 9]
Keyword 'emissions' appears in the following project descriptions: []
Keyword 'industrial heat' appears in the following project descriptions: [0, 1, 5, 13, 15]
Keyword 'renewable' appears in the following project descriptions: [0, 2, 4, 5, 15]
Keyword 'capture' appears in the following project descriptions: [0]
Keyword 'electrification' appears in the following project descriptions: [0, 13]
Keyword 'reduction' appears in the following project descriptions: [0, 1, 2, 3, 6, 7, 12, 14, 15]
Keyword 'emission' appears in the following project descriptions: [0, 1, 2, 3, 6, 7, 14]
Keyword 'industrial' appears in the following project descriptions: [0, 1, 2, 3, 4, 5, 6, 8, 12, 13, 14, 15]
Keyword 'sustainability' appears in the following project descriptions: [1]
Keyword 'co2' appears in the following project descriptions: []
Keyword 'waste' appear

In [41]:
# Function to calculate the relevance score for a project based on the list of keywords
def calculate_relevance_scores(tfidf_df, keywords):
    relevance_scores = {}
    
    # Iterate over each project (each row of tfidf_df)
    for project_index in tfidf_df.index:
        score = 0
        for keyword in keywords:
            if keyword in tfidf_df.columns:
                score += tfidf_df.loc[project_index, keyword]  # Sum the TF-IDF scores for the keywords
        relevance_scores[project_index] = score  # Store the cumulative score for the project
    
    return relevance_scores

# Calculate relevance scores for all projects based on the keywords
project_scores = calculate_relevance_scores(tfidf_df, keywords)

# Sort the projects by relevance score (highest to lowest)
sorted_project_scores = sorted(project_scores.items(), key=lambda x: x[1], reverse=True)

# Display the projects ranked by relevance
print("Projects ranked by relevance to decarbonization of industrial heat:")
for project, score in sorted_project_scores:
    print(f"Project {project}: Score {score}")


Projects ranked by relevance to decarbonization of industrial heat:
Project 1: Score 0.7444608778838888
Project 0: Score 0.6099906027689681
Project 13: Score 0.59553822089784
Project 15: Score 0.5432266833420294
Project 2: Score 0.3732325058122403
Project 8: Score 0.32673471090614786
Project 14: Score 0.3007971466093099
Project 5: Score 0.22647950412931261
Project 10: Score 0.22385043939198582
Project 4: Score 0.206434770162472
Project 3: Score 0.16651357352355442
Project 11: Score 0.16625789622848197
Project 7: Score 0.15752810850224822
Project 6: Score 0.1369204044196243
Project 12: Score 0.07763060892089324
Project 9: Score 0.0338933908924221


Looking at the scores, I believe adding a threshold of 0.3 is a good level of leniency while still filtering out most topics.

In [43]:
threshold = 0.3

# Create a list of project IDs where the score is greater than or equal to the threshold
filtered_project_ids = [project for project, score in sorted_project_scores if score >= threshold]

# Filter the original DataFrame to include only the relevant projects
filtered_df = df.loc[filtered_project_ids].copy()
filtered_csv_filename = "/Users/zeidsmac/Desktop/entropic_case/data/filtered_projects.csv"
filtered_df.to_csv(filtered_csv_filename, index=False)
