In [None]:
import pandas as pd

In [None]:
import torch, numpy as np, random
seed = 42

torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
np.random.seed(seed)  # Numpy module.
random.seed(seed)  # Python random module.
torch.manual_seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
data = pd.read_csv('/content/SH opinions gathering - Opinions CE (3).csv')
data

Unnamed: 0,comment_id,comment_text,Sentiment,Name of media,link,stakeholder type,date & time in dd.mm.yy hh:mm format
0,1-1,How can a circular economy be implemented in c...,neutral,Quora,https://www.quora.com/How-can-a-circular-econo...,"Owner of Acton Precast Concrete (APC) company,...","7months ago (October, 2022)"
1,1-2,"Of course, this is not an approach that all pr...",negative,Quora,,,
2,1-3,"A ""design for disassembly"" (DfD) approach is o...",positive,Quora,,,
3,1-4,"When tackling construction design with DfD, th...",neutral,Quora,,,
4,1-5,Increased concrete strength with age\ndecrease...,positive,Quora,,,
...,...,...,...,...,...,...,...
901,12-3,Circular construction facilitates the reuse an...,positive,Article,https://www.sciencedirect.com/science/article/...,Academician,"January, 2022"
902,13,Circular economy is poised to make a tremendou...,positive,Article,https://www.sciencedirect.com/science/article/...,Academician,10.07.2022
903,13-1,"Moreover, CE is a concept that could offer var...",positive,Article,https://www.sciencedirect.com/science/article/...,Academician,10.07.2022
904,14,The transition to a more efficient circular mo...,neutral,Article,https://www.sciencedirect.com/science/article/...,Academician,"December, 2020"


In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def normalize(text):
  result = re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', text)).lower()
  result = result.replace('circular', '').replace('uh', '').replace('um', '').replace('economy', '').replace('buildings', '').replace('weve', '').replace('im', '')
  result = result.replace('building', '').replace('construction','').replace('te', '').replace('ity', '').replace('really', '').replace('tber', '').replace('ofn', '').replace('rm','')
  result = result.replace('marials', '').replace('yeah', '').replace('marial', '')
  result = result.replace('\n', ' ').replace('________________________________________________________________________________', '').replace('\r', '').strip().lower()
  result = word_tokenize(result)
  return ' '.join([w for w in result if not w.lower() in stop_words])
data['comment_text'] = data['comment_text'].fillna(' ').apply(lambda x: normalize(x))
data['Sentiment'] = data['Sentiment'].fillna(' ').apply(lambda x: x.lower().strip())


In [None]:
data['comment_text']

0      plemend industries talking industries industry...
1      course approach professionals want take hor nb...
2      design disassembly dfd approach one best ways ...
3      tackling design dfd choice becomes extremely p...
4      increased concre strength age decreased peeabi...
                             ...                        
901    facilitas reuse recycling maintains components...
902    poised make tremendous pact facilitating susta...
903    moreover ce concept could offer various advant...
904    transition efficient model economics ambitions...
905    another example ce plementation case port swed...
Name: comment_text, Length: 906, dtype: object

In [None]:
! pip install bertopic >> /dev/null

In [None]:
from bertopic import BERTopic

In [None]:
dct = {'neutral': 0, 'positive':1, 'negative':2}

In [None]:
from sentence_transformers import SentenceTransformer

# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

# KeyBERT
keybert_model = KeyBERTInspired()

# Part-of-Speech
pos_model = PartOfSpeech("en_core_web_sm")

# MMR
mmr_model = MaximalMarginalRelevance(diversity=0.4)

representation_model = {
    "KeyBERT": keybert_model,
    "MMR": mmr_model,
    "POS": pos_model
}

from hdbscan import HDBSCAN

hdbscan_model = HDBSCAN(min_cluster_size=10, metric='euclidean', cluster_selection_method='eom', prediction_data=True)


from umap import UMAP

# umap_model = UMAP(n_neighbors=20, n_components=15, min_dist=0.0, metric='cosine', random_state=42)
umap_model = UMAP(n_neighbors=15, n_components=15, min_dist=0.0, metric='cosine', random_state=42)

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))

In [None]:
model = BERTopic(language="english", calculate_probabilities=True, verbose=True,
                 embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  representation_model=representation_model,
  vectorizer_model=vectorizer_model,
 top_n_words=50,
)
topics, probs = model.fit_transform(data['comment_text'].tolist(),
                                  # Uncomment to use supervised labels
                                   # y=data['Sentiment'].map(dct).tolist()
                                    )

2023-12-19 14:44:18,020 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

2023-12-19 14:44:56,710 - BERTopic - Embedding - Completed ✓
2023-12-19 14:44:56,713 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2023-12-19 14:45:01,477 - BERTopic - Dimensionality - Completed ✓
2023-12-19 14:45:01,480 - BERTopic - Cluster - Start clustering the reduced embeddings
2023-12-19 14:45:01,613 - BERTopic - Cluster - Completed ✓
2023-12-19 14:45:01,622 - BERTopic - Representation - Extracting topics from clusters using representation models.
2023-12-19 14:45:10,783 - BERTopic - Representation - Completed ✓


In [None]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,MMR,POS,Representative_Docs
0,-1,408,-1_use_new_industry_resources,"[use, new, industry, resources, products, desi...","[built environment, environment, sustainable, ...","[resources, sustainable, recycling, economic, ...","[use, new, industry, resources, products, desi...",[regarding end life think demolition wass pont...
1,0,138,0_sustainable_resources_ce_economic,"[sustainable, resources, ce, economic, environ...","[sustainable development, sustainable, sustain...","[resources, sustainabil, emissions, sector, ca...","[sustainable, resources, ce, economic, environ...",[due resources consption associad environmenta...
2,1,105,1_know_sort_design_kind,"[know, sort, design, kind, think, work, projec...","[projects, build, facade, design, engineering,...","[sort, design, kind, project, build, thinking,...","[sort, design, kind, think, work, project, way...",[could give us idea cost per square mer projec...
3,2,75,2_recycled_recycling_reuse_plastic,"[recycled, recycling, reuse, plastic, products...","[recycling, recycled reused, reuse recycling, ...","[recycled, recycling, products, plastics, sust...","[recycled, recycling, reuse, plastic, products...",[partners collect clients built use raw make c...
4,3,66,3_business_lack_industry_ce,"[business, lack, industry, ce, products, et al...","[supply chains, supply chain, stakeholders, in...","[lack, supply chain, business model, barriers,...","[business, lack, industry, ce, products, et, p...",[addition industry design process manufacturin...
5,4,49,4_donut_cities_framework_amsrdam,"[donut, cities, framework, amsrdam, people, ec...","[cities, foundation, roadmap, framework, devel...","[cities, framework, amsrdam, global, donut eco...","[donut, cities, framework, people, economics, ...",[see connects connects minds hearts connects p...
6,5,29,5_demolition_sel_big_like,"[demolition, sel, big, like, demolishing, used...","[demolition, life demolition, demolishing, exc...","[demolition, demolishing, recycled, contractor...","[demolition, sel, big, demolishing, used, conc...",[uk early stages organizations ahead game prov...
7,6,25,6_design_components_modular_like,"[design, components, modular, like, easily, ma...","[modular design, engineered, designing, design...","[components, manufacturing, design stage, flex...","[design, components, modular, manufacturing, l...",[industry lets dive deeper innovative chniques...
8,7,11,7_good_kill_used feed_kill anal,"[good, kill, used feed, kill anal, universitie...","[case studies, platfo products, expert platfo,...","[used feed, kill anal, platfo products, furnit...","[good, universities, ozili, bad, furniture, ex...",[less bad good enough office furniture chicago...


In [None]:
model.get_document_info(data['comment_text'].tolist()).to_csv('new_topics.csv')

In [None]:
model.visualize_topics()

In [None]:
model.visualize_barchart(n_words=150, top_n_topics=20)

In [None]:
# embeddings = embedding_model.encode(data['comment_text'].tolist(), show_progress_bar=True)
# reduced_embeddings = UMAP(n_neighbors=2, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

# model.visualize_documents(data['comment_text'].tolist(), embeddings=reduced_embeddings)


In [None]:
model.set_topic_labels(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', "I"])

In [None]:
model.visualize_heatmap(n_clusters=3, width=1200, height=1100, custom_labels=True)

In [None]:
model.visualize_hierarchy(top_n_topics=14)

In [None]:
data['topic'] = topics


In [None]:
data.groupby(['topic',])['Sentiment'].value_counts()

topic  Sentiment
-1     positive     240
       neutral      107
       negative      59
       neutal         1
       posiitve       1
 0     positive      92
       neutral       30
       negative      16
 1     positive      61
       neutral       37
       negative       7
 2     positive      47
       negative      15
       neutral       12
                      1
 3     neutral       26
       negative      22
       positive      18
 4     positive      24
       neutral       14
       negative      11
 5     positive      15
       neutral        9
       negative       5
 6     positive      19
       neutral        5
       negative       1
 7     neutral        8
       positive       2
       negative       1
Name: Sentiment, dtype: int64