In [1]:
import pandas as pd

In [8]:
data = pd.read_csv('text_ORT_topicmodelling.csv')
data.head()

Unnamed: 0,No,group,Topic,Category,issues,identify_cause,initial_theme
0,1,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Revision of CSP25 (Safe House Habitat) by Petr...,- Requirements for Habitat Technicians and Sup...,Capability
1,1,MOSVA,Others,Marine & Logistics,Marine Crew Work Permit to get more support an...,Local Authorities implementation,Sabah & Sarawak
2,1,PAC,Capability and Technology,Others (e.g. more than one categories),Low availability of marine vessels in the mark...,High demand for marine vessels due to High lev...,Contract
3,2,PAC,Capability and Technology,Others (e.g. more than one categories),Shortage of local skilled manpower (10-15 year...,Potentially due to reducing work opportunities...,Capability
4,3,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Petronas - SHO (Green Book) required all the t...,- Different standards practices across differe...,HSE


### Topic Modeling (LDA): from sklearn

In [36]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np


In [37]:
df = pd.DataFrame(data)

In [38]:
df.columns

Index(['No', 'group', 'Topic', 'Category', 'issues', 'identify_cause',
       'initial_theme'],
      dtype='object')

In [39]:
set(df.initial_theme)

{'Capability',
 'Contract',
 'HSE',
 'Integrity',
 'Quality',
 'Sabah & Sarawak',
 'Technology',
 'Technology ',
 'Technology  '}

In [40]:
# Combine text columns for better topic modeling
df['combined_text'] = df['issues'] + ' ' + df['identify_cause']

# Replace any NaN values with an empty string
df['combined_text'] = df['combined_text'].fillna('')

# Vectorization (Converting text data to numerical form)
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df['combined_text'])

# Fit LDA Model (Assuming we are trying to find 10 topics)
lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(X)

# Get topics
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"Topic {idx}:")
        print([words[i] for i in topic.argsort()[-top_n:]])

# Print the topics
print_topics(lda, vectorizer)

# Get topic distribution for each document
topic_values = lda.transform(X)

# Add predicted topics to DataFrame
df['Predicted Topic'] = topic_values.argmax(axis=1)

# Display the DataFrame with predicted topics
df

Topic 0:
['clients', 'company', 'long', 'lack', 'cost', 'manpower', 'personnel', 'approval', 'process', 'work']
Topic 1:
['vessel', 'financial', 'high', 'project', 'exe', 'petronas', 'bank', 'guarantee', 'bg', 'cost']
Topic 2:
['immediately', 'inspector', 'work', 'pacs', 'issues', 'local', 'company', 'manpower', 'companies', 'petronas']
Topic 3:
['vdr', 'project', 'award', 'petronas', 'main', 'payment', 'price', 'contract', 'sub', 'contractor']
Topic 4:
['permit', 'operations', 'difficulties', 'new', 'contractors', 'work', 'contractor', 'project', 'late', 'payment']
Topic 5:
['different', 'contractor', 'cost', 'petronas', 'payment', 'lack', 'local', 'contract', 'technology', 'project']
Topic 6:
['personnel', 'lack', 'time', 'based', 'requirement', 'project', 'vessels', 'cost', 'non', 'contractor']
Topic 7:
['fund', 'technologies', 'new', 'price', 'tender', 'contractor', 'cost', 'industry', 'high', 'risk']
Topic 8:
['crew', 'training', 'activities', 'generation', 'ptw', 'high', 'petrona

Unnamed: 0,No,group,Topic,Category,issues,identify_cause,initial_theme,combined_text,Predicted Topic
0,1,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Revision of CSP25 (Safe House Habitat) by Petr...,- Requirements for Habitat Technicians and Sup...,Capability,Revision of CSP25 (Safe House Habitat) by Petr...,0
1,1,MOSVA,Others,Marine & Logistics,Marine Crew Work Permit to get more support an...,Local Authorities implementation,Sabah & Sarawak,Marine Crew Work Permit to get more support an...,0
2,1,PAC,Capability and Technology,Others (e.g. more than one categories),Low availability of marine vessels in the mark...,High demand for marine vessels due to High lev...,Contract,Low availability of marine vessels in the mark...,8
3,2,PAC,Capability and Technology,Others (e.g. more than one categories),Shortage of local skilled manpower (10-15 year...,Potentially due to reducing work opportunities...,Capability,Shortage of local skilled manpower (10-15 year...,2
4,3,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Petronas - SHO (Green Book) required all the t...,- Different standards practices across differe...,HSE,Petronas - SHO (Green Book) required all the t...,5
...,...,...,...,...,...,...,...,...,...
201,139,MOGSC,Others,Engineering,Milestone is not parallel to cashflow,,Contract,,0
202,141,MOGSC,Others,Engineering,Verification of basic engineering is too vague,Petronas GTS do basic engineering wrongly,Contract,Verification of basic engineering is too vague...,4
203,142,MOGSC,Others,Technology & Digitalisation,Preference to Foreign inventions/inventors,Lack of trust to local inventions,Technology,Preference to Foreign inventions/inventors La...,5
204,1,MTEM,,,Bundling/ Unbundling contracting strategy\n,,Contract,,0


In [41]:
# Manually map the topics to the actual theme names
theme_mapping = {
    0: 'Manpower Approval and Work Process Challenges',
    1: 'Vessel Costs and Financial Guarantees',
    2: 'Local Manpower and Immediate Issues with PACs',
    3: 'Project Awards and Contract Payment Issues',
    4: 'Work Permits and Operational Delays',
    5: 'Contractor Cost and Technology Adoption',
    6: 'Personnel Shortage and Vessel Requirements',
    7: 'New Technology, High Costs, and Industry Risks',
    8: 'Crew Training and Safety in High-Activity Projects',
    9: 'High Software Costs and Technology Implementation'
}

# Map the predicted topics to the corresponding theme names
df['Predicted Subtheme'] = df['Predicted Topic'].map(theme_mapping)

df.head()


Unnamed: 0,No,group,Topic,Category,issues,identify_cause,initial_theme,combined_text,Predicted Topic,Predicted Subtheme
0,1,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Revision of CSP25 (Safe House Habitat) by Petr...,- Requirements for Habitat Technicians and Sup...,Capability,Revision of CSP25 (Safe House Habitat) by Petr...,0,Manpower Approval and Work Process Challenges
1,1,MOSVA,Others,Marine & Logistics,Marine Crew Work Permit to get more support an...,Local Authorities implementation,Sabah & Sarawak,Marine Crew Work Permit to get more support an...,0,Manpower Approval and Work Process Challenges
2,1,PAC,Capability and Technology,Others (e.g. more than one categories),Low availability of marine vessels in the mark...,High demand for marine vessels due to High lev...,Contract,Low availability of marine vessels in the mark...,8,Crew Training and Safety in High-Activity Proj...
3,2,PAC,Capability and Technology,Others (e.g. more than one categories),Shortage of local skilled manpower (10-15 year...,Potentially due to reducing work opportunities...,Capability,Shortage of local skilled manpower (10-15 year...,2,Local Manpower and Immediate Issues with PACs
4,3,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Petronas - SHO (Green Book) required all the t...,- Different standards practices across differe...,HSE,Petronas - SHO (Green Book) required all the t...,5,Contractor Cost and Technology Adoption


In [42]:
# Manually map the topics to the new theme names
theme_mapping2 = {
    0: 'Manpower',
    1: 'Contract',
    2: 'Manpower',
    3: 'Contract',
    4: 'HSE',
    5: 'Technology',
    6: 'Manpower',
    7: 'Technology',
    8: 'HSE',
    9: 'Technology'
}

# Map the predicted topics to the corresponding theme names
df['Predicted Theme'] = df['Predicted Topic'].map(theme_mapping2)

df.head()


Unnamed: 0,No,group,Topic,Category,issues,identify_cause,initial_theme,combined_text,Predicted Topic,Predicted Subtheme,Predicted Theme
0,1,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Revision of CSP25 (Safe House Habitat) by Petr...,- Requirements for Habitat Technicians and Sup...,Capability,Revision of CSP25 (Safe House Habitat) by Petr...,0,Manpower Approval and Work Process Challenges,Manpower
1,1,MOSVA,Others,Marine & Logistics,Marine Crew Work Permit to get more support an...,Local Authorities implementation,Sabah & Sarawak,Marine Crew Work Permit to get more support an...,0,Manpower Approval and Work Process Challenges,Manpower
2,1,PAC,Capability and Technology,Others (e.g. more than one categories),Low availability of marine vessels in the mark...,High demand for marine vessels due to High lev...,Contract,Low availability of marine vessels in the mark...,8,Crew Training and Safety in High-Activity Proj...,HSE
3,2,PAC,Capability and Technology,Others (e.g. more than one categories),Shortage of local skilled manpower (10-15 year...,Potentially due to reducing work opportunities...,Capability,Shortage of local skilled manpower (10-15 year...,2,Local Manpower and Immediate Issues with PACs,Manpower
4,3,MOGSC,HSE and Quality,HUCSU & Offshore Maintenance,Petronas - SHO (Green Book) required all the t...,- Different standards practices across differe...,HSE,Petronas - SHO (Green Book) required all the t...,5,Contractor Cost and Technology Adoption,Technology


Topic 0: Manpower Approval and Work Process Challenges

Topic 1: Vessel Costs and Financial Guarantees

Topic 2: Local Manpower and Immediate Issues with PACs

Topic 3: Project Awards and Contract Payment Issues

Topic 4: Work Permits and Operational Delays

Topic 5: Contractor Cost and Technology Adoption

Topic 6: Personnel Shortage and Vessel Requirements

Topic 7: New Technology, High Costs, and Industry Risks

Topic 8: Crew Training and Safety in High-Activity Projects

Topic 9: High Software Costs and Technology Implementation

Topic 0: Manpower

Topic 1: Contract

Topic 2: Manpower

Topic 3: Contract

Topic 4: HSE

Topic 5: Technology

Topic 6: Manpower

Topic 7: Technology

Topic 8: HSE

Topic 9: Technology

### Topic Modeling (LDA): from gensim 

In [45]:
!pip install gensim nltk




[notice] A new release of pip is available: 24.1.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [49]:
import pandas as pd
import gensim
from gensim import corpora
from gensim.models import LdaModel
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ahmadnajmi.ariffin\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ahmadnajmi.ariffin\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping corpora\stopwords.zip.


In [50]:
def preprocess(text):
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    return [word for word in tokens if word.isalnum() and word not in stop_words]  # Remove stop words and non-alphanumeric tokens

df['processed_text'] = df['combined_text'].apply(preprocess)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\ahmadnajmi.ariffin/nltk_data'
    - 'c:\\Users\\ahmadnajmi.ariffin\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\ahmadnajmi.ariffin\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\ahmadnajmi.ariffin\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\ahmadnajmi.ariffin\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - 'C:/GIT-ONASstg/topic_modelling/nltk_data'
    - 'C:/GIT-ONASstg/topic_modelling/nltk_data'
    - 'C:/Users/ahmadnajmi.ariffin/AppData/Roaming/nltk_data'
    - 'C:/Users/ahmadnajmi.ariffin/AppData/Roaming/nltk_data'
    - 'C:/Users/ahmadnajmi.ariffin/AppData/Roaming/nltk_data'
    - 'C:/Users/ahmadnajmi.ariffin/AppData/Roaming/nltk_data'
    - 'C:/Users/ahmadnajmi.ariffin/AppData/Roaming/nltk_data'
**********************************************************************


In [31]:
# Create a dictionary and corpus for Gensim
dictionary = corpora.Dictionary(df['processed_text'])
corpus = [dictionary.doc2bow(text) for text in df['processed_text']]

# Fit LDA Model (assuming we want 10 topics)
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# Print the topics
topics = lda_model.print_topics(num_words=10)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

# Assign the most likely topic to each document
def get_document_topics(lda_model, corpus):
    topics = []
    for doc_bow in corpus:
        topic_probs = lda_model.get_document_topics(doc_bow)
        top_topic = max(topic_probs, key=lambda x: x[1])[0]  # Get the most probable topic
        topics.append(top_topic)
    return topics

# Add predicted topics to the DataFrame
df['Predicted Topic'] = get_document_topics(lda_model, corpus)

# Display the DataFrame with predicted topics
print(df)


KeyError: 'processed_text'