In [7]:
import pandas as pd
import numpy as np
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import seaborn as sns
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt

In [8]:
df = pd.read_csv('cleaned_campaign_description_data.csv')
df.head()

Unnamed: 0,charity_name,campaign_name,campaign_description
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“The monthly FoodAid worth $250 from the Singa...
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,"Superman, Batman, Spider-Man – every superhero..."
2,Singapore Red Cross Society,Box It Forward with DHL,"DHL, Ten Square, Singapore Red Cross and Mini ..."
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,Singapore Red Cross is organising Grateful Hea...
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,"Join us to eat, play, shop for a good cause!Th..."


In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download necessary resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


# Function to clean the 'campaign_description' text
def clean_description(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(str(text))
    
    # Remove punctuation and stopwords, then apply lemmatization
    cleaned_tokens = [
        lemmatizer.lemmatize(word.lower()) for word in tokens 
        if word.lower() not in stop_words and word not in string.punctuation
    ]
    
    # Join tokens back to a single cleaned string
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

# Apply the cleaning function to the 'campaign_description' column
df['Cleaned_Campaign_Description'] = df['campaign_description'].apply(clean_description)

# Select the required columns for the output
cleaned_df = df[['charity_name','campaign_name', 'Cleaned_Campaign_Description']]

# Save the cleaned data to a new CSV file
output_path = 'cleaned_deedaSG_campaign_details.csv'  
cleaned_df.to_csv(output_path, index=False)

print("File saved at:", output_path)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


File saved at: cleaned_deedaSG_campaign_details.csv


In [9]:
df1 = pd.read_csv('cleaned_deedaSG_campaign_details.csv')
df1.head()

Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...


In [9]:
#TF-IDF (Term Frequency-Inverse Document Frequency) to extract keywords

from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_df=0.8, max_features=1000, stop_words='english', use_idf=True)

# Fit and transform the 'Cleaned_AboutCampaign' column
tfidf_matrix = tfidf.fit_transform(df['Cleaned_Campaign_Description'])

# Extract feature names (i.e., words)
feature_names = tfidf.get_feature_names_out()

# Convert TF-IDF matrix to a DataFrame for better readability
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display top keywords for each campaign description
top_n = 5  # Number of top keywords to extract for each campaign

def get_top_keywords(row, features, n=top_n):
    # Get top n indices for TF-IDF values
    indices = row.argsort()[-n:][::-1]
    # Get feature names for the top indices
    keywords = [features[i] for i in indices]
    return keywords

# Apply the function to each row in the TF-IDF matrix and add as a new column
df['Top_Keywords'] = tfidf_df.apply(lambda row: get_top_keywords(row, feature_names), axis=1)

# Display the DataFrame with the top keywords
display_columns = ['charity_name','campaign_name', 'Cleaned_Campaign_Description', 'Top_Keywords']
df[display_columns].head(10)  # Display the first 10 rows for a quick view

Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description,Top_Keywords
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...,"[safri, foodaid, family, problem, food]"
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...,"[rescue, operation, mission, action, equipment]"
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...,"[mdm, ang, 150, cross, red]"
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...,"[humanity, grateful, 2024, vulnerable, heart]"
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...,"[src, tan, mdm, international, redcross]"
5,Singapore Red Cross Society,Be Our Champion for Humanity,singapore red cross mark 75th year serving hum...,"[mdm, ang, volunteer, src, cross]"
6,Singapore Red Cross Society,Championing Humanity with Project Red Blood Ce...,project red blood cell rbc student-initiated y...,"[gift, red, compassion, project, life]"
7,Singapore Red Cross Society,Sustainable Christmas Market 2024 by Peace of Art,sustainable christmas market 2024 • 16-17 nov ...,"[child, underprivileged, sustainable, program,..."
8,Singapore Red Cross Society,The Sustainable Christmas Market 2023 by Peace...,sustainable christmas market back another run ...,"[christmas, market, sustainable, heart, young]"
9,Singapore Red Cross Society,70 Years of Serving Humanity - Singapore Red C...,dear friend past 70 year singapore red cross s...,"[red, cross, src, singapore, disabled]"


In [11]:
#RAKE (Rapid Automatic Keyword Extraction) to extract key phrases

from rake_nltk import Rake

# Initialize RAKE with NLTK's stopwords for English
rake = Rake()

# Function to extract keywords using RAKE
def extract_keywords_rake(text):
    rake.extract_keywords_from_text(text)
    # Get the highest-ranked phrases as a list
    keywords = rake.get_ranked_phrases()
    return keywords[:5]  # Get top 5 phrases

# Apply RAKE to the 'Cleaned_AboutCampaign' column
df['Top_Keywords_RAKE'] = df['Cleaned_Campaign_Description'].apply(extract_keywords_rake)

# Display the DataFrame with the RAKE keywords
display_columns = ['charity_name','campaign_name', 'Cleaned_Campaign_Description', 'Top_Keywords_RAKE']
df[display_columns].head(10)  # Display the first 10 rows for a quick view

Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description,Top_Keywords_RAKE
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...,[25 gift family four foodaid voucher buy food ...
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...,[coordinated rescue operationmedical doctor re...
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...,[mdm linda lum community befriender elderaid p...
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...,[singapore red cross organising grateful heart...
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...,[join u eat play shop good cause singapore red...
5,Singapore Red Cross Society,Be Our Champion for Humanity,singapore red cross mark 75th year serving hum...,[mdm linda lum community befriender elderaid p...
6,Singapore Red Cross Society,Championing Humanity with Project Red Blood Ce...,project red blood cell rbc student-initiated y...,[initiated youth cause project nan hua high sc...
7,Singapore Red Cross Society,Sustainable Christmas Market 2024 by Peace of Art,sustainable christmas market 2024 • 16-17 nov ...,[vibrant culture commitment environmental soci...
8,Singapore Red Cross Society,The Sustainable Christmas Market 2023 by Peace...,sustainable christmas market back another run ...,[proceeds sustainable christmas market provide...
9,Singapore Red Cross Society,70 Years of Serving Humanity - Singapore Red C...,dear friend past 70 year singapore red cross s...,[dear friend past 70 year singapore red cross ...


In [12]:
#YAKE (Yet Another Keyword Extractor) for extracting keywords & keyphrases
import yake

# Initialize YAKE with specific parameters
language = "en"
max_ngram_size = 3  # Limit n-grams up to 3 words
deduplication_threshold = 0.9  # Control phrase overlap (0 to 1)
num_keywords = 5  # Number of keywords to extract per description

yake_extractor = yake.KeywordExtractor(
    lan=language, 
    n=max_ngram_size, 
    dedupLim=deduplication_threshold, 
    top=num_keywords, 
    features=None
)

# Function to extract keywords using YAKE
def extract_keywords_yake(text):
    keywords = yake_extractor.extract_keywords(text)
    # Extract only the keyword phrases
    top_keywords = [keyword for keyword, score in keywords]
    return top_keywords

# Apply YAKE to the 'Cleaned_AboutCampaign' column
df['Top_Keywords_YAKE'] = df['Cleaned_Campaign_Description'].apply(extract_keywords_yake)

# Display the DataFrame with YAKE keywords
display_columns = ['charity_name','campaign_name', 'Cleaned_Campaign_Description', 'Top_Keywords_YAKE']
df[display_columns].head(10)  # Display the first 10 rows for a quick view

Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description,Top_Keywords_YAKE
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...,"[diagnosis bone marrow, bone marrow cancer.the..."
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...,"[command on-site search, on-site search rescue..."
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...,"[singapore red cross, red cross mark, red cros..."
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...,"[grateful heart day, singapore red cross, annu..."
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...,"[make triumphant return, november ngee ann, co..."
5,Singapore Red Cross Society,Be Our Champion for Humanity,singapore red cross mark 75th year serving hum...,"[singapore red cross, red cross mark, red cros..."
6,Singapore Red Cross Society,Championing Humanity with Project Red Blood Ce...,project red blood cell rbc student-initiated y...,"[singapore red cross, anniversary singapore re..."
7,Singapore Red Cross Society,Sustainable Christmas Market 2024 by Peace of Art,sustainable christmas market 2024 • 16-17 nov ...,"[public rental flat, sustainable christmas mar..."
8,Singapore Red Cross Society,The Sustainable Christmas Market 2023 by Peace...,sustainable christmas market back another run ...,"[sustainable christmas market, christmas marke..."
9,Singapore Red Cross Society,70 Years of Serving Humanity - Singapore Red C...,dear friend past 70 year singapore red cross s...,"[singapore red cross, red cross red, dear frie..."


In [13]:
#Topic Modelling: Latent Dirichlet Allocation (LDA)
#Show primary topic for each campaign description

import gensim
from gensim import corpora
from nltk.tokenize import word_tokenize
import nltk


# Download NLTK tokenizer if not already installed
nltk.download('punkt')

# Tokenize and prepare text data for LDA
def preprocess_for_lda(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Filter out tokens that are too short or non-alphabetic
    tokens = [token for token in tokens if token.isalpha() and len(token) > 2]
    return tokens

# Apply tokenization to each campaign description
df['Tokenized_AboutCampaign'] = df['Cleaned_Campaign_Description'].apply(preprocess_for_lda)

# Create a dictionary and a corpus for LDA
dictionary = corpora.Dictionary(df['Tokenized_AboutCampaign'])
corpus = [dictionary.doc2bow(text) for text in df['Tokenized_AboutCampaign']]

# Define the number of topics
num_topics = 5  # Adjust the number of topics as needed

# Build the LDA model
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    update_every=1,
    chunksize=10,
    passes=10,
    alpha='auto'
)

# Display the top words in each topic
for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=5):
    print(f"Topic {idx + 1}: {topic}")

# Assign a dominant topic for each campaign description
def get_dominant_topic(bow):
    topic_probs = lda_model.get_document_topics(bow)
    dominant_topic = max(topic_probs, key=lambda x: x[1])[0]  # Get topic with highest probability
    return dominant_topic

# Add dominant topic to the DataFrame
df['Dominant_Topic'] = [get_dominant_topic(bow) for bow in corpus]

# Display the DataFrame with the dominant topics for each campaign
display_columns = ['charity_name','campaign_name', 'Cleaned_Campaign_Description', 'Dominant_Topic']
df[display_columns].head(10)  # Display the first 10 rows for a quick view

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Regin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Topic 1: 0.031*"support" + 0.020*"life" + 0.017*"family" + 0.017*"help" + 0.016*"syndrome"
Topic 2: 0.016*"dog" + 0.016*"awareness" + 0.015*"kidney" + 0.013*"back" + 0.012*"season"
Topic 3: 0.025*"ticket" + 0.021*"donation" + 0.018*"mystique" + 0.015*"flavour" + 0.014*"singapore"
Topic 4: 0.016*"child" + 0.015*"dsa" + 0.014*"individual" + 0.013*"impact" + 0.012*"skill"
Topic 5: 0.024*"mental" + 0.020*"event" + 0.020*"health" + 0.017*"charity" + 0.010*"donation"


Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description,Dominant_Topic
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...,0
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...,1
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...,0
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...,0
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...,0
5,Singapore Red Cross Society,Be Our Champion for Humanity,singapore red cross mark 75th year serving hum...,0
6,Singapore Red Cross Society,Championing Humanity with Project Red Blood Ce...,project red blood cell rbc student-initiated y...,0
7,Singapore Red Cross Society,Sustainable Christmas Market 2024 by Peace of Art,sustainable christmas market 2024 • 16-17 nov ...,3
8,Singapore Red Cross Society,The Sustainable Christmas Market 2023 by Peace...,sustainable christmas market back another run ...,3
9,Singapore Red Cross Society,70 Years of Serving Humanity - Singapore Red C...,dear friend past 70 year singapore red cross s...,0


In [14]:
# Sentiment Analysis: VADER (Valence Aware Dictionary and sEntiment Reasoner)
# Provides sentiment scores and corresponding labels for each campaign description

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to get sentiment scores
def get_sentiment_scores(text):
    scores = analyzer.polarity_scores(text)
    return scores['compound']  # Compound score gives overall sentiment

# Apply VADER to get sentiment scores for each campaign description
df['Sentiment_Score'] = df['Cleaned_Campaign_Description'].apply(get_sentiment_scores)

# Define sentiment labels based on the compound score
def sentiment_label(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

# Apply sentiment labels to the compound scores
df['Sentiment_Label'] = df['Sentiment_Score'].apply(sentiment_label)

# Display the DataFrame with sentiment scores and labels
display_columns = ['charity_name','campaign_name', 'Cleaned_Campaign_Description', 'Sentiment_Score', 'Sentiment_Label']
df[display_columns].head(10)  # Display the first 10 rows for a quick view

Unnamed: 0,charity_name,campaign_name,Cleaned_Campaign_Description,Sentiment_Score,Sentiment_Label
0,Singapore Red Cross Society,You're The Difference Between An Empty Stomach...,“ monthly foodaid worth 250 singapore red cros...,0.9993,Positive
1,Singapore Red Cross Society,SCDF OPERATION LIONHEART PRESENTS: THE LIFE-SA...,superman batman spider-man – every superhero g...,0.9979,Positive
2,Singapore Red Cross Society,Box It Forward with DHL,dhl ten square singapore red cross mini fanati...,0.9883,Positive
3,Singapore Red Cross Society,Together for Humanity - Grateful Hearts Day 2024,singapore red cross organising grateful heart ...,0.9735,Positive
4,Singapore Red Cross Society,International Bazaar 2023 - One World United i...,join u eat play shop good cause singapore red ...,0.9891,Positive
5,Singapore Red Cross Society,Be Our Champion for Humanity,singapore red cross mark 75th year serving hum...,0.981,Positive
6,Singapore Red Cross Society,Championing Humanity with Project Red Blood Ce...,project red blood cell rbc student-initiated y...,0.9955,Positive
7,Singapore Red Cross Society,Sustainable Christmas Market 2024 by Peace of Art,sustainable christmas market 2024 • 16-17 nov ...,0.9975,Positive
8,Singapore Red Cross Society,The Sustainable Christmas Market 2023 by Peace...,sustainable christmas market back another run ...,0.9628,Positive
9,Singapore Red Cross Society,70 Years of Serving Humanity - Singapore Red C...,dear friend past 70 year singapore red cross s...,0.9831,Positive
