In [None]:
!pip install keybert
!pip install -U sentence-transformers

In [None]:
# List the working directory; presumably a check that the Excel files
# read in the cells below are present — remove once no longer needed.
!ls

In [None]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.util import ngrams
from typing import List
from collections import Counter

In [None]:
# Global seaborn appearance used by every figure in this notebook:
# the "whitegrid" style and the "talk" scaling context.
sns.set_style(style="whitegrid")
sns.set_context(context="talk")

In [None]:
# Load the workbook named Data_Dictionary.xlsx from the working directory
# (presumably column descriptions for the posting exports — confirm).
data_dict = pd.read_excel(io='Data_Dictionary.xlsx')

In [None]:
# data_dict

In [None]:
# Postings export VW_2019.xlsx; tagged as Year 2019 further below.
pre_covid = pd.read_excel(io='VW_2019.xlsx')

In [None]:
# Postings export VW_2022.xlsx; tagged as Year 2022 further below.
pos_covid = pd.read_excel(io='VW_2022.xlsx')

In [55]:
# Tag every row with the year of the export it came from so the two frames
# can be told apart after they are concatenated.
# A scalar is broadcast to all rows by pandas; no per-row list is needed.
pre_covid['Year'] = 2019
pos_covid['Year'] = 2022

In [None]:
# Stack the 2019 and 2022 postings into one frame.
# ignore_index=True already yields a fresh 0..n-1 index; the previous
# trailing .reset_index() only added a redundant 'index' column.
data_df = pd.concat([pre_covid, pos_covid], ignore_index=True)

In [None]:
fig = plt.figure(figsize=(9,13))
sns.countplot(data=data_df,
              y='OrgName',
              hue='Year',
              orient='h',
              order=data_df['OrgName'].value_counts().index[:25],
              palette='hls',
              lw=2,
              edgecolor='0.4',
             )
plt.xlim(0,26)
plt.title("Number of Volunteer Postings in 2019 and 2022")
plt.xlabel("Number of Postings")
plt.ylabel('Organization Name')
sns.despine(offset=15, trim=True)
plt.legend(loc=5)
# plt.xticks(rotation = 90);

In [None]:
fig = plt.figure(figsize=(9,13))
sns.countplot(data=data_df,
              y='OrgName',
              hue='Year',
              orient='h',
              order=data_df['OrgName'].value_counts().index[25:50],
              palette='hls',
              lw=2,
              edgecolor='0.4',
             )
# plt.ylim(0,26)
plt.xlim(0,26)
plt.title("Number of Volunteer Postings in 2019 and 2022")
plt.xlabel("Number of Postings")
plt.ylabel('Organization Name')
sns.despine(offset=5, trim=True)
plt.legend(loc=5)
# plt.xticks(rotation = 90);

In [None]:
fig = plt.figure(figsize=(9,13))
sns.countplot(data=data_df,
              y='OrgName',
              hue='Year',
              orient='h',
              order=data_df['OrgName'].value_counts().index[50:75],
              palette='hls',
              lw=2,
              edgecolor='0.4',
             )
plt.xlim(0,26)
plt.title("Number of Volunteer Postings in 2019 and 2022")
plt.xlabel("Number of Postings")
plt.ylabel('Organization Name')
sns.despine(offset=5, trim=True)
plt.legend(loc=5)
# plt.xticks(rotation = 90);

In [None]:
fig = plt.figure(figsize=(9,13))
sns.countplot(data=data_df,
              y='OrgName',
              hue='Year',
              orient='h',
              order=data_df['OrgName'].value_counts().index[75:100],
              palette='hls',
              lw=2,
              edgecolor='0.4',
             )
plt.xlim(0,26)
plt.title("Number of Volunteer Postings in 2019 and 2022")
plt.xlabel("Number of Postings")
plt.ylabel('Organization Name')
sns.despine(offset=5, trim=True)
plt.legend(loc=5)
# plt.xticks(rotation = 90);

In [78]:
# SentenceTransformer instances that have already been loaded, keyed by model
# name, so repeated create_embed() calls (find_theme makes two) reuse the
# model instead of loading it again.
_EMBED_MODELS = {}


def create_embed(docs: List[str], model_name: str = "bert-base-uncased") -> List[List]:

  """
  Create embeddings for a list of texts (keyphrases, theme phrases, ...).

  INPUT:
  docs: list of text, one entry per embedding to produce
  model_name: name handed to SentenceTransformer; defaults to the model
              this notebook has always used

  OUTPUT:
  Whatever model.encode(docs) returns: one embedding per entry of docs,
  in input order
  """

  if model_name not in _EMBED_MODELS:
    _EMBED_MODELS[model_name] = SentenceTransformer(model_name)

  return _EMBED_MODELS[model_name].encode(docs)


In [79]:
def extract_keyphrases(docs) -> List[List]:

  """
  Extract keyphrases of one or two words from each document with KeyBERT.

  INPUT:
  docs: iterable of document texts

  OUTPUT:
  List with one entry per document (input order); each entry is the list
  of keyphrases KeyBERT returned for that document
  """

  extractor = KeyBERT()

  keyphrases = []
  for text in docs:
      # stop_words is left at KeyBERT's default (an override was commented
      # out in an earlier version of this function).
      scored = extractor.extract_keywords(text, keyphrase_ngram_range=(1, 2))
      # Keep only element 0 of every returned entry (presumably the phrase
      # of a (phrase, score) pair — confirm against KeyBERT).
      keyphrases.append([entry[0] for entry in scored])

  return keyphrases

In [80]:
def find_theme(documents: List[str], theme: List[str]) -> tuple:

  """
  function to take in a list of documents,
  extract keywords (phrases)
  and calculate their similarity to a given theme

  INPUT:
  documents: list of text
  theme: list of text

  OUTPUT:
  Tuple (sim, kw_comb, kw_count):
  sim: cosine similarities, one row per document and one column per
       theme phrase
  kw_comb: flat list of every extracted keyphrase, in document order
  kw_count: (keyphrase, count) pairs, most frequent first
  """

  doc_kw = extract_keyphrases(documents)

  # create_embed() takes one string per row, but doc_kw holds a LIST of
  # keyphrases per document. Join each document's keyphrases into a single
  # text so that every keyphrase contributes to that document's embedding.
  # NOTE(review): passing the nested list straight through appears to make
  # sentence-transformers treat each inner list as a text pair (only the
  # first two keyphrases are used, and a shorter list raises IndexError) —
  # confirm against the installed version.
  doc_kw_text = [" ".join(phrases) for phrases in doc_kw]
  kw_embed = create_embed(doc_kw_text)

  theme_embed = create_embed(theme)

  sim = cosine_similarity(kw_embed, theme_embed)

  kw_comb = [phrase for phrases in doc_kw for phrase in phrases]

  kw_count = Counter(kw_comb).most_common()

  return sim, kw_comb, kw_count

In [95]:
# Organization and year whose postings are analysed in the cells below.
# Alternative organization names kept for quick switching:
#   'Canadian Cancer Society'
#   'Kitchener Public Library'
org_name, year = 'House of Friendship', 2019

In [None]:
# Position descriptions of the selected organization's postings in the
# selected year; the shape shows how many postings matched.
data_slice = data_df.loc[
    (data_df['OrgName'] == org_name) & (data_df['Year'] == year),
    'Position_Description',
]
print(data_slice.shape)

In [97]:
# Documents fed to the theme analysis: the position descriptions for the
# selected organization and year.
docs = data_df.loc[
    (data_df['OrgName'] == org_name) & (data_df['Year'] == year),
    'Position_Description',
]

In [98]:
# Phrases that make up the "community" theme; each one becomes a column of
# the similarity matrix returned by find_theme.
community_theme = [
    'community',
    'community care',
    'community engagement',
    'community mobilization',
]

In [None]:
# Similarity matrix, flat keyphrase list and keyphrase counts for the
# selected postings against the community theme.
# NOTE(review): the name `s22` suggests 2022, but `docs` was filtered with
# year = 2019 above — consider a year-neutral name.
s22, kw, kw_count = find_theme(docs, community_theme)

In [None]:
# (keyphrase, count) pairs for the selected postings, most frequent first.
kw_count

In [None]:
# Heatmap of the similarity matrix: one row per posting in `docs`, one
# column per phrase in `community_theme`. Labelled so the figure can be
# read without the surrounding code.
fig, ax = plt.subplots(figsize=(11, 8))
sns.heatmap(s22,
            cmap='Blues',
            linewidths=2,
            annot=True,
            xticklabels=community_theme,
            ax=ax)
ax.set_title(f"Similarity to the community theme: {org_name} ({year})")
ax.set_xlabel("Theme phrase")
ax.set_ylabel("Posting (position in docs)");

In [None]:
# Full text of the first selected posting (row 0 of the heatmap).
docs.iloc[0]

In [None]:
# Full text of the fourth selected posting (row 3 of the heatmap).
# NOTE(review): raises IndexError when fewer than four postings match
# org_name / year.
docs.iloc[3]