In [103]:
class EventsTagger:
  """Tags free-text event descriptions with keyword, rule-based and topic labels.

  Three sources of tags are combined by `tag`:
    1. Word2Vec similarity between the event's words and a fixed keyword list
       (the model is trained on the text file given to the constructor).
    2. A hand-written table of trigger substrings (`CUSTOM_KEYWORDS`).
    3. Topic scores computed as (vocabulary count vector) . W1, mapped to names
       through `topic_names_dict`.

  Imports are kept function-local, matching the rest of this notebook, so the
  class definition itself only needs the standard library.
  """

  # A keyword is kept only if its best cosine similarity to a word of the event
  # reaches this value.
  KEYWORD_SIMILARITY_THRESHOLD = 0.97
  # At most this many best-scoring keywords are considered per event.
  MAX_KEYWORD_CANDIDATES = 10
  # A topic is kept only if its score is strictly above this value.
  TOPIC_SCORE_THRESHOLD = 0.3

  # tag -> trigger strings. Matching is by substring on the cleaned event text,
  # so e.g. "alum" also fires on "alumni" and "trans" on "transfer".
  # NOTE(review): confirm the substring behaviour is wanted for short triggers.
  CUSTOM_KEYWORDS = {
      'culture': ["aamp", "obsa", "apida", "ideas", "chaplains", "samp", "christian",
                  "first love", "msa", "women", "poc", "black",
                  "mcalister", "jewish", "hillel", "chicanx", "latinx", "qrc",
                  "queer", "lgbt", "bsu", "indigenous", "fli", "trans", "ismp",
                  "oldenborg", "draper", "muslim"],
      'academic': ["stem", "professor", "course", "registration", "registrar",
                   "idpo", "financial aid", "journal", "department", "grad"],
      'protest': ["march", "sjp", "divest", "undercurrents", "cswa", "workers"],
      'wellness': ["title ix", "cares", "lmft"],
      'career': ["cdo", "alum", "pcip", "internship", "prehealth", "prelaw",
                 "resume", "consulting", "handshake"],
      'party': ['walker']
  }

  def __init__(self, path_to_file, w1_path='W1.npy', v_path='V.npy'):
    """Load the training text and the pre-computed topic-model arrays.

    Args:
      path_to_file: path of the UTF-8 text file used to train Word2Vec in `fit`.
      w1_path: path of the .npy array multiplied with the vocabulary count
        vector to score topics (first dimension must match len(V)).
      v_path: path of the .npy array holding the vocabulary terms.
    """
    import numpy as np
    with open(path_to_file, "rb") as f:
      text = f.read().decode("utf-8")
    self.paragraph = text
    self.keywords = ['academic', 'computer science', 'humanities', 'fellowship', 'protest', 'social justice', 'workshop', 'seminar', 'conference', 'symposium', 'panel', 'lecture', 'workshop series', 'career fair', 'networking', 'cultural', 'social', 'club', 'volunteer', 'orientation', 'recruitment', 'sports', 'party', 'wellness']
    self.W1 = np.load(w1_path)
    self.V = np.load(v_path)
    self.m = len(self.V) # n_terms
    self.topic_names_dict = {1: "community events and workshops",
                             2: "social good and fundraising",
                             3: "cultural and artistic events",
                             4: "career development",
                             5: "volunteering",
                             6: "theater and performing arts",
                             7: "recreation and nightlife",
                             8: "pre-professional events",
                             9: "social justice and advocacy",
                             10: "food and snacks",
                             11: "other"}

  # cleaning a single event description
  def clean_desc(self, desc):
    """Return `desc` lower-cased with every run of non-letters collapsed to one space.

    The 'Subject: ' header and the '\\Terminate\\' record delimiter are removed
    first: once the letters-only pass has run they can no longer be recognised
    and would survive as the words "subject" and "terminate".
    A leading or trailing space may remain; callers split on whitespace.
    """
    import re
    text = desc.replace('Subject: ', '').replace('\\Terminate\\', ' ')
    return re.sub('[^a-z]+', ' ', text.lower())

  # cleaning the whole text
  def clean_text(self):
    """Sentence-tokenize the training text and store the cleaned sentences in `self.clean_txt`."""
    import nltk
    nltk.download('punkt')
    from nltk.tokenize import sent_tokenize
    self.clean_txt = [self.clean_desc(sentence) for sentence in sent_tokenize(self.paragraph)]

  # fitting the model
  def fit(self):
    """Train Word2Vec on the cleaned text and embed each keyword.

    Sets `self.word2vec_model` and `self.keyword_embeddings`; a keyword's
    embedding is the mean of the vectors of its tokens that the model knows,
    and keywords with no known token are left out.
    """
    from gensim.models import Word2Vec
    self.clean_text()
    # split() rather than split(" "): the latter yields '' tokens for leading or
    # trailing spaces, which would become a junk vocabulary entry.
    corpus = [sentence.split() for sentence in self.clean_txt]
    self.word2vec_model = Word2Vec(corpus, min_count=1, vector_size = 70)

    # create vector embeddings for keywords
    wv = self.word2vec_model.wv
    self.keyword_embeddings = {}
    for keyword in self.keywords:
      token_vectors = [wv.get_vector(token) for token in keyword.split()
                       if token in wv.key_to_index]
      if token_vectors:
        self.keyword_embeddings[keyword] = sum(token_vectors) / len(token_vectors)

  def _keyword_similarities(self, cleaned_event):
    """Return up to MAX_KEYWORD_CANDIDATES (keyword, score) pairs, best first.

    A keyword's score is the highest cosine similarity between its embedding
    and any word of the event known to the Word2Vec model. Returns [] when no
    word of the event is known.
    """
    wv = self.word2vec_model.wv
    # Look each word up once instead of once per keyword.
    word_vectors = [wv.get_vector(word) for word in cleaned_event.split()
                    if word in wv.key_to_index]
    if not word_vectors:
      return []

    scores = {}
    for keyword, embedding in self.keyword_embeddings.items():
      scores[keyword] = max(max(wv.cosine_similarities(embedding, [vector]))
                            for vector in word_vectors)
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:self.MAX_KEYWORD_CANDIDATES]

  def _vocab_index(self):
    """Return the {term: position} map for `self.V`, rebuilt only when V is replaced."""
    cached = getattr(self, '_vocab_cache', None)
    if cached is None or cached[0] is not self.V:
      cached = (self.V, {word: index for index, word in enumerate(self.V)})
      self._vocab_cache = cached
    return cached[1]

  def _topic_tags(self, cleaned_event, threshold=None):
    """Return names of topics scoring above `threshold`, highest score first.

    The score vector is the event's vocabulary count vector dotted with W1.
    Topic numbers are 1-based positions in that vector; numbers missing from
    `topic_names_dict` are skipped so the result never contains None.
    """
    import numpy as np
    if threshold is None:
      threshold = self.TOPIC_SCORE_THRESHOLD

    word_to_index = self._vocab_index()
    word_counts = {}
    for word in cleaned_event.split():
      if word in word_to_index:
        word_index = word_to_index[word]
        word_counts[word_index] = word_counts.get(word_index, 0) + 1
    document_vector = [word_counts.get(index, 0) for index in range(len(self.V))]

    topic_scores = np.dot(document_vector, self.W1)
    selected = [index for index, score in enumerate(topic_scores) if score > threshold]
    selected.sort(key=lambda index: topic_scores[index], reverse=True)
    names = (self.topic_names_dict.get(index + 1) for index in selected)
    return [name for name in names if name is not None]

  # tags events
  def tag(self, event):
    """Return the list of tags for one raw event description.

    Order is: similarity keywords (best first), then custom-rule tags, then
    topic names. Each tag appears at most once. Requires `fit` to have run.
    """
    cleaned_event = self.clean_desc(event)

    tags = []
    # Candidates are sorted best-first, so stop at the first one below the bar.
    for keyword, score in self._keyword_similarities(cleaned_event):
      if score < self.KEYWORD_SIMILARITY_THRESHOLD:
        break
      tags.append(keyword)

    for custom_tag, triggers in self.CUSTOM_KEYWORDS.items():
      if any(trigger in cleaned_event for trigger in triggers):
        tags.append(custom_tag)

    tags.extend(self._topic_tags(cleaned_event))

    # The same label (e.g. 'academic') can come from more than one source;
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(tags))


  def save_model(self, filename):
    """Pickle the fitted model's pieces to `filename`.

    `fit` is run only if the tagger has not been fitted yet, so saving right
    after `fit()` does not train a second time; call `fit()` explicitly to
    retrain before saving.

    NOTE: 'tag_method' is a bound method, so the pickle embeds this instance
    and can only be loaded where the EventsTagger class is importable. Only
    unpickle files you trust.
    """
    import pickle
    if not (hasattr(self, 'word2vec_model') and hasattr(self, 'keyword_embeddings')):
      self.fit()

    model_data = {
        'V': self.V,
        'W1': self.W1,
        'word2vec_model': self.word2vec_model,
        'topic_names_dict': self.topic_names_dict,
        'keyword_embeddings': self.keyword_embeddings,
        'tag_method': self.tag
    }

    with open(filename, 'wb') as f:
        pickle.dump(model_data, f)















In [104]:
# Build the tagger from the raw scraped text; the constructor also loads
# W1.npy and V.npy from the working directory.
tagger = EventsTagger('chirp_output2.txt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [105]:
# Train Word2Vec on the cleaned text and compute the keyword embeddings.
tagger.fit()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [106]:
# Pickle the vocabulary, W1, the Word2Vec model, topic names and keyword embeddings.
# The pickle also stores the bound `tag` method, so EventsTagger must be
# importable wherever the file is loaded.
tagger.save_model('p-5cevents-tagger3.pkl')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [107]:
# Smoke test: tag one hand-written event description.
tagger.tag("Come talk to the Computer Science professors in the pannel for understanding computer science course registration")

['computer science', 'academic', 'volunteering']

In [108]:
# Read the raw scraped text. It is decoded as UTF-8 explicitly, the same way
# EventsTagger decodes this file, instead of relying on the platform default.
with open('chirp_output2.txt', 'r', encoding='utf-8') as file:
    # Read the entire contents of the file
    text = file.read()

# Put one event on each line: flatten the existing line breaks, then turn the
# '\Terminate\' record delimiter into the only newline and drop the header label.
text = text.replace('\n', ' ')
# Backslashes are escaped explicitly; the bare '\T' form is an invalid escape
# sequence that newer Python versions warn about. The string value is unchanged.
text = text.replace('\\Terminate\\', '\n')
text = text.replace('Subject: ', '')


# Write the one-event-per-line text to a separate file (the raw file is left untouched).
# NOTE: written with the platform default encoding so that the cell that reads
# input.txt back with the default encoding sees the same bytes.
with open('input.txt', 'w') as file:
    # Write the modified text to the file
    file.write(text)

In [110]:
# Tag every event (one per line of input.txt) and write it to output.txt as
# the event text, a "Tags: ..." line, and a blank separator line.
with open('input.txt', 'r') as input_file, open('output.txt', 'w') as output_file:
    for line in input_file:
        line = line.strip()
        tags = tagger.tag(line)
        output_file.writelines([
            f"{line}\n",
            f"Tags: {', '.join(tags)}\n",
            "\n",
        ])