In [1]:
from warnings import filterwarnings
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# English stop-word list from NLTK; consulted later by the stop-word removal step.
stop_words = stopwords.words('english')

# Silence every warning for the remainder of the session.
filterwarnings(action='ignore')

Read Data and Discover

In [2]:
# Ticket dump read by this notebook; the path is relative to the working directory.
TICKETS_CSV = "all_tickets_processed_improved_v3.csv"

# Load every ticket into the shared frame and preview the first rows.
df = pd.read_csv(TICKETS_CSV)
df.head()

Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous


Remove Stopwords Using nltk Library

In [3]:
def remove_stopwords(text, stopword_set=None):
    """Lower-case ``text`` and drop every whitespace-separated token that is a stop word.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of lower-case words to drop. When
            omitted, the notebook-level ``stop_words`` list is used.

    Returns:
        The surviving tokens joined by single spaces (``''`` when none survive).
    """
    if stopword_set is None:
        # `stop_words` is a plain list; hashing it once per call turns each
        # token lookup from a linear scan into a constant-time membership test.
        stopword_set = frozenset(stop_words)
    return ' '.join(token for token in text.lower().split() if token not in stopword_set)

# Strip stop words from every ticket, overwriting the column with the cleaned text.
df['Document'] = df['Document'].map(remove_stopwords)


Creating a Word Cloud to Observe the Most Used Words

In [4]:
def create_cloud(data_frame):
    """Render a word cloud built from all text in ``data_frame["Document"]``.

    Args:
        data_frame: Frame exposing a string ``"Document"`` column.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    # One lower-cased blob holding every document, separated by single spaces.
    corpus = data_frame["Document"].str.cat(sep=" ").lower()

    cloud = WordCloud(max_font_size=40).generate(corpus)

    # Draw the cloud on a square canvas with the axes hidden.
    _, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis("off")
    ax.set_title("Most Frequent Words")
    plt.show()

#create_cloud(df)


Adding a new function to get rid of the specific words that we encounter most often in the dataset

In [5]:
# Extra words filtered out of every document by remove_words().
words_to_remove = [
    "ga", "kind", "hello", "please", "let", "help", "best", "regards", "icon",
    "dear", "per", "hi", "thanks", "thank", "importance", "high", "issue",
    "ab", "abc",
]


def remove_words(text):
    """Lower-case ``text`` and drop every token listed in ``words_to_remove``.

    Returns the remaining tokens joined by single spaces.
    """
    kept = []
    for token in text.lower().split():
        if token not in words_to_remove:
            kept.append(token)
    return " ".join(kept)

# Drop the extra filler words from every ticket, overwriting the column.
df["Document"] = df["Document"].map(remove_words)
#create_cloud(df)


To see an example of a ["Document"] entry we create a new function, because the original text is too long to print in full

In [6]:
def see_example_row(index: int, max_words: int = 100):
    """Print a truncated preview of one ticket followed by its topic label.

    Args:
        index: Positional (``iloc``) row number in the notebook-level ``df``.
        max_words: Maximum number of whitespace-separated words of the
            document to print (non-negative). Defaults to 100, the cap this
            helper originally hard-coded.
    """
    document = df["Document"].iloc[index]
    # Only the leading words are shown so long tickets do not flood the output.
    preview = " ".join(document.split()[:max_words])
    print(preview)
    print(df["Topic_group"].iloc[index])


see_example_row(1)

work experience user work experience user work experience student coming next name much appreciate duration
Access


In [7]:
# Fit one TF-IDF vectorizer on the whole cleaned "Document" column.
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(raw_documents=df["Document"])

In [8]:
# Corpus-wide TF-IDF matrix: one row per ticket, produced by the fitted `vectorizer`.
tfidf_matrix = vectorizer.transform(raw_documents=df["Document"])

In [9]:
# Bucket the cleaned documents by topic label. Topics appear in first-seen
# order and each bucket keeps its documents in row order.
grouped_documents = {}
for topic, doc in zip(df["Topic_group"], df["Document"]):
    grouped_documents.setdefault(topic, []).append(doc)
# The bare expression below echoes the full mapping as the cell output.
grouped_documents

{'Hardware': ['connection setup engineers details needed lead',
  'requesting meeting requesting meeting follow equipments cable pc cord plug',
  'prod servers tunneling prod tunneling va la tunneling la host si la si host host cards port name bytes bytes seq bytes seq statistics packets transmitted received packet loss avg bytes bytes seq bytes seq tuesday pm acre tunneling si pm acre tunneling extended object host extended host extended object host range extended host range administrator sector pm acre tunneling va la si la tunneling design lead ext friday pm tunneling va extended object host extended host administrator sector pm tunneling va la tunneling users pinging bytes bytes bytes bytes bytes design lead ext con care pot partial strict si pot ale care va la contains proprietary information legally privileged unauthorized dissemination prohibited intended recipient views addressing transmission error misdirected notify author replying intended recipient must disclose distribute 

In [15]:
# Analyze TF-IDF for each topic group
for topic, group_documents in grouped_documents.items():
    print(f"\n** Topic Group: {topic} **")
    print(group_documents[0])

    # Fit a fresh vectorizer on this group's documents only, so the vocabulary
    # and IDF weights reflect the topic rather than the whole corpus.
    group_vectorizer = TfidfVectorizer(stop_words='english')
    tfid_matrix = group_vectorizer.fit_transform(group_documents)
    feature_names = group_vectorizer.get_feature_names_out()

    # Total TF-IDF weight of every term across the group's documents.
    # This must use the per-group `tfid_matrix`: the corpus-wide `tfidf_matrix`
    # has a different vocabulary, so its columns do not line up with
    # `feature_names`.
    # A sparse `.sum(axis=0)` is 2-D with shape (1, n_features); flatten it so
    # that zip() pairs exactly one score with each term.
    overall_tfidf_sum = np.asarray(tfid_matrix.sum(axis=0)).ravel()

    # Ten highest-scoring (term, score) pairs for this topic.
    overall_top_features = sorted(
        zip(feature_names, overall_tfidf_sum),
        key=lambda pair: pair[1],
        reverse=True,
    )[:10]
    # print(overall_top_features)

    # print(pd.DataFrame(tfid_matrix.toarray(), columns=feature_names))


** Topic Group: Hardware **
connection setup engineers details needed lead

** Topic Group: Access **
work experience user work experience user work experience student coming next name much appreciate duration

** Topic Group: Miscellaneous **

** Topic Group: HR Support **
access request modules report report cost much

** Topic Group: Purchase **
system movement left available device device denmark copenhagen denmark source quotation shipping lead

** Topic Group: Administrative rights **
notification wireless devices upgrade cr medium wireless devices upgrade cr medium announce users window wireless senior engineer cr medium summary software upgrade release approved wait completed starting ref msg

** Topic Group: Storage **
mailbox almost full mailbox almost mailbox almost senior infrastructure engineer infrastructure upcoming holiday none id

** Topic Group: Internal Project **
opportunity pas known pipeline opportunity known pipeline import bellow opportunity known pipeline oppo