In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


In [2]:
# Load the preprocessed English and German datasets (English first, then German).
df_En, df_De = map(
    pd.read_csv,
    ("preprocessed_data_en.csv", "preprocessed_data_de.csv"),
)

In [3]:
# Build one TF-IDF vectorizer per language.
# NOTE: `input` must be one of 'filename', 'file' or 'content'; it is NOT a
# path to the data. The documents are passed directly to fit_transform below,
# so the default (input='content') is what we want. Passing a file name here
# is rejected by scikit-learn versions that validate constructor parameters.
vectorizer_En = TfidfVectorizer(smooth_idf=True)
vectorizer_De = TfidfVectorizer(smooth_idf=True)

# Replace missing texts with empty strings so the vectorizer only sees str values.
df_En["text_lemmatization"] = df_En["text_lemmatization"].fillna("")
df_De["text_lemmatization"] = df_De["text_lemmatization"].fillna("")

# Learn the vocabulary / IDF weights and build the document-term matrices.
X_En = vectorizer_En.fit_transform(df_En["text_lemmatization"])
X_De = vectorizer_De.fit_transform(df_De["text_lemmatization"])


print(X_En.shape)  # check shape of the document-term matrix: (documents, terms)
print(X_De.shape)

(2406, 3281)
(3877, 5244)


# English data

In [4]:
# Reduce the English TF-IDF matrix to a small number of topics with truncated SVD.
n_components_En = 7  # number of topics to extract
svd_model_En = TruncatedSVD(
    n_components=n_components_En,
    algorithm='randomized',
    n_iter=100,
    random_state=1,  # fixed seed so the randomized solver is reproducible
)
# Fit on the document-term matrix and project every document onto the topics.
X_topics_En = svd_model_En.fit_transform(X_En)
X_topics_En

array([[ 0.15142065, -0.05307252, -0.06952288, ...,  0.06067309,
        -0.07069957,  0.0393933 ],
       [ 0.23148472, -0.15643483,  0.07502752, ...,  0.1571974 ,
        -0.05514851,  0.03897916],
       [ 0.18632449, -0.13014447, -0.06111916, ...,  0.06289895,
        -0.09235059,  0.07498092],
       ...,
       [ 0.19483022, -0.1974531 ,  0.01709883, ...,  0.09640559,
         0.02588503, -0.02635655],
       [ 0.09834024, -0.19830244, -0.08459287, ..., -0.07056232,
        -0.01687574, -0.02094841],
       [ 0.13316825, -0.03313369, -0.06094939, ...,  0.02055697,
        -0.01967723,  0.03530532]])

In [5]:
X_topics_En.shape  # shape is (N, T): N ticket samples (rows) by T topics (columns)

(2406, 7)

In [6]:
# Select 6 keywords of each topic.
# The vocabulary is the same for every component, so look it up once.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer_En, "get_feature_names_out"):
    terms_En = vectorizer_En.get_feature_names_out()
else:
    terms_En = vectorizer_En.get_feature_names()

for i, comp in enumerate(svd_model_En.components_):
    # Pair each term with its weight in this component and keep the 6 largest.
    terms_comp_En = zip(terms_En, comp)
    sorted_terms_En = sorted(terms_comp_En, key=lambda x: x[1], reverse=True)[:6]
    print("Topic %d:" % i)

    for t in sorted_terms_En:
        print(t[0], end=' | ')
    print()
    print()

Topic 0:
client | reset | gbi | user | hana | password | 

Topic 1:
client | date | contract | team | gbi | provision | 

Topic 2:
reset | master | password | client | account | many | 

Topic 3:
key | generate | user | i78 | mandant | developer | 

Topic 4:
password | master | account | hana | user | date | 

Topic 5:
gbi | version | upgrade | case | study | master | 

Topic 6:
saprouter | connection | server | client | router | message | 



# German data

In [7]:
# Reduce the German TF-IDF matrix to a small number of topics with truncated SVD.
n_components_De = 7  # number of topics to extract
svd_model_De = TruncatedSVD(
    n_components=n_components_De,
    algorithm='randomized',
    n_iter=100,
    random_state=1,  # fixed seed so the randomized solver is reproducible
)
# fit_transform already fits the model, so a separate fit() call beforehand
# would only repeat the same 100-iteration factorisation and discard it.
X_topics_De = svd_model_De.fit_transform(X_De)
X_topics_De

array([[ 0.07221025, -0.06295314,  0.02717152, ...,  0.08408536,
        -0.10850394,  0.11897484],
       [ 0.09043899, -0.04350017,  0.04044593, ...,  0.01676354,
        -0.01232909, -0.002909  ],
       [ 0.10037089, -0.06373563, -0.02441942, ...,  0.00706635,
        -0.02496967, -0.03147897],
       ...,
       [ 0.10175984, -0.06961609,  0.00137862, ...,  0.02309904,
        -0.03254945, -0.05368077],
       [ 0.16525203, -0.013086  , -0.1211014 , ..., -0.02681615,
        -0.04429496, -0.00196059],
       [ 0.10720338, -0.05100082,  0.08356271, ..., -0.00820548,
         0.03215703,  0.04041778]])

In [8]:
X_topics_De.shape  # shape is (N, T): N ticket samples (rows) by T topics (columns)

(3877, 7)

In [9]:
# Select 6 keywords of each topic.
# The vocabulary is the same for every component, so look it up once.
# get_feature_names() was removed from newer scikit-learn releases in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(vectorizer_De, "get_feature_names_out"):
    terms_De = vectorizer_De.get_feature_names_out()
else:
    terms_De = vectorizer_De.get_feature_names()

for i, comp in enumerate(svd_model_De.components_):
    # Pair each term with its weight in this component and keep the 6 largest.
    terms_comp_De = zip(terms_De, comp)
    sorted_terms_De = sorted(terms_comp_De, key=lambda x: x[1], reverse=True)[:6]
    print("Topic %d:" % i)

    for t in sorted_terms_De:
        print(t[0], end=' | ')
    print()
    print()

Topic 0:
mandanten | user | zurücksetzen | gbi | mandant | team | 

Topic 1:
zurücksetzen | mandanten | mandant | i20 | hiermit | bitten | 

Topic 2:
niklas | grüsse | gut | können | genannt | bestätigung | 

Topic 3:
user | niklas | grüsse | passwort | gut | können | 

Topic 4:
mandantenrücksetzung | schulung | sperren | mandant | i45 | i16 | 

Topic 5:
mandantenrücksetzung | schulung | gbi | user | sperren | i16 | 

Topic 6:
gbi | hana | vg | zurücksetzen | i81 | können | 

