In [93]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import WordPunctTokenizer 
import csv
from gensim import corpora
import os
import numpy as np  
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import KMeans 
from sklearn import cluster

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brittanysteenbergen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brittanysteenbergen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/brittanysteenbergen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [94]:
tk = WordPunctTokenizer() 

def tokenizing_nltk(doc):
    doc_tokenized = tk.tokenize(doc)
    
    wordnet_lemmatizer = WordNetLemmatizer()
    nltk_lemmaList = []
    for word in doc_tokenized:
        nltk_lemmaList.append(wordnet_lemmatizer.lemmatize(word))

    filtered_sentence = []  
    nltk_stop_words = set(stopwords.words("english"))
    my_stop_words = {"I", "4", "ha", "c", ")", ").", "|", "3", "-", "(","–", "e", "249"}
    
    for w in nltk_lemmaList:  
        if w not in nltk_stop_words and w not in my_stop_words:  
            filtered_sentence.append(w)

    punctuations="?:!.,;-()|/"
    for word in filtered_sentence:
        if word in punctuations:
            filtered_sentence.remove(word)

    return filtered_sentence

In [95]:
def get_top_features_cluster(tf_idf_array, prediction, n_feats):
    labels = np.unique(prediction)
    dfs = pd.DataFrame()
    for label in labels:
        id_temp = np.where(prediction==label) # indices for each cluster
        x_means = np.mean(tf_idf_array[id_temp], axis = 0) # returns average score across cluster
        sorted_means = np.argsort(x_means)[::-1][:n_feats] # indices with top 20 scores
        features = vectorizer.get_feature_names_out()
        best_features = [(features[i], x_means[i]) for i in sorted_means]
        df = pd.DataFrame(best_features, columns = ['features', 'score'])
        dfs = pd.concat([df, dfs])
    return dfs

In [96]:
vectorizer = TfidfVectorizer(stop_words='english') 

def k_means(file):
    survey = open(file, 'r')
    lines = survey.readlines()
    questions = [line.lower() for line in lines]
    questions = [tokenizing_nltk(doc) for doc in questions] 
    documents = [' '.join(tokens) for tokens in questions]

#    vectorizer = TfidfVectorizer(stop_words='english') 
    vectorized_documents = vectorizer.fit_transform(documents) 

    final_df = pd.DataFrame(data = vectorized_documents.toarray(), columns=vectorizer.get_feature_names_out())

    kmeans = cluster.KMeans(n_clusters = 9
                       , init = 'k-means++'
                       , n_init = 10
                       , tol = 0.0001
                       , random_state = 1)

    kmeans.fit(vectorized_documents) 

    results = pd.DataFrame() 
    results['document'] = lines
    results['cluster'] = kmeans.labels_ 

    final_df_array = final_df.to_numpy()
    prediction = kmeans.predict(final_df)
    n_feats = 3
    dfs = get_top_features_cluster(final_df_array, prediction, n_feats)
    dfs.to_csv("top_words_kmeans_" + file.rsplit('.', maxsplit=1)[0] + ".csv")
    return dfs

In [97]:
for file in os.listdir():
    if file.endswith(".txt"):
        print('File: ', file)
        print('Topics: ', k_means(file))



File:  mathematics.txt
Topics:         features     score
0    department  0.469863
1    suggestion  0.410567
2      relation  0.265439
0    graduation  0.304971
1           lmu  0.262803
2          plan  0.249314
0       comment  1.000000
1       writing  0.000000
2      decision  0.000000
0       ability  0.418798
1    confidence  0.418798
2   mathematics  0.381115
0      division  0.384394
1          math  0.329164
2        course  0.279689
0      advising  0.540543
1        career  0.200416
2          feel  0.164599
0       student  0.296206
1    department  0.216054
2  relationship  0.190688
0          unit  0.621517
1        course  0.430851
2        versus  0.295190
0    experience  0.443739
1       thought  0.391985
2      division  0.271734
File:  hhsc.txt
Topics:       features     score
0     provide  1.000000
1         yes  0.000000
2    employer  0.000000
0      course  0.462325
1        wish  0.299076
2       taken  0.299076
0       major  0.359580
1          wa  0.248556



Topics:          features     score
0            lmu  0.315919
1        offered  0.158033
2          major  0.138607
0  institutional  0.318516
1  communication  0.318516
2        setting  0.318516
0        current  0.622632
1     profession  0.373200
2         status  0.325122
0        society  0.483504
1   organization  0.483504
2         member  0.483504
0           rate  0.182309
1        overall  0.182309
2       received  0.170618
0          begin  0.470189
1        college  0.470189
2         attend  0.289346
0        science  0.382185
1          human  0.250325
2         health  0.223218
0       valuable  0.396728
1           ntls  0.323881
2           hhsc  0.277452
0         course  0.323262
1       division  0.286301
2      knowledge  0.201970
File:  elec_eng.txt
Topics:           features     score
0          sexual  0.538084
1     orientation  0.418703
2          people  0.209218
0       equitable  0.430930
1         support  0.378483
2            peer  0.088785
0      der



Topics:        features     score
0         wish  0.464320
1      improve  0.464320
2  differently  0.464320
0     obstacle  0.509455
1  significant  0.509455
2      success  0.509455
0       physic  0.408755
1         know  0.372585
2       modern  0.372585
0       physic  0.602794
1     prepared  0.486961
2  effectively  0.442622
0       method  0.383893
1        model  0.383893
2  calculation  0.383893
0       physic  0.562686
1      applied  0.313447
2      program  0.281343
0       option  0.368064
1       school  0.339589
2     graduate  0.339589
0       physic  0.099676
1  requirement  0.077576
2       result  0.069812
0      faculty  0.539603
1     research  0.539603
2  opportunity  0.478231
File:  envir_sci.txt
Topics:        features     score
0         work  0.515819
1     position  0.515819
2     complete  0.515819
0       answer  0.465886
1    following  0.465886
2       regard  0.465886
0      project  0.577350
1        brief  0.577350
2  description  0.577350
0      facu



Topics:          features     score
0           year  0.775898
1          class  0.630859
2      improving  0.000000
0         affect  0.416106
1           mark  0.416106
2          apply  0.416106
0  encouragement  0.392361
1    experienced  0.339593
2           home  0.290401
0            lmu  0.097389
1      generally  0.074665
2            gpa  0.066667
0          group  0.399262
1      community  0.399262
2            lab  0.345566
0       identity  0.652621
1         aspect  0.386018
2           feel  0.351230
0      harassing  0.369805
1       behavior  0.369805
2    experienced  0.320071
0      computing  0.436192
1       pursuing  0.436192
2         career  0.398196
0           post  0.383673
1           sent  0.383673
2         social  0.383673
File:  mech_eng.txt
Topics:            features     score
0         societal  0.325470
1        recognize  0.325470
2          ethical  0.325470
0       conclusion  0.303538
1  experimentation  0.303538
2             data  0.303538
0  

