In [302]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Load the raw survey export into `df`.
# NOTE(review): 'Your_file_name.csv' looks like a placeholder path - confirm before running.
df = pd.read_csv(filepath_or_buffer='Your_file_name.csv')
# Print the first five rows as a quick sanity check of what was loaded.
print(df.head(n=5))

In [None]:
# Keep only the non-missing 2020 responses, write them to disk without the row
# index, then reload that file so df_2020 is a one-column frame with a fresh index.
responses_2020 = df['Response-2020'].dropna()
responses_2020.to_csv('cleaned_response-2020.csv', index=False)
df_2020 = pd.read_csv('cleaned_response-2020.csv')
print(df_2020.head())

In [None]:
# Keep only the non-missing 2021 responses, write them to disk without the row
# index, then reload that file so df_2021 is a one-column frame with a fresh index.
responses_2021 = df['Response-2021'].dropna()
responses_2021.to_csv('cleaned_response-2021.csv', index=False)
df_2021 = pd.read_csv('cleaned_response-2021.csv')
print(df_2021.head())

In [306]:
def drop_numbers(list_text):
    """Return the input with every digit-bearing element removed, joined together.

    When ``list_text`` is a string (as in this notebook) it is walked character
    by character, so the result is the same string with all digit characters
    stripped. For any other iterable of strings, the elements that contain a
    digit are dropped and the remaining ones are concatenated with no separator.

    Args:
        list_text: A string, or an iterable of strings.

    Returns:
        str: The concatenation of the elements that contain no digit.
    """
    # Raw string on purpose: '\d' inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    has_digit = re.compile(r'\d').search
    return ''.join(piece for piece in list_text if not has_digit(piece))

# Create the working 'cleaned_data' column of each year's frame from its raw
# response column with the digits stripped out.
for frame, source_column in ((df_2020, 'Response-2020'), (df_2021, 'Response-2021')):
    frame['cleaned_data'] = frame[source_column].apply(drop_numbers)

In [307]:
def lower_case(text):
    """Tokenise ``text`` and return its lower-cased tokens joined by single spaces."""
    return ' '.join(token.lower() for token in word_tokenize(text))

# Lower-case the working text of both years in place.
for frame in (df_2020, df_2021):
    frame['cleaned_data'] = frame['cleaned_data'].apply(lower_case)

In [None]:
import nltk
import ssl

# Workaround for machines where certificate verification fails while fetching
# NLTK data: make the default HTTPS context an unverified one.
# NOTE(review): this switches off certificate checking for standard-library
# HTTPS requests in this kernel, not only for the NLTK download - confirm that
# is acceptable in your environment.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # This ssl build has no unverified-context factory; keep the default context.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download every NLTK resource the notebook uses, not just WordNet:
#   word_tokenize -> punkt / punkt_tab
#   nltk.pos_tag  -> averaged_perceptron_tagger / averaged_perceptron_tagger_eng
#   stopwords     -> stopwords
#   lemmatizer    -> wordnet (+ omw-1.4, which some releases load alongside it)
# Both the classic and the newer package names are requested because which one
# is looked up depends on the installed NLTK release.
# NOTE(review): word_tokenize is already called in an earlier cell, so on a
# fresh machine this cell has to be run before that one.
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(resource)

In [309]:
from nltk.corpus import wordnet

def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet part-of-speech constant for lemmatize().

    Only the first letter of ``tag`` (case-insensitive) is inspected:
    J -> adjective, V -> verb, R -> adverb; N and anything else -> noun.
    """
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun.
    return wordnet.NOUN

# One shared lemmatizer instance, reused for every row.
lemmatizer = WordNetLemmatizer()

def lemmatise(text):
    """Lemmatise every token of ``text`` using its POS tag and re-join with spaces."""
    tagged_tokens = nltk.pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        # The tag is mapped to a WordNet POS so the lemmatizer picks the right base form.
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return ' '.join(lemmas)

# Lemmatise the working text of both years in place.
for frame in (df_2020, df_2021):
    frame['cleaned_data'] = frame['cleaned_data'].apply(lemmatise)

In [310]:
# Extra tokens filtered out on top of NLTK's English stopword list.
_ADDITIONAL_STOPWORDS = frozenset({'inaudible', "n't", 'u', 'oh', 's', 'r', 'uh', 'd', 'thing', 'think', 'still', 'much', 'anything', 'whatever','ve', 'sometimes', 'something'})

# Combined stopword set; filled on first use so the corpus is read only once.
_custom_stopwords_cache = None


def _get_custom_stopwords():
    """Return the English + additional stopword set, building it on the first call."""
    global _custom_stopwords_cache
    if _custom_stopwords_cache is None:
        _custom_stopwords_cache = frozenset(stopwords.words('english')) | _ADDITIONAL_STOPWORDS
    return _custom_stopwords_cache


def remove_stopword(text):
    """Remove stopwords from ``text``.

    The text is tokenised, every token found in NLTK's English stopword list or
    in the additional list above is dropped, and the remaining tokens are joined
    with single spaces.

    Args:
        text (str): Text to filter.

    Returns:
        str: The surviving tokens separated by single spaces.
    """
    # The set is the same for every row, so it is built once and reused
    # instead of being rebuilt on each call.
    custom_stopwords = _get_custom_stopwords()
    tokens = [word for word in word_tokenize(text) if word not in custom_stopwords]
    return ' '.join(tokens)

# Strip stopwords from the working text of both years in place.
for frame in (df_2020, df_2021):
    frame['cleaned_data'] = frame['cleaned_data'].apply(remove_stopword)

In [320]:
# Strips the standalone filler word 'um' in any length ('um', 'umm', 'ummm', ...)
def remove_umm(text):
    """Return ``text`` with every whole-word 'um' + extra 'm's deleted.

    Only the matched word is removed; surrounding whitespace is left as it is.
    """
    filler = re.compile(r'\bum{1,}\b')  # 'u' followed by one or more 'm', as a whole word
    return filler.sub('', text)

# Remove the 'um'-style fillers from the working text of both years in place.
for frame in (df_2020, df_2021):
    frame['cleaned_data'] = frame['cleaned_data'].apply(remove_umm)

In [312]:
# Saving the modified dataframes to CSV files.
# index=False matches how these same files are first written earlier in the
# notebook and keeps the row index from being stored as an extra unnamed column.
df_2020.to_csv('cleaned_response-2020.csv', index=False)
df_2021.to_csv('cleaned_response-2021.csv', index=False)

In [313]:
# Stack the 2020 and 2021 'cleaned_data' columns into one single-column frame.
col1 = df_2020['cleaned_data']
col2 = df_2021['cleaned_data']
# ignore_index=True renumbers the rows 0..n-1; without it each year keeps its own
# 0-based labels and the combined index contains duplicate labels.
combined_column = pd.concat([col1, col2], ignore_index=True)
combined_df = pd.DataFrame(combined_column) # Creating a dataframe from the combined series


In [None]:
# Display the combined frame to inspect the stacked 'cleaned_data' rows.
combined_df

In [315]:
# TF-IDF vectoriser limited to mid-frequency terms: a term is kept only if it
# appears in at least 20% (min_df) and at most 50% (max_df) of the documents.
vectorizer = TfidfVectorizer(
    min_df=0.2,
    max_df=0.5,
)

In [316]:
# Fit the vectoriser on the combined responses and build the document-term
# TF-IDF matrix in a single step.
documents = combined_df['cleaned_data']
tfidf_matrix = vectorizer.fit_transform(documents)

In [None]:
# Display the vocabulary terms kept by the fitted vectorizer (the TF-IDF feature names)
vectorizer.get_feature_names_out()

In [318]:
# Rescale each row (one document vector) of the TF-IDF matrix to unit L2 length.
normalized_tfidf = normalize(tfidf_matrix, norm='l2', axis=1)

In [None]:
# Number of clusters requested from KMeans.
# NOTE(review): with a value of 1 every response ends up in cluster 0.
num_clusters = 1

# Fit KMeans on the normalised TF-IDF rows (fixed seed so runs are repeatable)
# and keep the label assigned to each row.
kmeans = KMeans(
    n_clusters=num_clusters,
    random_state=42,
)
clusters = kmeans.fit_predict(normalized_tfidf)

# Store every row's cluster label next to its text.
combined_df['cluster'] = clusters

In [None]:
# Draw one word cloud per cluster from that cluster's concatenated text
for i in range(num_clusters):
    # Join all cleaned responses labelled with cluster i into a single string
    in_cluster = combined_df['cluster'] == i
    cluster_text = ' '.join(combined_df.loc[in_cluster, 'cleaned_data'])

    # Configure the cloud first, then build it from the cluster text
    wordcloud = WordCloud(background_color='white', width=800, height=400)
    wordcloud.generate(cluster_text)

    # Render the cloud as an image with the axes hidden
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for Cluster {i}')
    plt.show()
