In [1]:
import pandas as pd
import nltk

# Download the tokenizer data used by nltk.word_tokenize.
# 'punkt' is what older NLTK releases load; recent releases look up
# 'punkt_tab' instead, so both are requested to keep a fresh-kernel run working.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the original Excel file into a DataFrame
df_original = pd.read_excel("Cluster_Def.xlsx")

# Load the CSV file containing the 'Description' column
df_description = pd.read_csv("filtered_combined_data.csv")

# Column present in both files and used as the merge key
common_column = 'Make-Model'

# Left-merge the descriptions onto the original rows: every row of
# df_original is kept, and rows without a matching key get NaN in 'Description'.
# NOTE(review): if df_description holds several rows per key, the left merge
# repeats the matching df_original row once per description -- confirm the key
# is unique in filtered_combined_data.csv.
df_merged = pd.merge(
    df_original,
    df_description[[common_column, 'Description']],
    on=common_column,
    how='left',
)

# Words to be removed (matched against lower-cased tokens)
words_to_remove = ['good','offers','it', 'its', 'but', 'now','mpg','an',"1","3","6","features"]

# Function to remove specific words from a description
def remove_specific_words(description):
    """Return ``description`` without the tokens listed in ``words_to_remove``.

    The text is split with ``nltk.word_tokenize``; a token is dropped when its
    lower-cased form appears in the module-level ``words_to_remove`` list. The
    surviving tokens are re-joined with single spaces, so the original spacing
    is not preserved.

    Parameters
    ----------
    description : str or missing value
        One description. Anything that is not a string (for example the NaN a
        left merge produces for unmatched rows, or None) and any blank string
        is treated as "no description".

    Returns
    -------
    str
        The filtered description, or ``''`` when there was nothing to process.
    """
    # word_tokenize only accepts strings; a NaN float from an unmatched merge
    # row would otherwise raise inside the tokenizer.
    if not isinstance(description, str) or not description.strip():
        return ''
    words = nltk.word_tokenize(description)
    return ' '.join(word for word in words if word.lower() not in words_to_remove)

# Apply the function to the "Description" column.
# The merge is a left join, so rows without a matching description hold NaN;
# those are normalised to empty strings first because the tokenizer inside
# remove_specific_words only accepts str input.
df_merged['Description'] = (
    df_merged['Description']
    .fillna('')
    .astype(str)
    .apply(remove_specific_words)
)

# Display the modified DataFrame
print(df_merged)

# Write the modified DataFrame to a new CSV file
df_merged.to_csv("modified_data.csv", index=False)

[nltk_data] Downloading package punkt to /home/cformanek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                            Make-Model Engine Type  MSRP Price  \
0                        2024 Audi SQ5        3.0L       58195   
1                  2024 Audi Q8 e‑tron    Electric       75595   
2                         2024 Audi A3        2.0L       36495   
3                         2024 Audi S3        2.0L       48095   
4                       2024 Audi RS 3        2.5L       62795   
..                                 ...         ...         ...   
270  2024 Volkswagen Atlas Cross Sport        2.0L       38065   
271              2024 Volkswagen Jetta        2.0L       22585   
272             2023 Volkswagen Arteon        2.0L       44305   
273             2024 Volkswagen Tiguan        2.0L       29855   
274              2024 Volkswagen Atlas        2.0L       39075   

     Starting Market Average  City MPG  Highway MPG  # of seats  Trim Levels  \
0                      58045        19           24           5            3   
1                      71650         0         

In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import string

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
# Recent NLTK releases load 'punkt_tab' rather than 'punkt' in word_tokenize,
# so request it as well to keep a fresh-kernel run working.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the CSV file
df = pd.read_csv('modified_data.csv')

# Function to preprocess and tokenize text
def process_text(text):
    """Lower-case, de-punctuate and tokenize ``text``, dropping English stopwords.

    Parameters
    ----------
    text : str or missing value
        Raw text. Non-string input (such as the NaN pandas yields for an empty
        CSV cell, or None) and blank strings produce an empty token list.

    Returns
    -------
    list of str
        Tokens in their original order, with NLTK's English stopwords removed.
    """
    # .lower() does not exist on a float NaN, so bail out before touching it.
    if not isinstance(text, str) or not text.strip():
        return []
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation. This happens before tokenizing, so characters inside
    # a word are deleted rather than split on: '3.0l' becomes '30l' and
    # '5-seater' becomes '5seater'.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    return [token for token in tokens if token not in stop_words]

# How many of the most frequent words to report per cluster
TOP_N_WORDS = 15

# Create a dictionary to store word frequencies for each cluster
cluster_word_frequencies = {}

# Iterate through clusters. sort=False keeps the clusters in order of first
# appearance in the file; rows whose 'Cluster' is missing are skipped by groupby.
for cluster, cluster_df in df.groupby('Cluster', sort=False):
    # Empty descriptions are read back from CSV as NaN (a float), which
    # str.join rejects, so drop them before combining the text.
    descriptions = cluster_df['Description'].dropna().astype(str)

    # Combine all descriptions for the current cluster into a single text
    cluster_text = ' '.join(descriptions)

    # Tokenize, preprocess and count the words for this cluster
    cluster_word_frequencies[cluster] = FreqDist(process_text(cluster_text))

# Print the most common words for each cluster
for cluster, freq_dist in cluster_word_frequencies.items():
    print(f"\nCluster {cluster}:")
    for word, frequency in freq_dist.most_common(TOP_N_WORDS):
        print(f"{word}: {frequency} times")



Cluster C1:
turbo: 49 times
5seater: 33 times
luxury: 22 times
30l: 17 times
performance: 16 times
v8: 16 times
i6: 16 times
suv: 15 times
rear: 15 times
interior: 14 times
i4: 14 times
7seater: 13 times
hybrid: 12 times
20l: 12 times
handling: 11 times

Cluster C3:
v6: 39 times
5seater: 31 times
rear: 25 times
four: 25 times
v8: 22 times
4wd: 21 times
19: 15 times
35l: 15 times
36l: 14 times
7seater: 13 times
electric: 12 times
rwd: 12 times
cabin: 12 times
cargo: 12 times
20: 12 times

Cluster C4:
turbo: 55 times
5seater: 53 times
i4: 47 times
20l: 42 times
luxury: 25 times
premium: 17 times
performance: 15 times
23: 14 times
suv: 14 times
interior: 13 times
2: 11 times
sedan: 11 times
22: 11 times
awd: 11 times
styling: 10 times

Cluster C2:
5seater: 86 times
i4: 51 times
front: 45 times
electric: 39 times
mpge: 28 times
interior: 25 times
range: 23 times
suv: 21 times
25l: 21 times
awd: 18 times
turbo: 18 times
20l: 18 times
hybrid: 18 times
compact: 17 times
fwd: 17 times


[nltk_data] Downloading package punkt to /home/cformanek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cformanek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
