In [2]:
import pandas as pd
from collections import defaultdict
from itertools import combinations

In [3]:


# Read the main CSV file into a DataFrame.
# The commented-out lines select alternative input files.
df = pd.read_csv('CleanedDataCobb.csv')
#df = pd.read_csv("CleanedDataPickens.csv")
#df = pd.read_csv("CleanedDataCherokee.csv")

# Within label 11, keep only the first row of every (Name, Label) pair
# (`duplicated` flags every repeat after the first); then drop label 20 entirely.
is_repeat_in_label_11 = (df['Label'] == 11) & df.duplicated(subset=['Name', 'Label'])
df = df[~is_repeat_in_label_11]
df = df[df['Label'] != 20]

# Read the list of top 10000 most common English words.
# The encoding is explicit so the result does not depend on the platform default.
with open('google-10000-english-usa.txt', 'r', encoding='utf-8') as f:
    all_common_english_words = set(f.read().splitlines())

# root word -> set of distinct categories (labels) it appears under
all_root_word_frequency = defaultdict(set)
# root word -> total number of times it was generated across all names
all_root_word_count = defaultdict(int)

# Tokenize each name and register every contiguous run of tokens
# (single words and multi-word phrases) under the row's category.
for name, category in zip(df['Name'], df['Label']):
    if not isinstance(name, str):
        # A missing name is read by pandas as a float NaN and cannot be
        # tokenized; skip it instead of crashing on `.split()`.
        continue
    tokens = name.split()

    # combinations(range(n + 1), 2) yields every (start, stop) with
    # start < stop in the same order as the equivalent nested loops.
    for start, stop in combinations(range(len(tokens) + 1), 2):
        root_word = ' '.join(tokens[start:stop])
        all_root_word_frequency[root_word].add(category)
        all_root_word_count[root_word] += 1

# Order by (number of categories, text length), both descending.
sorted_all_root_words = sorted(
    all_root_word_frequency.items(),
    key=lambda item: (len(item[1]), len(item[0])),
    reverse=True,
)

# Discard common English words (compared in lower case) and one-character roots.
filtered_root_words = [
    (root_word, categories)
    for root_word, categories in sorted_all_root_words
    if root_word.lower() not in all_common_english_words and len(root_word) > 1
]
filtered_result = [
    (root_word, len(categories), all_root_word_count[root_word])
    for root_word, categories in filtered_root_words
]

# Create DataFrame
filtered_df = pd.DataFrame(
    filtered_result,
    columns=['Root Word', 'Number of Categories', 'Number of Occurrences'],
)

# Keep roots longer than two characters that contain neither '&' nor '-'.
# `.copy()` makes the result an independent frame so later column
# assignments do not raise SettingWithCopyWarning.
is_long_enough = filtered_df['Root Word'].str.len() > 2
has_symbol = filtered_df['Root Word'].str.contains('[&-]')
filtered_df = filtered_df[is_long_enough & ~has_symbol].copy()

In [4]:
# Refresh 'Number of Occurrences' from the counter. `assign` returns a new
# frame, so nothing is written into a filtered slice.
filtered_df = filtered_df.assign(**{
    'Number of Occurrences': filtered_df['Root Word'].map(lambda word: all_root_word_count[word]),
})

# Filter out rows with 'Number of Categories' < 3
filtered_df = filtered_df[filtered_df['Number of Categories'] >= 3]

# Sort by the length of 'Root Word' (longest first) without a helper column.
filtered_df = (
    filtered_df
    .sort_values('Root Word', key=lambda words: words.str.len(), ascending=False)
    .reset_index(drop=True)
)

# For every token, record the largest category count among the *other*
# root words that contain it as a whole token. This single pass replaces a
# nested row-by-row scan of the frame.
max_categories_of_containing_root = {}
for containing_root in filtered_df['Root Word']:
    containing_categories = len(all_root_word_frequency[containing_root])
    for token in set(containing_root.split()):
        if token == containing_root:
            continue  # a root never subsumes itself
        if containing_categories > max_categories_of_containing_root.get(token, 0):
            max_categories_of_containing_root[token] = containing_categories

# A one-word root is dropped when some longer root containing it covers at
# least 80% as many categories; multi-word roots are always kept.
rows_to_keep = []
for root, category_count, occurrence_count in zip(
    filtered_df['Root Word'],
    filtered_df['Number of Categories'],
    filtered_df['Number of Occurrences'],
):
    if ' ' not in root and root in max_categories_of_containing_root:
        own_categories = len(all_root_word_frequency[root])
        # The ratio is monotonic in the numerator, so testing the maximum is
        # the same as testing whether any containing root passes.
        if max_categories_of_containing_root[root] / own_categories >= 0.8:
            continue
    rows_to_keep.append((root, category_count, occurrence_count))

# Create the final DataFrame from the rows to keep
final_filtered_df = pd.DataFrame(
    rows_to_keep,
    columns=['Root Word', 'Number of Categories', 'Number of Occurrences'],
)

# Sort by 'Number of Categories' (descending) and 'Root Word' (ascending)
final_filtered_df = final_filtered_df.sort_values(
    by=['Number of Categories', 'Root Word'],
    ascending=[False, True],
)

# Additional filtering based on character conditions:
# longer than two characters and containing neither '&' nor '-'.
is_long_enough = final_filtered_df['Root Word'].str.len() > 2
has_symbol = final_filtered_df['Root Word'].str.contains('[&-]')
final_filtered_df = final_filtered_df[is_long_enough & ~has_symbol].reset_index(drop=True)

In [5]:
# Rank the rows on 'Number of Categories' and 'Number of Occurrences'
# (rank 1 = largest value, ties share the lowest rank) and multiply the two
# ranks into 'Score'. A lower score means a root that ranks high on both.
category_rank = final_filtered_df['Number of Categories'].rank(method='min', ascending=False)
occurrence_rank = final_filtered_df['Number of Occurrences'].rank(method='min', ascending=False)
final_filtered_df['Score'] = category_rank * occurrence_rank

# 10% rule: a root is removed when it contains a one-word root (as a whole
# token) and occurs less than 10% as often as that word.
# The lookup table replaces a nested row-by-row scan of the frame.
single_word_occurrences = {
    root: occurrences
    for root, occurrences in zip(
        final_filtered_df['Root Word'], final_filtered_df['Number of Occurrences']
    )
    if ' ' not in root
}

rows_to_remove = set()
for root, occurrences in zip(
    final_filtered_df['Root Word'], final_filtered_df['Number of Occurrences']
):
    for token in set(root.split()):
        if token == root:
            continue  # a root is never compared with itself
        token_occurrences = single_word_occurrences.get(token)
        if token_occurrences is not None and occurrences / token_occurrences < 0.10:
            rows_to_remove.add(root)

# Remove the rows that violate the 10% rule
final_filtered_df = final_filtered_df[
    ~final_filtered_df['Root Word'].isin(rows_to_remove)
].reset_index(drop=True)

# List of prepositions to filter out
prepositions = ['of', 'at', 'in', 'by', 'for', 'with', 'on', 'to', 'about', 'against']

# Remove rows whose FIRST WORD is a preposition. The comparison is on the
# whole first token, in lower case: a plain character-prefix test would also
# discard unrelated roots such as "Atlanta" ('at') or "Information" ('in').
first_words = final_filtered_df['Root Word'].str.split().str[0].str.lower()
final_filtered_df = final_filtered_df[~first_words.isin(prepositions)]

# Remove rows whose root word is exactly three characters long
final_filtered_df = final_filtered_df[final_filtered_df['Root Word'].str.len() != 3]

# Reset index after filtering
final_filtered_df = final_filtered_df.reset_index(drop=True)


In [6]:
# Function to calculate the points based on category
def calculate_points(categories):
    """Return the total point value of an iterable of category labels.

    Categories 10, 6 and 1 are worth 10 points each, category 9 is worth
    1 point, and every other category is worth 5 points. An empty iterable
    scores 0.
    """
    special_weights = {10: 10, 6: 10, 1: 10, 9: 1}
    default_weight = 5
    return sum(special_weights.get(label, default_weight) for label in categories)

# 'Score2' is the point total of the set of categories recorded for each root word.
def _score2_for(root_word):
    """Look up the root word's category set and total its points."""
    return calculate_points(all_root_word_frequency[root_word])

final_filtered_df['Score2'] = final_filtered_df['Root Word'].map(_score2_for)



In [7]:
# Persist the final table, then preview it. A notebook cell only renders its
# LAST expression, so the preview must come after the write; `head()` keeps
# the rendered output short.
final_filtered_df.to_csv("FinalFiltered.csv",index=False)
final_filtered_df.head()

In [8]:
from sklearn.cluster import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Generate synthetic data: 1000 points drawn around 5 centers.
# The seed is fixed so the printed score and probabilities are reproducible
# across kernel restarts.
data, _ = make_blobs(1000, centers=5, random_state=42)

# Fit HDBSCAN model
clusterer = HDBSCAN(min_cluster_size=5)
clusterer.fit(data)

# Get labels and probabilities
labels = clusterer.labels_
probabilities = clusterer.probabilities_

# Calculate silhouette score over all points and labels.
# NOTE(review): HDBSCAN marks noise points with the label -1; they are passed
# to silhouette_score here like any other label - confirm that is intended.
sil_score = silhouette_score(data, labels)
print(f"Silhouette Score: {sil_score}")

# You can also inspect the `probabilities` to gauge the "strength" of each data point's membership in its cluster
print(f"Membership Probabilities: {probabilities}")

Silhouette Score: 0.7263389155788527
Membership Probabilities: [1.         0.64260321 0.76954623 1.         1.         1.
 1.         0.70084081 1.         1.         0.79887914 1.
 0.63596115 1.         1.         0.94389778 0.97631853 1.
 1.         1.         1.         0.68966558 1.         1.
 1.         0.82955837 0.978473   0.82031846 1.         1.
 0.70209717 1.         0.35454362 1.         1.         0.87024052
 0.96845807 1.         1.         0.68672219 0.49173484 1.
 1.         1.         1.         1.         0.94130312 1.
 1.         0.5218708  0.74622779 1.         1.         1.
 0.19336747 1.         1.         1.         1.         0.5786924
 1.         0.39593388 0.77889301 0.35408904 0.44359407 1.
 1.         1.         1.         1.         1.         1.
 0.85592881 1.         1.         1.         1.         0.61005371
 1.         0.50054132 1.         1.         1.         1.
 1.         1.         1.         0.67474651 1.         0.90345169
 0.69211433 1.       