In [None]:
!pip install numpy pandas matplotlib scikit-learn spacy

In [None]:
# Fetch the small English spaCy pipeline that is loaded later via spacy.load("en_core_web_sm").
# NOTE(review): `!python` is resolved through the shell's PATH and may not be the
# kernel's interpreter — confirm the model lands in the environment the kernel uses.
!python -m spacy download en_core_web_sm

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import spacy
import re
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score


In [None]:
# Use interactive (widget) matplotlib figures; this backend is provided by the ipympl package.
%matplotlib widget

In [None]:
# Load the small English spaCy pipeline.
# `count_nouns` is the only code in this notebook that reads `nlp`.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Read usernames from a CSV file with a header.
# The rest of the notebook reads the 'Username' column of this frame.
# NOTE(review): with pandas' default NA parsing, usernames such as "null" or "NA"
# would be read as NaN, which the string-based feature functions cannot handle —
# confirm the dataset has none, or consider keep_default_na=False.
df = pd.read_csv('usernames_dataset.csv')

In [None]:
# Function to count the number of special characters in a username
def count_special_characters(username):
    """Return how many special (punctuation) characters occur in ``username``.

    Every occurrence counts, so a character that appears twice adds two to
    the total.
    """
    punctuation_pattern = r'[!\"#\$%&\'\(\)\*\+,\-\.\/:;<=>\?@\[\\\]\^_`{\|}~]'
    return sum(1 for _ in re.finditer(punctuation_pattern, username))

# Function to get a list of unique special characters in a username
def unique_special_characters(username):
    """Return the distinct special (punctuation) characters found in ``username``.

    Args:
        username: The string to inspect.

    Returns:
        A list with one entry per distinct special character, sorted by code
        point. Sorting makes the result independent of set iteration order,
        so the same username always produces the same list.
    """
    special_characters = re.findall(r'[!\"#\$%&\'\(\)\*\+,\-\.\/:;<=>\?@\[\\\]\^_`{\|}~]', username)
    return sorted(set(special_characters))

# Function to count the number of nouns in a username
def count_nouns(username):
    """Return the number of tokens in ``username`` that spaCy tags as nouns.

    The text is run through the notebook-level ``nlp`` pipeline and every
    token whose part-of-speech tag is "NOUN" is counted.
    """
    parsed = nlp(username)
    return sum(1 for token in parsed if token.pos_ == "NOUN")

In [None]:

# Feature Engineering
# Each feature is derived from the 'Username' string. The per-username list of
# distinct special characters is computed once and reused for the per-character
# indicator columns further down.
df['Length'] = df['Username'].apply(len)
# 1 if the username contains at least one special character, else 0
df['Special Characters'] = df['Username'].apply(lambda username: int(count_special_characters(username) > 0))
df['Number of Special Characters'] = df['Username'].apply(count_special_characters)
df['Unique Special Characters'] = df['Username'].apply(unique_special_characters)
# 1 if the username contains at least one digit, else 0
df['Numbers'] = df['Username'].apply(lambda username: int(bool(re.search(r'\d', username))))
# df['Uppercase Letters'] = df['Username'].apply(lambda username: sum(1 for char in username if char.isupper()))
# Runs of word characters; note that '_' is a word character, so it does not split words
df['Number of Words'] = df['Username'].apply(lambda username: len(re.findall(r'\w+', username)))
# df['Nouns'] = df['Username'].apply(count_nouns)


# Every special character that occurs anywhere in the dataset. Sorted so that the
# indicator columns (and any feature list built from this name) come out in the
# same order on every run instead of following set iteration order.
unique_chars = df['Unique Special Characters']
all_unique_chars = sorted({char for chars in unique_chars for char in chars})

# Create binary columns for unique special characters
for char in all_unique_chars:
    # Membership test against the precomputed list; no need to re-scan the username
    df[char] = unique_chars.apply(lambda chars, char=char: int(char in chars))

In [None]:

# Normalize and scale features
# ('Uppercase Letters' and 'Nouns' are not included: the cells that would create
# those columns are commented out in the feature-engineering step.)
feature_columns = [
    'Length',
    'Special Characters',
    'Numbers',
    'Number of Words',
    'Number of Special Characters',
] + list(all_unique_chars)

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[feature_columns])


In [None]:

# Pairwise cosine similarity between the standardized username feature vectors.
# The result is a square matrix with one row and one column per username.
cosine_sim = cosine_similarity(scaled_features)


In [None]:

# Perform hierarchical clustering
# `linkage` interprets any 2-D input as an observation matrix, so handing it the
# square similarity matrix would cluster the *rows of that matrix* rather than
# the usernames. Turn the similarities into cosine distances and pass them in
# SciPy's condensed form (upper triangle above the diagonal, row-major order).
cosine_dist = np.clip(1.0 - cosine_sim, 0.0, None)  # clip tiny negative round-off
condensed_dist = cosine_dist[np.triu_indices_from(cosine_dist, k=1)]
linkage_matrix = linkage(condensed_dist, method='average')


In [None]:
# Visualize the dendrogram: one leaf per username, merge height on the y-axis
fig, ax = plt.subplots(figsize=(10, 6))
dendrogram(
    linkage_matrix,
    labels=df['Username'].tolist(),
    leaf_rotation=45,
    leaf_font_size=12,
    ax=ax,
)
ax.set_xlabel('Usernames')
ax.set_ylabel('Distance')
ax.set_title('Dendrogram of Usernames')
plt.show()


In [None]:

# Cut the dendrogram to get flat cluster labels (one label per username)
num_clusters = 7  # Change this value based on your analysis
# With criterion='maxclust', `t` is an upper bound on the number of flat clusters,
# so fewer than `num_clusters` distinct labels may come back.
clusters = fcluster(linkage_matrix, t=num_clusters, criterion='maxclust')
# Bare expression: shows the label array as the cell output
clusters


In [None]:
# Calculate the Silhouette Score
# The clustering is built on cosine similarity, so score the result with the
# cosine metric as well (silhouette_score defaults to Euclidean distance).
silhouette_avg = silhouette_score(scaled_features, clusters, metric='cosine')
print(f"Silhouette Score: {silhouette_avg}")

In [None]:

# Add cluster labels to the DataFrame as a 'Cluster' column
# (`clusters` holds one label per username)
df['Cluster'] = clusters


In [None]:
# Train a decision tree classifier to predict clusters
# The tree is used only as an explanatory model: its feature importances show
# which engineered features separate the clusters. A fixed seed keeps the fitted
# tree, and therefore those importances, identical from run to run.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(scaled_features, clusters)  # 'clusters' are the obtained cluster labels


In [None]:
# Get feature importances from the fitted tree.
# One value per column of `scaled_features`, in that column order
# (which is not the order of `df.columns`).
feature_importances = classifier.feature_importances_

In [None]:
# Distinct cluster labels that were actually assigned
np.unique(clusters)

In [None]:
# Names of the model's input features, in the same order as the columns that were
# scaled and fed to the classifier. `feature_importances` is indexed by position
# in this list, not by position in `df.columns` (which also holds 'Username',
# 'Unique Special Characters', 'Cluster', ...).
feature_names = np.array(
    ['Length', 'Special Characters', 'Numbers', 'Number of Words', 'Number of Special Characters']
    + list(all_unique_chars)
)
assert len(feature_names) == len(feature_importances), "feature list is out of sync with the classifier"

# Feature names ordered from most to least important
ranked_features = feature_names[np.argsort(-feature_importances)].tolist()

# Create a dictionary to store the most important features for each cluster.
# The tree yields a single, global ranking, so every cluster label maps to the same list.
cluster_most_important_features = {
    cluster_label: ranked_features for cluster_label in np.unique(clusters)
}

# Create a new column in the DataFrame with the most important features for each cluster
df['Most Important Features'] = df['Cluster'].map(cluster_most_important_features)


In [None]:
# Cluster-wise feature importances.
# The multi-class tree exposes a single importance vector with one entry per
# *feature*, so it cannot be sliced by DataFrame row indices to obtain per-cluster
# values. Instead, fit a one-vs-rest tree for each cluster (members vs. everyone
# else) and keep that tree's importances: they show which features set this
# particular cluster apart.
cluster_avg_feature_importances = {}

# Iterate through each cluster
for cluster_label in np.unique(clusters):
    # Boolean target: True for rows that belong to the current cluster
    in_cluster = clusters == cluster_label

    # Fixed seed so the per-cluster importances are reproducible
    one_vs_rest_tree = DecisionTreeClassifier(random_state=42)
    one_vs_rest_tree.fit(scaled_features, in_cluster)

    # One importance value per column of `scaled_features`, in that column order
    cluster_avg_feature_importances[cluster_label] = one_vs_rest_tree.feature_importances_

In [None]:
# Visualize feature importances
# Tick labels must be the model's input features (same order as the columns that
# were scaled and fed to the classifier), not `df.columns`, which also contains
# non-feature columns and therefore has a different length and order.
feature_names = (
    ['Length', 'Special Characters', 'Numbers', 'Number of Words', 'Number of Special Characters']
    + list(all_unique_chars)
)
assert len(feature_names) == len(feature_importances), "feature list is out of sync with the classifier"

plt.figure(figsize=(10, 6))
plt.bar(range(len(feature_importances)), feature_importances)
plt.xticks(range(len(feature_names)), feature_names, rotation='vertical')
plt.xlabel('Features')
plt.ylabel('Feature Importance')
plt.title('Feature Importance Analysis')
plt.tight_layout()  # Ensure proper spacing of labels
plt.show()

In [None]:

# Display the clustered usernames (plain-text dump of the whole frame)
print(df)

# Persist the full frame, including every column added above, without the index column
df.to_csv('clustered_usernames.csv', index=False)


In [None]:

# List the usernames that fell into each cluster, with the cluster size in brackets.
# Labels run from 1 to num_clusters; a label with no members prints an empty list.
for cluster_id in range(1, num_clusters + 1):
    usernames_in_cluster = df.loc[df['Cluster'] == cluster_id, 'Username'].tolist()
    print(f"\nCluster {cluster_id} - Usernames: [{len(usernames_in_cluster)}]")
    print(usernames_in_cluster)
    # Additional analysis can be performed on each cluster, such as examining naming conventions.
