# This notebook contains a dump of unused and/or abandoned script snippets

In [1]:
import collections
import re 
# Count the tokens and trailing-token combinations found in model names.
#
# Each line of the input file that contains a '/' is split on '/', and the
# second piece is taken as the model name.
with open('./data/text-generation.txt', 'r') as file:
    model_names = [line.split('/')[1].strip() for line in file if '/' in line]

# Split every name once on "-" and "."; the result feeds both counters below.
split_names = [re.split(r'[-\.]', name) for name in model_names]

# Individual tokens, taken only from names that contain at least one separator
# (a split that yields more than one part means the name held a "-" or a ".").
tokenized_suffixes = [token for parts in split_names if len(parts) > 1 for token in parts]

# Counting the occurrence of individual tokens and combinations of tokens
token_counts = collections.Counter(tokenized_suffixes)
combination_counts = collections.Counter()

# For a name split into t0..tn, count every trailing run t_i..tn (i >= 1),
# shortest first. The parts can never contain "." because the split above
# already consumed it, so the runs are always re-joined with "-".
for parts in split_names:
    for start in range(len(parts) - 1, 0, -1):
        combination_counts['-'.join(parts[start:])] += 1

# Most common individual tokens and combinations
most_common_tokens = token_counts.most_common(10)
most_common_combinations = combination_counts.most_common(10)

most_common_tokens, most_common_combinations

([('7b', 5741),
  ('2', 5660),
  ('llama', 3060),
  ('1', 2628),
  ('7B', 2395),
  ('gpt2', 1887),
  ('13b', 1822),
  ('exl2', 1665),
  ('chat', 1586),
  ('13B', 1491)],
 [('exl2', 1388),
  ('GPTQ', 1329),
  ('AWQ', 962),
  ('h6-exl2', 911),
  ('0bpw-h6-exl2', 731),
  ('7b', 697),
  ('2', 654),
  ('1', 603),
  ('GGUF', 526),
  ('v2', 450)])

In [None]:
import re
import collections

# Find versioning patterns shared by models whose names start with the same part.

# Read the file: one model identifier per line
with open('./data/text-generation.txt', 'r') as file:
    model_names = [line.strip() for line in file]


def _base_name(identifier):
    """Return the text after the first '/' up to the first '-' or '.'."""
    return identifier.split('/')[1].split('-')[0].split('.')[0]


# Extract the name part from each model name if it exists
name_parts = [_base_name(name) for name in model_names if '/' in name]

# Find the name parts that are repeated in the file
repeated_name_parts = [name_part for name_part, count in collections.Counter(name_parts).items() if count > 1]
# Set copy for O(1) membership tests; the list above is kept for any later use.
_repeated_lookup = set(repeated_name_parts)

# Filter the model names to keep only the ones with repeated name parts
filtered_model_names = [name for name in model_names if '/' in name and _base_name(name) in _repeated_lookup]

# Tokenize the version part (the second "-"-separated segment of the model name)
# using "-" and "." as delimiters
tokenized_versions = [
    re.split(r'[-\.]', name.split('/')[1].split('-')[1])
    for name in filtered_model_names
    if '-' in name.split('/')[1]
]

# Extract all the tokens
tokens = [token for version in tokenized_versions for token in version]

# Combine 1 to 4 consecutive tokens using "-" and ".".
# Windows are built inside each version so that a combination never joins
# tokens belonging to two different models, and single tokens are added only
# once (joining one token with "-" or "." gives the same string).
combinations = []
for version_tokens in tokenized_versions:
    for size in range(1, 5):
        for start in range(len(version_tokens) - size + 1):
            window = version_tokens[start:start + size]
            combinations.append('-'.join(window))
            if size > 1:
                combinations.append('.'.join(window))

# Count the occurrence of individual tokens and combinations
token_counts = collections.Counter(tokens)
combination_counts = collections.Counter(combinations)

# Find the top tokens and combinations
top_tokens = token_counts.most_common(10)
top_combinations = combination_counts.most_common(10)

# Group the version segments by the repeated name parts
grouped_model_names = collections.defaultdict(list)
for name in filtered_model_names:
    model_part = name.split('/')[1]
    if '-' in model_part:
        grouped_model_names[_base_name(name)].append(model_part.split('-')[1])

# Find the repeated patterns of versioning: (name part, version, count) for
# every version segment seen more than once under the same name part
repeated_patterns = []
for name_part, versions in grouped_model_names.items():
    for version, count in collections.Counter(versions).items():
        if count > 1:
            repeated_patterns.append((name_part, version, count))

# Sort the repeated patterns by count in descending order
repeated_patterns.sort(key=lambda pattern: pattern[2], reverse=True)

# Display the top repeated patterns of versioning
top_repeated_patterns = repeated_patterns[:10]
top_repeated_patterns


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Draw one bubble per top token on a single row, sized by occurrence count.
# NOTE(review): top_tokens is not defined in this cell; it must come from a
# previously executed cell as a list of (token, count) pairs.
df = pd.DataFrame(top_tokens, columns=['Token', 'Occurrences'])

# Set the maximum radius for the scatter points
max_radius = 100

# Scale the radii so the most frequent token gets max_radius
df['Radius'] = max_radius * (df['Occurrences'] / df['Occurrences'].max())

# Lay the bubbles out on one horizontal line; Y is a dummy constant
df['X'] = range(len(df))
df['Y'] = 0

# `sizes` is a (min, max) range: it has to be ascending so that larger counts
# get larger bubbles. `x_jitter` is dropped because seaborn documents it as
# non-functional for scatterplot.
ax = sns.scatterplot(data=df, x='X', y='Y', size='Radius', sizes=(20, 200), alpha=0.5)

# Label each position with its token instead of the bare index
ax.set_xticks(list(df['X']))
ax.set_xticklabels(list(df['Token']), rotation=45, ha='right')

# The y axis carries no information (constant 0), so leave it unlabeled
ax.set_yticks([])
ax.set_ylabel('')

# Set the title and labels for the plot
ax.set_title('Bubble Map of Top Tokens')
ax.set_xlabel('Token')

# Display the plot
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Draw the token co-occurrence graph with edge weights inversely proportional
# to the group count.
# NOTE(review): top_20_common_groups_3_or_more_new_two is not defined in this
# cell; presumably a list of (token group, count) pairs from another cell — verify.
G = nx.Graph()

# Largest count, used to scale the weights inversely
max_count = max(count for _, count in top_20_common_groups_3_or_more_new_two)

# Connect every pair of tokens within a group; a pair that appears in several
# groups keeps the weight of the last group processed
for tokens, count in top_20_common_groups_3_or_more_new_two:
    tokens = list(tokens)
    inverse_weight = max_count / count  # Inverse proportion
    for index, first in enumerate(tokens):
        for second in tokens[index + 1:]:
            G.add_edge(first, second, weight=inverse_weight)

fig, ax = plt.subplots(figsize=(12, 12))

# Define the layout, using the edge weights. A fixed seed makes the figure
# reproducible across runs (spring_layout starts from random positions).
pos = nx.spring_layout(G, weight='weight', seed=42)

# Nodes
nx.draw_networkx_nodes(G, pos, node_size=300)

# Edges: all drawn in one call with a uniform width
nx.draw_networkx_edges(G, pos, width=1)

# Labels
nx.draw_networkx_labels(G, pos, font_size=6, font_family='sans-serif')

plt.axis('off')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import networkx as nx

# Token co-occurrence graph: edge width reflects the group count.
# NOTE(review): top_20_common_groups_3_or_more_new_two is not defined in this
# cell; presumably a list of (token group, count) pairs from another cell — verify.
data = top_20_common_groups_3_or_more_new_two

G = nx.Graph()

# Link every unordered pair of tokens inside a group, weighted by the group's
# count. A pair present in several groups ends up with the weight of the last
# group processed, because add_edge overwrites the attribute.
for tokens, count in data:
    tokens = list(tokens)
    for index, first in enumerate(tokens):
        for second in tokens[index + 1:]:
            G.add_edge(first, second, weight=count)

# Fruchterman-Reingold (spring) layout; a larger k spreads the nodes further apart
pos = nx.spring_layout(G, k=0.3, iterations=1000)

# Large square canvas
fig, ax = plt.subplots(figsize=(12, 12))

# Nodes
nx.draw_networkx_nodes(G, pos, node_size=400)

# Edges, with widths scaled down from the stored weights
edge_widths = [attributes['weight']/2500 for _, _, attributes in G.edges(data=True)]
nx.draw_networkx_edges(G, pos, width=edge_widths)

# Labels
nx.draw_networkx_labels(G, pos, font_size=6, font_family='sans-serif', verticalalignment='center')

# No axis frame around the graph
plt.axis('off')

# Render the figure
plt.show()

In [None]:
import os
import re
import collections
# For each task file, collect the version tokens used by model names that
# occur in several variants, export them, and print the most frequent tokens.

# Define the directory path
directory = './data'

# Define the list of file names
files = ['text-classification.txt', 'text-generation.txt', 'text2text-generation.txt', 'token-classification.txt', 'text-to-image.txt', 'fill-mask.txt']


def _version_tokens(model, multi_names):
    """Return the version tokens of `model` (everything after its base name).

    `multi_names` is the collection of base names that have several versions.
    If the first "-"-separated segment of `model` is one of them, the version
    is the remaining "-" segments; otherwise it is the remaining "." segments.
    Version markers such as "-v1.0" are removed, the rest is split on "-" and
    then on "." (except for chunks containing a decimal number like "1.5"),
    and purely numeric or single-character tokens are dropped.
    """
    if model.split('-')[0] in multi_names:
        version = "-".join(model.split('-')[1:])
    else:
        version = ".".join(model.split('.')[1:])
    version = re.sub(r'-?[vV]\d\.?\d?\.?\d?', '', version)

    tokens = []
    for chunk in re.split(r'-', version):
        # Keep decimal numbers such as "1.5b" in one piece
        if re.search(r'\d\.\d', chunk) is None:
            pieces = re.split(r'\.', chunk)
        else:
            pieces = [chunk]
        tokens.extend(piece for piece in pieces if piece != '')
    return [token for token in tokens if not token.isdigit() and len(token) > 1]


# Process each file
for file_name in files:
    # Construct the file path
    file_path = os.path.join(directory, file_name)

    # Read the file and keep the part after the first '/' of each line
    with open(file_path, 'r') as file:
        models = [line.split('/')[1].strip() for line in file if '/' in line]

    # Filter models with versions
    models = [model for model in models if '-' in model or '.' in model]

    # Extract model names
    models_names = [model.split('-')[0].split('.')[0] for model in models]

    # Find models names with multiple versions
    multi_versions_models_names = [name_part for name_part, count in collections.Counter(models_names).items() if count > 1]
    # Set copy for O(1) membership tests inside the loops below
    multi_names_lookup = set(multi_versions_models_names)

    # Filter models with multiple versions
    multi_versions_models = [model for model in models if model.split('-')[0].split('.')[0] in multi_names_lookup]

    # Initialize a dictionary to store the versions for each model
    multi_versions_models_dict = {}

    # Process each model with multiple versions
    for model in multi_versions_models:
        model_name = model.split('-')[0].split('.')[0]
        version = _version_tokens(model, multi_names_lookup)

        # Add the lower-cased tokens not seen yet for this model name.
        # setdefault covers the first occurrence of a name as well, so no
        # branch relies on a loop variable that may not be bound yet.
        known_tokens = multi_versions_models_dict.setdefault(model_name, [])
        for token in version:
            lowered = token.lower()
            if lowered not in known_tokens:
                known_tokens.append(lowered)

    # Export the dictionary to a file, one "model: token, token" line per model
    output_file_path = f'multi_versions_models_dict_{file_name.replace(".txt", "")}.txt'
    with open(output_file_path, 'w') as file:
        for model, versions in multi_versions_models_dict.items():
            file.write(model + ': ' + ', '.join(versions) + '\n')

    # Extract all tokens from the dictionary
    tokens = [token for model, versions in multi_versions_models_dict.items() for token in versions]

    # Count in how many model names each token appears
    token_counts = collections.Counter(tokens)

    # Find the top tokens
    top_tokens = token_counts.most_common(100)

    # Print the top tokens for each file
    print(f"Top tokens and combinations for file {file_name}:")
    for token, count in top_tokens:
        print(f"{token}: {count}")
    print()


In [None]:
import collections
import os

import matplotlib.pyplot as plt

# Render one word cloud per entry of global_dictionary on a 2x3 grid and save
# each cloud as a PNG.
# NOTE(review): global_dictionary and WordCloud are not defined in this cell;
# presumably {file name: {model: [tokens]}} and wordcloud.WordCloud provided by
# another cell — verify before running on a fresh kernel.

# Make sure the output directory exists before any image is written
os.makedirs('./images', exist_ok=True)

# Set up the matplotlib figure
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
axs = axs.flatten()

# Hide every panel's axes up front so unused panels stay blank
for panel in axs:
    panel.axis('off')

# Iterate through the global dictionary
for i, (file_name, file_dict) in enumerate(global_dictionary.items()):
    # Count the tokens of this file and keep the 100 most frequent
    tokens = [token for model, versions in file_dict.items() for token in versions]
    token_counts = collections.Counter(tokens)
    top_tokens = token_counts.most_common(100)

    # Mapping token -> occurrences, as expected by generate_from_frequencies
    word_frequencies = dict(top_tokens)

    axs[i].set_title(file_name)

    # A word cloud needs at least one word; leave the panel empty otherwise
    if not word_frequencies:
        continue

    # Generate the word cloud
    wordcloud = WordCloud(width=800, height=800, background_color='white').generate_from_frequencies(word_frequencies)

    # Display the word cloud in the corresponding subplot
    axs[i].imshow(wordcloud, interpolation='bilinear')

    # Save the chart to a file
    image_path = os.path.join('./images', f'{file_name}_wordcloud.png')
    wordcloud.to_file(image_path)

# Adjust the spacing between subplots
plt.tight_layout()

# Display the plot
plt.show()


In [3]:
from collections import defaultdict
import itertools

# Count, over every pair of models, the exact set of tokens the two models share.

# Load the "model: token, token, ..." lines
file_path_new = './multi_versions_models_dictionary.txt'

with open(file_path_new, 'r') as file:
    lines = file.readlines()

# Map each model to its set of tokens; lines that do not split into exactly
# two parts on ':' are ignored
model_tokens_new = {}
for line in lines:
    parts = line.strip().split(':')
    if len(parts) != 2:
        continue
    model, tokens = parts
    model_tokens_new[model.strip()] = set(tokens.strip().split(', '))

# For each unordered pair of models, record their non-empty token intersection
common_groups_count_new = defaultdict(int)
for (model1, tokens1), (model2, tokens2) in itertools.combinations(model_tokens_new.items(), 2):
    common_tokens = frozenset(tokens1 & tokens2)
    if len(common_tokens) > 0:
        common_groups_count_new[common_tokens] += 1

# Keep only the groups made of exactly 3 tokens
filtered_common_groups_3_or_more_new = {}
for group, count in common_groups_count_new.items():
    if len(group) == 3:
        filtered_common_groups_3_or_more_new[group] = count

# Order the groups from most to least frequent
sorted_filtered_common_groups_3_or_more_new = sorted(
    filtered_common_groups_3_or_more_new.items(),
    key=lambda entry: entry[1],
    reverse=True,
)

# Keep the first 40 groups (the variable name says 20, the slice takes 40)
top_20_common_groups_3_or_more_new = sorted_filtered_common_groups_3_or_more_new[:40]

# Print one line per retained group
for group, count in top_20_common_groups_3_or_more_new:
    print(f"Tokens: {group} - Occurred in {count} combinations")

Tokens: frozenset({'13b', 'gptq', 'awq'}) - Occurred in 6937 combinations
Tokens: frozenset({'7b', 'gptq', 'awq'}) - Occurred in 6788 combinations
Tokens: frozenset({'gguf', 'gptq', 'awq'}) - Occurred in 4740 combinations
Tokens: frozenset({'exl2', 'gptq', 'awq'}) - Occurred in 2719 combinations
Tokens: frozenset({'7b', '13b', 'gptq'}) - Occurred in 2093 combinations
Tokens: frozenset({'7b', '13b', 'llama'}) - Occurred in 941 combinations
Tokens: frozenset({'70b', 'gptq', 'awq'}) - Occurred in 780 combinations
Tokens: frozenset({'7b', '13b', 'chat'}) - Occurred in 729 combinations
Tokens: frozenset({'7b', '13b', 'llama2'}) - Occurred in 450 combinations
Tokens: frozenset({'7b', 'gguf', 'gptq'}) - Occurred in 443 combinations
Tokens: frozenset({'7b', 'gptq', '4bit'}) - Occurred in 406 combinations
Tokens: frozenset({'7b', '13b', 'awq'}) - Occurred in 343 combinations
Tokens: frozenset({'7b', 'chat', 'llama'}) - Occurred in 326 combinations
Tokens: frozenset({'7b', '13b', 'instruct'}) - 