# 1 - DEFINE THE PROBLEM
We are exploring the contents of the `imagenet_class_names.txt` file, which includes the class labels used in ImageNet-based deep learning models. This notebook will help us understand the label structure and prepare it for use in machine learning pipelines.

In [None]:
# 2 - IMPORT REQUIRED LIBRARIES
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [None]:
# 3 - LOAD THE DATA
# Read one class label per line; surrounding whitespace is trimmed and blank
# lines are skipped.
# The encoding is pinned so decoding does not depend on the platform's locale
# default. NOTE(review): assumes the label file is UTF-8 (plain ASCII is a
# subset) - confirm against the dataset.
with open('../datasets/imagenet_class_names.txt', 'r', encoding='utf-8') as file:
    # Iterate the file lazily and strip each line exactly once.
    stripped_lines = (line.strip() for line in file)
    class_names = [name for name in stripped_lines if name]

print(f'Total classes: {len(class_names)}')
class_names[:10]  # Preview first 10

In [None]:
# 4 - EDA (Exploratory Data Analysis)
# Put the labels in a one-column frame, then derive two simple features:
# the character count of each label and its first character.
df = pd.DataFrame(data=class_names, columns=['class_name'])
name_column = df['class_name']
df['length'] = name_column.map(len)
df['first_letter'] = name_column.str.get(0)

# Summary statistics for every column (numeric and non-numeric alike)
df.describe(include='all')

In [None]:
# 5 - VISUALIZE THE DATA
# Figure 1: distribution of class-name lengths as a 20-bin histogram
length_fig, length_ax = plt.subplots()
sns.histplot(df['length'], bins=20, ax=length_ax)
length_ax.set_title('Class Name Length Distribution')
length_ax.set_xlabel('Length')
length_ax.set_ylabel('Count')
plt.show()

# Figure 2: number of class names per first character, bars in sorted order
letter_order = sorted(df['first_letter'].unique())
letter_fig, letter_ax = plt.subplots(figsize=(10, 5))
sns.countplot(x='first_letter', data=df, order=letter_order, ax=letter_ax)
letter_ax.set_title('Distribution of First Letters in Class Names')
plt.show()

In [None]:
# 6 - PREPROCESS THE DATA
# Basic standardization, written to a new column so `class_name` stays intact
# and the cell can be re-run safely.
df['class_name_clean'] = (
    df['class_name']
    .str.lower()
    # Hyphens and underscores become spaces so compound names remain separate
    # words. `regex` is passed explicitly because its default differs between
    # pandas major versions.
    .str.replace(r'[-_]', ' ', regex=True)
    # Drop every character that is not a lowercase ASCII letter or a space
    # (digits, punctuation, apostrophes, non-ASCII letters, ...).
    .str.replace(r'[^a-z ]+', '', regex=True)
    # Removing characters can leave runs of spaces ("a - b" -> "a   b") or
    # padding at either end; normalize to single spaces with no padding.
    .str.replace(r' {2,}', ' ', regex=True)
    .str.strip()
)
df.head()

In [None]:
# 7 - SPLIT THE DATA (Optional example)
# Illustrative partition of the cleaned class names into two groups for manual
# use: 80% "train" and 20% holdout, with a fixed seed so the split is repeatable.
# The import sits in this cell because only this optional step uses it.
from sklearn.model_selection import train_test_split

train_classes, test_classes = train_test_split(
    df['class_name_clean'],
    test_size=0.2,
    random_state=42,
)
print(
    f'Train classes: {len(train_classes)}',
    f'Test classes: {len(test_classes)}',
    sep='\n',
)
train_classes.head(5)