In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

**Train Set Analysis**

In [None]:
# Training features and training labels are read from two separate CSV files.
# NOTE(review): both paths are the bare string '.csv', which looks like a
# redacted placeholder — confirm the real file locations.
features_path, labels_path = '.csv', '.csv'
X_train = pd.read_csv(features_path)
y_train = pd.read_csv(labels_path)

# `assign` returns a new frame, so X_train itself is left without the label column.
df_train = X_train.assign(category=y_train)

Show how categories are distributed over the train set

In [None]:
# Number of training rows per category (value_counts orders by descending count).
label_counts = df_train['category'].value_counts()

# One color per bar. The original cell left `colors` without a value; the
# 'magma' palette is used because the per-group count plots in this notebook
# already use it. Swap in any list of matplotlib colors if preferred.
colors = list(sns.color_palette('magma', n_colors=len(label_counts)))

# Explicit figure/axes instead of the implicit pyplot state machine.
fig, ax = plt.subplots(figsize=(6, 4))
label_counts.plot(kind='bar', color=colors, ax=ax)
ax.set_title('Distribution of y_train')
ax.set_xlabel('Category')
ax.set_ylabel('Count')
plt.show()

Show how categories are distributed over the groups in the training set

In [None]:
# Count of each category inside each group; the result is a Series indexed by
# (group, category). The per-group subplot cell below iterates over it.
group_category_counts = df_train.groupby('group')['category'].value_counts()

# Color every bar by its category so the same category has the same color in
# every group. The original cell left `colors` without a value.
bar_categories = group_category_counts.index.get_level_values(-1)
distinct_categories = bar_categories.unique()
category_palette = dict(
    zip(distinct_categories, sns.color_palette('magma', n_colors=len(distinct_categories)))
)
colors = [category_palette[category] for category in bar_categories]

ax = group_category_counts.plot(kind='barh', figsize=(10, 4), color=colors)
ax.set_title('Counts Categories for each Group')
# Horizontal bars: the x axis carries the counts and the y axis the
# (group, category) pairs. The original labels were the other way round.
ax.set_xlabel('Count')
ax.set_ylabel('(Group, Category)')
plt.tight_layout()
plt.show()

In [None]:
# Grid layout: three panels per row, with as many rows as needed.
num_groups = df_train['group'].nunique()
num_cols = 3
# Ceiling division; at least one row so plt.subplots never receives 0 rows.
num_rows = max(1, -(-num_groups // num_cols))

# squeeze=False always yields a 2-D array, so flatten() works for any grid size.
fig, axes = plt.subplots(num_rows, num_cols, figsize=(10, num_rows * 4), squeeze=False)
axes = axes.flatten()

# Shared category -> color map so a category looks the same in every panel.
all_categories = group_category_counts.index.get_level_values(-1).unique()
category_palette = dict(
    zip(all_categories, sns.color_palette('magma', n_colors=len(all_categories)))
)

# One (group, counts) pair per group that actually has counted categories.
per_group_counts = list(group_category_counts.groupby(level=0))

for ax, (group, counts) in zip(axes, per_group_counts):
    # `counts` keeps the (group, category) MultiIndex; only the category level
    # is a usable bar label. Cast to str so matplotlib treats it as categorical.
    category_labels = counts.index.get_level_values(-1)
    bar_colors = [category_palette[category] for category in category_labels]

    ax.barh(category_labels.astype(str), counts.to_numpy(), color=bar_colors)

    ax.set_title(f'Group {group}')
    ax.set_xlabel('Count')
    ax.set_ylabel('Category')

# Hide every panel that did not receive a plot.
for unused_ax in axes[len(per_group_counts):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

For each group, show the categories' distribution 

In [None]:
# num_rows is not passed to FacetGrid (the row count follows from col_wrap);
# the assignment is kept so the notebook namespace is unchanged.
num_rows = 3
num_cols = 3

# A fixed category order keeps bar positions identical across facets; without
# it each facet orders its bars from its own subset of the data.
category_order = df_train['category'].value_counts().index.tolist()

grid = sns.FacetGrid(df_train, col='group', col_wrap=num_cols, height=4)
grid.map(sns.countplot, 'category', order=category_order, palette='magma')
# '{col_name}' is filled in per facet by seaborn. The previous f-string baked
# in whatever value the variable `group` held from an earlier cell, so every
# facet got the same title.
grid.set_titles('Group {col_name}')
# set_axis_labels takes (x label, y label): categories are on x, counts on y.
grid.set_axis_labels('Category', 'Count')
plt.tight_layout()
plt.show()

**Test Set Analysis**

In [None]:
# Inputs for the test-set analysis. The first four assignments have no
# right-hand side in this notebook, so the cell cannot run as written.
# TODO(review): supply the missing values before executing the cells below.

# Printed by the "best combination of parameters" cell further down.
best_parameters = 
# Test features; copied into df_test below.
# NOTE(review): presumably has the same columns as X_train, including
# 'group' — confirm.
X_test =
# Stored as df_test['y_test']; later plotted under the label 'True'.
y_test = 
# Stored as df_test['y_pred']; later plotted under the label 'Predicted'.
y_pred = 
# Copy X_test so the two label columns are added without modifying it.
df_test = X_test.copy()
df_test['y_pred'] = y_pred
df_test['y_test'] = y_test

Confusion Matrix

In [None]:
# The original cell left `num_classes` and the matrix values blank. Both are
# derived from df_test here so they cannot drift apart or go stale.

# Every label that occurs as a true or a predicted value, in sorted order.
class_labels = sorted(set(df_test['y_test'].dropna()) | set(df_test['y_pred'].dropna()))
num_classes = len(class_labels)

# Rows are true labels and columns are predicted labels. reindex makes the
# matrix square even when a class is missing from one of the two columns.
confusion_matrix_values = (
    pd.crosstab(df_test['y_test'], df_test['y_pred'])
    .reindex(index=class_labels, columns=class_labels, fill_value=0)
    .to_numpy()
)

total_true = np.sum(confusion_matrix_values, axis=1)
total_pred = np.sum(confusion_matrix_values, axis=0)

# Row-normalize (fraction of each true class). A class with no true samples
# keeps a row of zeros instead of producing NaN from a division by zero.
row_totals = total_true[:, np.newaxis]
confusion_matrix_normalized = np.divide(
    confusion_matrix_values,
    row_totals,
    out=np.zeros(confusion_matrix_values.shape, dtype=float),
    where=row_totals != 0,
)

fig, ax = plt.subplots(figsize=(8, 6))
# Pin the color scale to [0, 1] so the 0.5 text-color threshold below always
# corresponds to the middle of the colormap.
image = ax.imshow(confusion_matrix_normalized, cmap='Blues', interpolation='nearest',
                  vmin=0, vmax=1)

# Annotate each cell with the raw count and the row percentage.
for i, j in np.ndindex(num_classes, num_classes):
    share = confusion_matrix_normalized[i, j]
    ax.text(j, i, f'{confusion_matrix_values[i, j]}\n({share*100:.2f}%)',
            ha='center', va='center', color='white' if share > 0.5 else 'black')

tick_labels = [f'Classe {label}' for label in class_labels]
ax.set_xticks(range(num_classes))
ax.set_xticklabels(tick_labels)
ax.set_yticks(range(num_classes))
ax.set_yticklabels(tick_labels)

ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
fig.colorbar(image, ax=ax)
plt.show()

Print best combination of parameters

In [None]:
# Report the parameter combination stored in `best_parameters`.
summary_prefix = 'Best Combination of Parameter is:\t'
print(summary_prefix, best_parameters)

For each group, show the distribution of true and predicted categories

In [None]:
# The true and predicted labels live in df_test (columns 'y_test' and
# 'y_pred'), not in df_train. Reshape to long form so both can be drawn side
# by side with a single `hue`, which also gives a correct legend.
# NOTE(review): assumes df_test has a 'group' column like df_train — confirm.
test_long = (
    df_test
    .rename(columns={'y_test': 'y_true'})
    .melt(id_vars='group', value_vars=['y_true', 'y_pred'],
          var_name='source', value_name='category')
)

# catplot builds the FacetGrid itself and uses one category order for every
# facet, so the true and predicted bars always line up.
grid = sns.catplot(
    data=test_long,
    x='category',
    hue='source',
    col='group',
    col_wrap=num_cols,
    kind='count',
    height=4,
    palette='viridis',
)

# '{col_name}' is substituted per facet by seaborn. An f-string here would
# freeze a single stale value of `group` into every title.
grid.set_titles('Group {col_name}')
# (x label, y label): categories on x, counts on y.
grid.set_axis_labels('Category', 'Count')

# The grid's own tight_layout leaves room for the legend placed outside the axes.
grid.tight_layout()
plt.show()

Show True vs Predicted

In [None]:
# Categories on the x axis, in the order used for the training-set plot.
categories = label_counts.index.tolist()

# value_counts() orders by frequency and omits labels that never occur, so
# both series are realigned to `categories`. This ensures every bar sits over
# its own tick and both series have one value per category (0 when absent).
y_pred_counts = df_test['y_pred'].value_counts().reindex(categories, fill_value=0)
y_test_counts = df_test['y_test'].value_counts().reindex(categories, fill_value=0)

# The original cell left these as '' / [], which matplotlib rejects or draws
# as transparent bars. Replace with any matplotlib colors as desired.
colors_true = 'tab:orange'
colors_predicted = 'tab:blue'
bar_width = 0.35
positions = np.arange(len(categories))

fig, ax = plt.subplots(figsize=(5, 4))
# Predicted bars sit left of the tick, true bars right of it.
ax.bar(positions - bar_width / 2, y_pred_counts.to_numpy(), width=bar_width,
       color=colors_predicted, label='Predicted')
ax.bar(positions + bar_width / 2, y_test_counts.to_numpy(), width=bar_width,
       color=colors_true, label='True')

ax.set_xlabel('Category')
ax.set_ylabel('Count')
ax.set_title('Distribution of Categories over the Test set')
ax.set_xticks(positions)
ax.set_xticklabels(categories)
ax.legend()
plt.tight_layout()
plt.show()