In [None]:
from collections import Counter

import kagglehub
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from kagglehub import KaggleDatasetAdapter

# CSV to pull out of the Kaggle dataset "nih-chest-xrays/data".
file_path = "Data_Entry_2017.csv"

# Load that single file through kagglehub's pandas adapter; `df` is the frame
# the following cells read their 'Finding Labels' / 'Patient ID' columns from.
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, "nih-chest-xrays/data", file_path)

In [None]:
# Each 'Finding Labels' cell is a '|'-separated string; turn it into a list of labels.
all_labels = df['Finding Labels'].map(lambda raw: raw.split('|'))

# Flatten the per-image lists and tally how many images mention each label.
flat_labels = [label for labels_of_image in all_labels for label in labels_of_image]
label_counts = Counter(flat_labels)

# most_common() yields (label, count) pairs ordered by descending count,
# with ties kept in first-seen order.
sorted_labels = label_counts.most_common()
sorted_keys = [name for name, _ in sorted_labels]
sorted_values = [count for _, count in sorted_labels]

# Bar chart of label frequency, most frequent label first.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(sorted_keys, sorted_values)
ax.tick_params(axis='x', labelrotation=45)
ax.set_title('Disease Distribution')
ax.set_xlabel('Disease')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Per finding label: number of images carrying the label vs. number of distinct
# patients that have at least one image with it.
#
# The exploded frame is derived from `df` without modifying it. Overwriting
# df['Finding Labels'] with lists and restoring `df` at the end of the cell
# would leave the shared frame corrupted whenever the cell fails or is
# interrupted part-way, breaking every later cell that expects strings.
expanded_df = (
    df.assign(**{'Finding Labels': df['Finding Labels'].str.split('|')})
      .explode('Finding Labels')  # one row per (image, label) pair
)

# Rows per label == images per label, since each image contributes one row per label.
image_count = expanded_df['Finding Labels'].value_counts()

# nunique() already ignores repeated (patient, label) pairs, so no explicit
# de-duplication is needed beforehand.
patient_count = expanded_df.groupby('Finding Labels')['Patient ID'].nunique()

# Align both series on the label index; a label missing from either side becomes 0.
combined_df = pd.DataFrame({
    'Image Count': image_count,
    'Patient Count': patient_count
}).fillna(0).astype(int)

combined_df = combined_df.sort_values(by='Image Count', ascending=False)

# Grouped bar chart: two bars per label, offset by one bar width.
bar_width = 0.4
x = range(len(combined_df))

fig, ax = plt.subplots(figsize=(14, 8))
ax.bar(x, combined_df['Image Count'], width=bar_width, label='Image Count', alpha=0.8)
ax.bar([i + bar_width for i in x], combined_df['Patient Count'], width=bar_width, label='Patient Count', alpha=0.8)

# Put each tick midway between the two bars of its group.
ax.set_xticks([i + bar_width / 2 for i in x])
ax.set_xticklabels(combined_df.index, rotation=45)
ax.set_xlabel('Disease')
ax.set_ylabel('Count')
ax.set_title('Image Count vs Patient Count')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Number of '|'-separated labels on each image, stored as a new column on df.
df['num_labels'] = df['Finding Labels'].str.split('|').map(len)

# How many images have 1, 2, 3, ... labels, ordered by label count.
label_counts = df['num_labels'].value_counts().sort_index()

# Fold every bucket above three labels into a single '>3' slice so the pie
# chart is not cluttered with thin wedges.
above_three = label_counts.index > 3
if above_three.any():
    buckets = label_counts[~above_three].to_dict()
    buckets['>3'] = label_counts[above_three].sum()
    merged_counts = pd.Series(buckets)
else:
    merged_counts = label_counts.copy()

# Pie chart with one slice per bucket, coloured from the 'Paired' palette.
fig, ax = plt.subplots(figsize=(8, 5))
slice_colors = plt.cm.Paired.colors[:len(merged_counts)]
merged_counts.plot(kind='pie', ax=ax, autopct='%1.1f%%', startangle=90, colors=slice_colors)
ax.set_title("Distribution of the Number of Labels per Image")
ax.axis('equal')  # keep the pie circular
plt.show()

In [None]:
# Label co-occurrence: entry (a, b) counts the images tagged with both a and b;
# the diagonal holds the number of images per label. 'No Finding' is excluded.
label_col = 'Finding Labels'

# Multi-hot encode the '|'-separated label strings in one vectorised call.
# The result keeps df's own index, so it stays correct even if df has been
# filtered or re-indexed. Writing cells one at a time with .loc[i, label],
# where i is a positional enumerate() counter, would target the wrong rows
# (or silently append new ones) on a non-default index, and is far slower.
multi_hot_df = (
    df[label_col]
    .str.get_dummies(sep='|')
    .drop(columns='No Finding', errors='ignore')  # tolerate data without that label
    .sort_index(axis=1)                           # deterministic alphabetical column order
    .astype(int)                                  # integer counts, required by fmt='d' below
)
unique_labels = list(multi_hot_df.columns)

# (labels x images) @ (images x labels) -> (labels x labels) co-occurrence counts.
co_matrix = multi_hot_df.T.dot(multi_hot_df)

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(co_matrix, cmap='coolwarm', annot=True, fmt='d', ax=ax)
ax.set_title('Multi-Label Co-occurrence Matrix')
plt.show()