# Exploratory Data Analysis
This notebook provides exploratory data analysis for the datasets in the `data/` directory.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Never truncate the column list when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Read the three CSV inputs in a fixed order and bind each to its own name.
train_features, train_targets, test_features = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/training_features.csv',
        'data/training_targets.csv',
        'data/test_features.csv',
    )
)


In [None]:
# Shapes of the loaded frames, followed by the dtype mix of the training features
for caption, frame in (
    ('Train features shape:', train_features),
    ('Test features shape:', test_features),
    ('Train targets shape:', train_targets),
):
    print(caption, frame.shape)
# Last expression of the cell: number of training feature columns per dtype.
train_features.dtypes.value_counts()

In [None]:
# Missing value analysis
# Count NaNs per column and order the columns from most to least missing.
missing_train = train_features.isna().sum().sort_values(ascending=False)
missing_test = test_features.isna().sum().sort_values(ascending=False)
# Line breaks are written as \n escapes: a single-quoted string literal
# cannot span physical lines, so a raw newline inside it is a SyntaxError.
print('Missing values in training features:\n', missing_train.head())
print('\nMissing values in test features:\n', missing_test.head())

In [None]:
# Summary statistics from describe(), transposed so each described column is a row
train_features.describe().transpose()

In [None]:
# Histograms for up to the first ten numeric columns of the training features
numeric_frame = train_features.select_dtypes(include='number')
numeric_cols = numeric_frame.columns[:10]
train_features[numeric_cols].hist(figsize=(15, 10))
# Tighten subplot spacing before rendering the figure.
plt.tight_layout()
plt.show()

In [None]:
# Most frequent values for up to the first five object-dtype columns
cat_cols = train_features.select_dtypes(include='object').columns[:5]
for col in cat_cols:
    counts = train_features[col].value_counts()
    # head() keeps only the first five rows of the frequency table.
    display(counts.head())

In [None]:
# Bar chart of how often each radiant_win value occurs in the training targets
target_counts = train_targets['radiant_win'].value_counts()
target_counts.plot(kind='bar')
plt.title('Target Distribution')
plt.show()

In [None]:
# Initial feature correlations
# DataFrame.join aligns rows on the index.
# NOTE(review): assumes features and targets share the same row index — confirm.
combined = train_features.join(train_targets['radiant_win'])
# Correlation of every numeric column with the target, highest first.
corr = combined.corr(numeric_only=True)['radiant_win'].sort_values(ascending=False)
# The target's correlation with itself is trivial, so it is left out of the
# printed ranking instead of taking one of the ten slots. `corr` is unchanged.
print(corr.drop('radiant_win').head(10))