# Data Exploration for sample_train.csv
This notebook demonstrates basic data exploration and preprocessing steps for the sample training data.

In [None]:
import pandas as pd
# Read the sample training set into `df`, the frame the later cells operate on
df = pd.read_csv(filepath_or_buffer='files/sample_train.csv')

## Basic Info
Show the shape, columns, and missing values in the dataset.

In [None]:
# Report the frame's dimensions and column labels, then the number of
# missing cells in each column.
print('Shape:', df.shape)
print('Columns:', df.columns.to_list())
print('Missing values per column:', df.isna().sum(), sep='\n')

## Cardinality of Categorical Features
Check the number of unique values for each categorical feature, i.e. every column whose name starts with `ID_` (ID_01 to ID_22).

In [None]:
# Every column whose label begins with 'ID_' is treated as a categorical feature.
cat_features = list(filter(lambda name: name.startswith('ID_'), df.columns))

# One line per feature with its distinct-value count (nunique skips NaN by default).
for col, n_distinct in df[cat_features].nunique().items():
    print(f'{col}: {n_distinct} unique values')

## Data Preview
Show the first few rows, then the five most frequent values of the first three categorical features.

In [None]:
# First five rows, followed by the five most frequent values of each of the
# first three categorical features.
print(df.head(5))
for name in cat_features[:3]:
    top_values = df[name].value_counts().head(5)
    print(f'Value counts for {name}:', top_values, sep='\n')

In [None]:
# Split the categorical features by cardinality: at most `threshold` distinct
# values -> one-hot encoding, anything above -> hashing/target encoding.
threshold = 20  # You can adjust this threshold
cardinality = {col: df[col].nunique() for col in cat_features}
one_hot_cols = [col for col, n_unique in cardinality.items() if n_unique <= threshold]
hashing_cols = [col for col, n_unique in cardinality.items() if n_unique > threshold]
print('Columns suitable for one-hot encoding:', one_hot_cols)
print('Columns suitable for hashing/target encoding:', hashing_cols)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction import FeatureHasher

# One-hot encoding for low-cardinality columns
if one_hot_cols:
    # Keep the encoder's default sparse output and densify it here. The
    # dense-output keyword was renamed from `sparse` to `sparse_output` in
    # scikit-learn 1.2 and the old name was later removed, so passing neither
    # keeps this cell working on both older and newer releases.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe_encoded = ohe.fit_transform(df[one_hot_cols]).toarray()
    ohe_feature_names = ohe.get_feature_names_out(one_hot_cols)
    df_ohe = pd.DataFrame(ohe_encoded, columns=ohe_feature_names, index=df.index)
else:
    # No low-cardinality columns: empty frame that still carries df's index
    df_ohe = pd.DataFrame(index=df.index)

# Feature hashing for high-cardinality columns
if hashing_cols:
    n_hash_features = 10  # number of hash buckets; can be tuned
    hasher = FeatureHasher(n_features=n_hash_features, input_type='string')
    # Hash "column=value" tokens instead of bare values. Without the column
    # prefix, the same value appearing in two different columns would hash to
    # the same bucket and the encoding would lose which column it came from.
    hash_tokens = pd.DataFrame(
        {col: f'{col}=' + df[col].astype(str) for col in hashing_cols},
        index=df.index,
    )
    hashed_features = hasher.transform(hash_tokens.to_numpy())
    # Use string labels for the hashed columns so the combined frame does not
    # mix integer and string column names (scikit-learn estimators reject
    # DataFrames whose column labels are of mixed types).
    hash_feature_names = [f'hash_{i}' for i in range(n_hash_features)]
    df_hash = pd.DataFrame(
        hashed_features.toarray(), columns=hash_feature_names, index=df.index
    )
else:
    # No high-cardinality columns: empty frame that still carries df's index
    df_hash = pd.DataFrame(index=df.index)

# Combine all features column-wise; both parts share df's index
X = pd.concat([df_ohe, df_hash], axis=1)
print('Encoded feature shape:', X.shape)
X.head()

In [None]:
# Persist the encoded feature matrix as CSV, leaving the index out of the file
X.to_csv(path_or_buf='files/encoded_sample.csv', index=False)
print('Encoded features saved to files/encoded_sample.csv')