In [2]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher, DictVectorizer
from sklearn.preprocessing import StandardScaler

def load_powerlifting_dataset():
    """Read the OpenPowerlifting CSV from its fixed relative location.

    Returns:
        The DataFrame that ``pd.read_csv`` produces for
        ``data/openpowerlifting.csv`` (resolved against the current
        working directory).
    """
    return pd.read_csv("data/openpowerlifting.csv")

def preprocess_data(df):
    """Keep only the modelling columns and discard incomplete rows.

    Args:
        df: DataFrame containing at least the columns listed below.

    Returns:
        A new DataFrame restricted to the selected columns (in the order
        listed), with every row that has a missing value in any of them
        removed. The original index labels of surviving rows are kept.
    """
    selected = [
        'Sex', 'Equipment', 'Event', 'Division',
        'Age', 'BodyweightKg',
        'Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'TotalKg',
    ]
    return df.loc[:, selected].dropna()

# Exercício 8a: Aplicar Feature Hashing
def apply_feature_hashing(df, n_features=16):
    """Hash the categorical columns and standardise the remaining ones.

    Each row's categorical values are turned into ``"column=value"``
    string tokens and passed to a ``FeatureHasher`` configured with
    ``input_type='string'``, producing ``n_features`` columns named
    ``hash_0`` .. ``hash_{n_features-1}``. All other columns are scaled
    with ``StandardScaler``.

    Args:
        df: DataFrame containing 'Sex', 'Equipment', 'Event' and
            'Division'; every other column is handed to the scaler.
        n_features: Width of the hashed representation.

    Returns:
        A DataFrame with the scaled non-categorical columns first,
        followed by the hashed columns, on a fresh 0..n-1 index.
    """
    categorical_cols = ['Sex', 'Equipment', 'Event', 'Division']

    # One list of "column=value" tokens per row; prefixing with the column
    # name keeps equal values from different columns as distinct tokens.
    row_tokens = [
        [f"{col}={val}" for col, val in zip(categorical_cols, row)]
        for row in df[categorical_cols].astype(str).values
    ]
    hashed_matrix = FeatureHasher(
        n_features=n_features, input_type='string'
    ).transform(row_tokens)
    hashed_df = pd.DataFrame(
        hashed_matrix.toarray(),
        columns=[f"hash_{i}" for i in range(n_features)],
    )

    # Reset the index so the scaled frame lines up row-for-row with the
    # freshly built hashed frame when they are concatenated.
    numeric_part = df.drop(columns=categorical_cols).reset_index(drop=True)
    numeric_scaled = pd.DataFrame(
        StandardScaler().fit_transform(numeric_part),
        columns=numeric_part.columns,
    )

    return pd.concat([numeric_scaled, hashed_df], axis=1)

# Exercício 8b: Aplicar Bin Counting
def apply_bin_counting(df):
    """Vectorise the categorical columns and standardise the remaining ones.

    The categorical columns are cast to strings, converted to one dict per
    row and fed to a dense ``DictVectorizer``; the resulting columns are
    named by ``get_feature_names_out()``. All other columns are scaled
    with ``StandardScaler``.

    NOTE(review): with string values, DictVectorizer emits one indicator
    column per ``column=value`` pair (one-hot style) rather than count or
    target statistics per category -- confirm this is the encoding the
    exercise intends by "bin counting".

    Args:
        df: DataFrame containing 'Sex', 'Equipment', 'Event' and
            'Division'; every other column is handed to the scaler.

    Returns:
        A DataFrame with the scaled non-categorical columns first,
        followed by the vectorised categorical columns, on a fresh
        0..n-1 index.
    """
    cat_cols = ['Sex', 'Equipment', 'Event', 'Division']

    # Vectorise the categorical part from per-row {column: value} dicts.
    records = df[cat_cols].astype(str).to_dict(orient='records')
    vectorizer = DictVectorizer(sparse=False)
    encoded = pd.DataFrame(
        vectorizer.fit_transform(records),
        columns=vectorizer.get_feature_names_out(),
    )

    # Standardise everything else; the index is reset so both frames share
    # the same 0..n-1 labels when concatenated.
    numeric_part = df.drop(columns=cat_cols).reset_index(drop=True)
    scaled = pd.DataFrame(
        StandardScaler().fit_transform(numeric_part),
        columns=numeric_part.columns,
    )

    return pd.concat([scaled, encoded], axis=1)

if __name__ == "__main__":
    # Load the CSV and reduce it to complete rows of the modelling columns.
    df_raw = load_powerlifting_dataset()
    df_clean = preprocess_data(df_raw)

    # Build both encodings up front so nothing is printed if either fails.
    df_hashed = apply_feature_hashing(df_clean, n_features=16)  # Exercise 8a
    df_bin = apply_bin_counting(df_clean)  # Exercise 8b

    # Header and preview are emitted on separate lines, as before.
    print("\nDataset com Feature Hashing aplicado:", df_hashed.head(), sep="\n")
    print("\nDataset com Bin Counting aplicado:", df_bin.head(), sep="\n")


  df = pd.read_csv(url)



Dataset com Feature Hashing aplicado:
        Age  BodyweightKg  Best3SquatKg  Best3BenchKg  Best3DeadliftKg  \
0 -1.275408     -1.089148     -1.380623     -1.508836        -1.577157   
1 -0.664742     -0.937581     -1.873129     -1.403521        -1.577157   
2 -1.438252     -0.730899     -1.456393     -1.350864        -1.577157   
3 -1.275408     -1.034033     -1.001772     -0.666318        -0.593487   
4 -1.234697     -0.427764     -1.683704     -1.508836        -1.413212   

    TotalKg  hash_0  hash_1  hash_2  hash_3  ...  hash_6  hash_7  hash_8  \
0 -1.535482     0.0    -1.0    -1.0     0.0  ...     0.0     0.0     0.0   
1 -1.698554     0.0    -1.0     0.0     0.0  ...     0.0     0.0    -1.0   
2 -1.520658    -1.0    -1.0    -1.0     0.0  ...     0.0     0.0     0.0   
3 -0.794249     0.0    -1.0    -1.0     0.0  ...     0.0     0.0     0.0   
4 -1.594781     0.0    -1.0     0.0     0.0  ...     0.0     0.0     0.0   

   hash_9  hash_10  hash_11  hash_12  hash_13  hash_14  has