In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [70]:
def load_dataset() -> tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame]):
    train = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\train.csv")
    greeks = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\greeks.csv")
    test = pd.read_csv("C:\\Projects\\kaggle\\competitions\\identify-age-related-conditions\\data\\test.csv")
    train.head()
    return (train, greeks, test)

In [71]:
def split_data(df: pd.DataFrame, split)->tuple([pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]):
    X = df.loc[:, df.columns != "Class"]
    y = df.loc[:, "Class"]
    return train_test_split(X, y, test_size=split, random_state=42)

In [72]:
def build_pipeline(df: pd.DataFrame) -> ColumnTransformer:
    # Preprocessing for numerical data    
    numerical_transformer = SimpleImputer(strategy='constant')
    
    # Preprocessing for categorical data
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Bundle preprocessing for numerical and categorical data
    numerical_cols = [cname for cname in df.columns if df[cname].dtype in ["int64", "float64"]]
    categorical_cols = [cname for cname in df.columns if df[cname].nunique() < 10]
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    return preprocessor

In [73]:
# Define model
model = RandomForestClassifier(n_estimators=100, random_state=22)

#Splitting
train, greeks, test = load_dataset()
print(f"dataset shape: {train.shape}")
X_train, X_valid, y_train, y_valid = split_data(train, 0.3)
print(f"X shape: {X_train.shape} and y shape: {y_train.shape}")

# Building Pipeline
preprocessor = build_pipeline(X_train)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                        ('model', model)
                        ])

dataset shape: (617, 58)
X shape: (431, 57) and y shape: (431,)


In [74]:
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_valid)
score = accuracy_score(y_valid, preds)
print(f"{score} out of {len(y_valid)} are right")

ValueError: A given column is not a column of the dataframe