In [4]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, OneHotEncoder, FunctionTransformer

In [5]:
# Directory holding the playground-series-s5e2 CSV files.
# Edit this single constant when the notebook runs on another machine.
DATA_DIR = Path(r"C:\Users\asadb\Downloads\playground-series-s5e2")

# Training rows.
data = pd.read_csv(DATA_DIR / "train.csv")
# `test` still holds the test-set location as a plain string, as before.
test = str(DATA_DIR / "test.csv")
test_data = pd.read_csv(test)

In [6]:
# Features: every column except the row id and the target.
independent = data.drop(columns=["id", "Price"])
# Target: the column the model is trained to predict.
dependent = data.loc[:, "Price"]

In [7]:
# Categorical features, grouped by the encoder they are routed to further down
# (OneHotEncoder for `onehot_cols`, OrdinalEncoder for `ordinal_cols`).
onehot_cols = ["Color", "Brand", "Material", "Style"]
ordinal_cols = ["Size", "Laptop Compartment", "Waterproof"]
# Not referenced by the cells shown here; the transformers use `num_cols`.
numeric_cols = ["Weight Capacity (kg)", "Compartments"]

# Column groups derived from the dtypes of the feature frame.
cat_cols = independent.select_dtypes(include=["object", "category"]).columns
num_cols = independent.select_dtypes(include=["number"]).columns

In [8]:
def to_dataframe(X, feature_names):
    """Return ``X`` as a DataFrame whose columns are ``feature_names``.

    Parameters
    ----------
    X : array-like
        Two-dimensional data, e.g. the array produced by a transformer.
    feature_names : sequence
        Column labels passed to the DataFrame constructor.
    """
    frame = pd.DataFrame(data=X, columns=feature_names)
    return frame
def strategy(l: str) -> LinearRegression:
    """Fit and evaluate a linear model using imputation strategy ``l``.

    The raw features are split into a training and a hold-out part first, and
    the preprocessing (imputation, encoding, scaling) is fitted on the
    training part only, so nothing learned from the hold-out rows leaks into
    the transformers. A grid search over ``fit_intercept`` then selects the
    ``LinearRegression`` settings, and the hold-out RMSE is printed.

    Parameters
    ----------
    l : str
        ``SimpleImputer`` strategy, applied to both the categorical and the
        numeric columns (so it must be valid for both kinds of data).

    Returns
    -------
    LinearRegression
        ``best_estimator_`` of the grid search. It was trained on the
        preprocessed training split and expects preprocessed input.
    """
    # Split before fitting any transformer so the hold-out rows stay unseen.
    x_train_raw, x_test_raw, y_train, y_test = train_test_split(
        independent, dependent, test_size=0.2, random_state=1
    )

    # fill_value is only used by SimpleImputer when the strategy is "constant".
    cat_imputer = SimpleImputer(strategy=l, fill_value="na")
    num_imputer = SimpleImputer(strategy=l, fill_value=-1)
    onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    scaler = MinMaxScaler()

    imputer_transformer = ColumnTransformer([
        ("cat_imputer", cat_imputer, cat_cols),
        ("num_imputer", num_imputer, num_cols),
    ])
    preprocessor_encode = ColumnTransformer([
        ("onehot", onehot, onehot_cols),
        ("ordinal", ordinal, ordinal_cols),
        ("scaler", scaler, num_cols),
    ])
    pipeline = Pipeline([
        ("imputer", imputer_transformer),
        # Restore column names so the next step can select columns by name.
        ("rename1", FunctionTransformer(lambda X: to_dataframe(X, list(cat_cols) + list(num_cols)))),
        ("preprocess", preprocessor_encode),
        # Reads the one-hot feature names from the fitted encoder, so this
        # step only works once "preprocess" has been fitted.
        ("rename2", FunctionTransformer(lambda X: to_dataframe(
            X,
            list(preprocessor_encode.named_transformers_["onehot"].get_feature_names_out(onehot_cols))
            + ordinal_cols
            + list(num_cols),
        ))),
    ])

    # Fit the preprocessing on the training split only; the hold-out split is
    # merely transformed with what was learned from the training rows.
    x_train = pipeline.fit_transform(x_train_raw)
    x_test = pipeline.transform(x_test_raw)

    # copy_X only controls whether X may be overwritten during fitting; it does
    # not change the fitted model, so it is not worth searching over.
    param_grid = {"fit_intercept": [True, False]}
    grid_search = GridSearchCV(LinearRegression(), param_grid, cv=7, scoring='r2', n_jobs=-1)
    grid_search.fit(x_train, y_train)

    best = grid_search.best_estimator_
    predictions = best.predict(x_test)
    print("Strategy: ", l)
    print("RMSE: ",root_mean_squared_error(y_test, predictions))
    return best
    

In [9]:
# One fitted model per imputation strategy, in the order listed.
models = [strategy(name) for name in ["constant", "most_frequent"]]

Strategy:  constant
RMSE:  39.067460396210166
Strategy:  most_frequent
RMSE:  39.06950511948285


In [10]:
# Show the hyper-parameters of the first model (the "constant" strategy run).
models[0].get_params(deep=True)

{'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False}

In [11]:
# Additional training rows stored next to train.csv.
data_p_ex = r"C:\Users\asadb\Downloads\playground-series-s5e2\training_extra.csv"
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two frames would simply be stacked and repeat.
data_complete = pd.concat([data, pd.read_csv(data_p_ex)], ignore_index=True)

In [12]:
# Rebind the feature/target names to the enlarged data set
# (train.csv plus the extra training rows).
independent = data_complete.drop(columns=["id", "Price"])
dependent = data_complete.loc[:, "Price"]

In [13]:
def rename_columns_imputer(X):
    """Wrap the imputer output in a DataFrame with named columns.

    The labels are the categorical column names followed by the numeric ones
    (``cat_cols`` then ``num_cols``, both read from module scope).
    """
    labels = [*cat_cols, *num_cols]
    return pd.DataFrame(data=X, columns=labels)

def rename_columns_preprocess(X):
    """Wrap the encoder/scaler output in a DataFrame with named columns.

    The labels are the one-hot feature names reported by the fitted "onehot"
    transformer of the module-level ``preprocessor_encode``, followed by
    ``ordinal_cols`` and ``num_cols``. ``preprocessor_encode`` must already be
    fitted when this function is called.
    """
    fitted_onehot = preprocessor_encode.named_transformers_["onehot"]
    onehot_labels = fitted_onehot.get_feature_names_out(onehot_cols)
    labels = [*onehot_labels, *ordinal_cols, *num_cols]
    return pd.DataFrame(data=X, columns=labels)

# Final preprocessing + model pipeline, fitted on `independent` / `dependent`.
# With the "constant" strategy, missing entries are replaced by fill_value.
cat_imputer = SimpleImputer(strategy="constant", fill_value="na")
num_imputer = SimpleImputer(strategy="constant", fill_value=-1)
onehot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
scaler = MinMaxScaler()

# Both column transformers emit pandas DataFrames with un-prefixed feature
# names (verbose_feature_names_out=False), so the following step can select
# its columns by name directly. This replaces the FunctionTransformer rename
# steps: those wrapped notebook-level functions that read module globals,
# which made the pickled pipeline loadable only in a session that defines the
# same functions and globals.
imputer_transformer = ColumnTransformer(
    [
        ("cat_imputer", cat_imputer, cat_cols),
        ("num_imputer", num_imputer, num_cols),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")
preprocessor_encode = ColumnTransformer(
    [
        ("onehot", onehot, onehot_cols),
        ("ordinal", ordinal, ordinal_cols),
        ("scaler", scaler, num_cols),
    ],
    verbose_feature_names_out=False,
).set_output(transform="pandas")

pipeline = Pipeline([
    ("imputer", imputer_transformer),
    ("preprocess", preprocessor_encode),
    ("model", LinearRegression(copy_X=True, fit_intercept=True, positive=False)),
])
pipeline.fit(independent, dependent)

In [14]:
# Persist the fitted pipeline, then read it back to check the round trip.
MODEL_FILE = "pipeline.pkl"
joblib.dump(pipeline, MODEL_FILE)
loaded_model = joblib.load(MODEL_FILE)

In [15]:
# Smoke-test the reloaded pipeline on the first test row. iloc[[0], 1:] keeps
# a one-row DataFrame with the test set's own column names and dtypes (first
# column skipped, as before) instead of rebuilding the row from an
# object-dtype value array and relabelling it by position.
t1 = test_data.iloc[[0], 1:]
loaded_model.predict(t1)

array([81.6871149])

In [16]:
# Predict a price for every row of the test set with the reloaded pipeline.
predictions = loaded_model.predict(X=test_data)

In [131]:
# Build the submission from the ids that come with the test set instead of
# reconstructing them from the row position plus a fixed offset of 300000.
# Assumes test.csv carries an `id` column like train.csv does.
output = pd.DataFrame({"id": test_data["id"].to_numpy(), "Price": predictions})
output.to_csv("submission.csv", index=False)