<a href="https://colab.research.google.com/github/ayushpgs/C-program/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error
import tensorflow as tf

# Load data
train_df = pd.read_csv('/train.csv')
test_df = pd.read_csv('/test.csv')

In [2]:
from sklearn.impute import SimpleImputer

#  Check missing values in both datasets
print("Missing values in train dataset:\n", train_df.isnull().sum()[train_df.isnull().sum() > 0])
print("\nMissing values in test dataset:\n", test_df.isnull().sum()[test_df.isnull().sum() > 0])



#  Drop columns with too many missing values (optional, for very sparse columns)
drop_columns = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']

# Only drop columns if they exist in the dataset
train_df.drop([col for col in drop_columns if col in train_df], axis=1, inplace=True)
test_df.drop([col for col in drop_columns if col in test_df], axis=1, inplace=True)

#  Impute missing values for numerical features with the median
numerical_features = ['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

imputer_num = SimpleImputer(strategy='median')
# Fit on the training data and transform both train and test data
train_df[numerical_features] = imputer_num.fit_transform(train_df[numerical_features])
test_df[numerical_features] = imputer_num.transform(test_df[numerical_features])

#  Impute missing values for categorical features with the most frequent category
categorical_features = ['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
                        'BsmtFinType1', 'BsmtFinType2', 'Electrical',
                        'FireplaceQu', 'GarageType', 'GarageFinish',
                        'GarageQual', 'GarageCond']

imputer_cat = SimpleImputer(strategy='most_frequent')

# Fit on the training data and transform both train and test data
train_df[categorical_features] = imputer_cat.fit_transform(train_df[categorical_features])
test_df[categorical_features] = imputer_cat.transform(test_df[categorical_features])
print("\nMissing values in train dataset after imputation:\n", train_df.isnull().sum()[train_df.isnull().sum() > 0])
print("\nMissing values in test dataset after imputation:\n", test_df.isnull().sum()[test_df.isnull().sum() > 0])

Missing values in train dataset:
 LotFrontage      259
Alley           1369
MasVnrType       872
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

Missing values in test dataset:
 MSZoning           4
LotFrontage      227
Alley           1352
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType       894
MasVnrArea        15
BsmtQual          44
BsmtCond          45
BsmtExposure      44
BsmtFinType1      42
BsmtFinSF1         1
BsmtFinType2      42
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu      730
GarageType        76
GarageYrBlt       78
GarageFinish    

In [3]:
remaining_numeric_features = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                              'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
remaining_categorical_features = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd',
                                  'KitchenQual', 'Functional', 'SaleType']

# Impute missing numeric features in the test dataset with median
test_df[remaining_numeric_features] = imputer_num.fit_transform(test_df[remaining_numeric_features])

# Impute missing categorical features in the test dataset with most frequent value
test_df[remaining_categorical_features] = imputer_cat.fit_transform(test_df[remaining_categorical_features])

# Check if any missing values remain in the test dataset
print("\nMissing values in test dataset after additional imputation:\n", test_df.isnull().sum()[test_df.isnull().sum() > 0])


Missing values in test dataset after additional imputation:
 Series([], dtype: int64)


In [4]:
categorical_columns = train_df.select_dtypes(include=['object']).columns

# Use Label Encoding for categorical features
for col in categorical_columns:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col])

In [5]:
target_column = 'SalePrice'
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

# Ensure the same columns are present in both train and test datasets
numeric_columns = X_train.select_dtypes(include=['int64', 'float64']).columns

# Initialize the scaler
scaler = StandardScaler()

# Scale only the numeric features in both training and test sets
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])


In [6]:
!pip install optuna xgboost lightgbm catboost



In [7]:
import xgboost as xgb
print(xgb.__version__)


2.1.1


In [8]:
import optuna
import warnings
warnings.filterwarnings("ignore")
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

# Assuming X_train and y_train are already defined and preprocessed

# Split the training data into training and validation sets
X_train_split, X_valid, y_train_split, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Define the objective function for XGBoost
def objective_xgboost(trial):
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'dart']),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0, log=True),
    }

    # Convert to DMatrix format
    dtrain = xgb.DMatrix(X_train_split, label=y_train_split)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)

    # Train the model with early stopping
    evals = [(dtrain, 'train'), (dvalid, 'valid')]

    model = xgb.train(params=param, dtrain=dtrain, num_boost_round=500,
                      evals=evals, early_stopping_rounds=50, verbose_eval=False)

    # Make predictions on the validation set
    preds = model.predict(dvalid)
    mse = mean_squared_error(y_valid, preds)

    return mse

# Create and run the Optuna study
study_xgboost = optuna.create_study(direction='minimize')
study_xgboost.optimize(objective_xgboost, n_trials=20, timeout=600)

# Get the best parameters and retrain the model
best_params_xgboost = study_xgboost.best_params

# Fit the model on the full training set
final_dtrain = xgb.DMatrix(X_train, label=y_train)
final_xgboost_model = xgb.train(params=best_params_xgboost, dtrain=final_dtrain, num_boost_round=500)

# Make predictions on the test_df
test_dmatrix = xgb.DMatrix(test_df)  # Ensure test_df is prepared similarly
xgboost_predictions = final_xgboost_model.predict(test_dmatrix)

# Optionally, save predictions to a CSV file
submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': xgboost_predictions})
submission_df.to_csv('submission.csv', index=False)
output_xgboost = pd.DataFrame({
    'Id': test_df.index.astype('int32'),  # Ensure 'Id' is integer
    'SalePrice': xgboost_predictions.flatten()
})
output_xgboost.to_csv('xgboost_predictions.csv', index=False)


[I 2024-09-30 19:49:33,785] A new study created in memory with name: no-name-220bedef-3dbb-4f43-9000-b3b80a5b74b1
[I 2024-09-30 19:50:31,327] Trial 0 finished with value: 628211794.6985673 and parameters: {'booster': 'dart', 'lambda': 5.689155965727605e-07, 'alpha': 0.002840113908542828, 'subsample': 0.5711429886171111, 'colsample_bytree': 0.9197077127669728, 'learning_rate': 0.08882196303790904, 'n_estimators': 281, 'max_depth': 6, 'min_child_weight': 2, 'gamma': 0.386659163782805}. Best is trial 0 with value: 628211794.6985673.
[I 2024-09-30 19:50:41,178] Trial 1 finished with value: 705399060.186711 and parameters: {'booster': 'dart', 'lambda': 0.0006763057355800345, 'alpha': 0.2532624823468575, 'subsample': 0.6216254387596174, 'colsample_bytree': 0.48844856590441904, 'learning_rate': 0.11135232252223107, 'n_estimators': 201, 'max_depth': 7, 'min_child_weight': 9, 'gamma': 1.0110837870523074e-07}. Best is trial 0 with value: 628211794.6985673.
[I 2024-09-30 19:50:41,954] Trial 2 fin