In [58]:
import pandas as pd
import constants as cs
import numpy as np
import matplotlib.pyplot as plt
import zipfile
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, ParameterGrid
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.neural_network import MLPRegressor
import joblib
from sklearn.ensemble import VotingRegressor

In [59]:
def load_data(filename: str) -> pd.DataFrame:
    with zipfile.ZipFile(f"../{filename}.zip") as data_zip:
        with data_zip.open(f"{filename}.csv") as developer_data:
            return pd.read_csv(developer_data)


In [60]:
def custom_accuracy_score(y_true, y_pred):
        accuracy_count = 0
        y_true = np.array(y_true, dtype='float64')
        for true_val, pred_val in zip(y_true, y_pred):
            if abs(true_val - pred_val) <= 20000:
                accuracy_count += 1

        accuracy = accuracy_count / len(y_true)
        return accuracy

In [61]:
def modelPredictAndScore(X_valid, y_valid, model, modelName):
    custom_score_valid = custom_accuracy_score(y_valid, model.predict(X_valid))
#     Calculate Root Mean Square Error (RMSE) for training and validation data
    rmse_valid = np.sqrt(mean_squared_error(y_valid, model.predict(X_valid)))

    print(f"{modelName} accuracy is:{custom_score_valid}")
    print(f"{modelName} root mean square is:{rmse_valid}")


In [62]:
na_data = load_data(cs.NA_TRAIN_DATA)
X_valid = load_data(cs.X_VALID)
y_valid = load_data(cs.Y_VALID)
print(y_valid.head(5))

scaler_model = joblib.load(f'{cs.ML_MODELS_FOLDER}/scaler.pkl')
svr_model = joblib.load(f'{cs.ML_MODELS_FOLDER}/{cs.BEST_SVR_MODELS}.pkl')
rf_model = joblib.load(f'{cs.ML_MODELS_FOLDER}/{cs.BEST_RF_MODELS}.pkl')
gb_model = joblib.load(f'{cs.ML_MODELS_FOLDER}/{cs.BEST_GB_MODELS}.pkl')
xgboost_model = joblib.load(f'{cs.ML_MODELS_FOLDER}/{cs.BEST_XGBOOST_MODELS}.pkl')

modelPredictAndScore(X_valid, y_valid, rf_model, cs.RANDOM_FOREST_MODELS)
X_valid_scaled = scaler_model.transform(X_valid)
modelPredictAndScore(X_valid_scaled, y_valid, svr_model, cs.SVR_MODELS)
modelPredictAndScore(X_valid, y_valid, gb_model, cs.GB_MODELS)
modelPredictAndScore(X_valid, y_valid, xgboost_model, cs.XGBOOST_MODELS)

       CompTotal
0   93823.979947
1  172636.123103
2  112588.775936
3   73000.000000
4  117092.326974
Random_forest_models accuracy is:0.3864528795811518
Random_forest_models root mean square is:198685.65710284893
SVR_models accuracy is:0.41492146596858637
SVR_models root mean square is:198047.07757991535
Random_forest_models accuracy is:0.40346858638743455
Random_forest_models root mean square is:220798.07283953435
XGBoost_models accuracy is:0.393651832460733
XGBoost_models root mean square is:211061.05853955593


In [82]:
voting_regressor = VotingRegressor(
    estimators=[('rf', rf_model), ('gb', gb_model), ('xgboost', xgboost_model), ('svr', svr_model)]
)

In [83]:
X_train = load_data('X_train_data')
y_train = load_data('y_train_data')
y_train = y_train.values.ravel()
voting_regressor.fit(X_train, y_train)

VotingRegressor(estimators=[('rf',
                             RandomForestRegressor(max_depth=125,
                                                   min_samples_leaf=10,
                                                   n_estimators=150)),
                            ('gb',
                             GradientBoostingRegressor(learning_rate=0.05,
                                                       max_depth=5)),
                            ('xgboost',
                             XGBRegressor(base_score=None, booster=None,
                                          callbacks=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          early_stopping_rounds=None,
                                          enable_categor...
                                          interaction_constraints=None,
       

In [84]:
modelPredictAndScore(X_valid, y_valid, voting_regressor, "Voting Classifier")

Voting Classifier accuracy is:0.43913612565445026
Voting Classifier root mean square is:125021.71423185374
