In [9]:
# Import Libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Path to the .txt file
file_path = 'D:/week3 data/MachineLearningRating_v3.txt'

# Load the data into a DataFrame with pipe delimiter.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded output of this cell shows a warning
# raised at this call, presumably the mixed-dtype DtypeWarning that chunked
# inference produces -- TODO confirm against the full warning text.
df = pd.read_csv(file_path, delimiter='|', low_memory=False)

# Feature Engineering: Create a feature representing the age of the vehicle.
# The reference year is a fixed constant (not "today") so re-runs are reproducible.
REFERENCE_YEAR = 2024
df['VehicleAge'] = REFERENCE_YEAR - df['RegistrationYear']

# Select relevant features
selected_features = ['cubiccapacity', 'kilowatts', 'VehicleAge']

# Prepare Training and Test Sets
from sklearn.model_selection import train_test_split

# Define target variable and features
X = df[selected_features]
y_premium = df['TotalPremium']

# Drop rows where the target variable has NaN values.
# One shared mask keeps X and y aligned row for row.
target_present = y_premium.notna()
X = X[target_present]
y_premium = y_premium[target_present]

# Split the data (70% train / 30% test, fixed seed)
X_train, X_test, y_train_premium, y_test_premium = train_test_split(X, y_premium, test_size=0.3, random_state=42)

# Sample a smaller subset to fit into memory.
# The sample size is capped at the number of available training rows, because
# DataFrame.sample raises when n exceeds the population (without replacement).
n_train_samples = min(10000, len(X_train))
X_train_sampled = X_train.sample(n=n_train_samples, random_state=42)
y_train_premium_sampled = y_train_premium.loc[X_train_sampled.index]

# Impute Missing Values for Selected Features.
# The imputer is fitted on the training sample only and then applied to the test set.
imputer = SimpleImputer(strategy='mean')
X_train_sampled_small_imputed = imputer.fit_transform(X_train_sampled)
X_test_small_imputed = imputer.transform(X_test)

# Polynomial Features (degree 2, squares and pairwise interactions, no bias column)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train_sampled_small_imputed)
X_test_poly = poly.transform(X_test_small_imputed)

# Log Transformation.
# log1p is undefined for values below -1 (NaN) and diverges at exactly -1 (-inf).
# Those cases are handled explicitly by the finite-value filter below, so the
# floating-point warnings are silenced here instead of leaking into the output.
with np.errstate(divide='ignore', invalid='ignore'):
    y_train_premium_log = np.log1p(y_train_premium_sampled)
    # Kept for parity with the training target; it may still contain NaN/-inf
    # and is not filtered here.
    y_test_premium_log = np.log1p(y_test_premium)

# Ensure the dimensions match: keep only training rows whose log target is finite.
# np.isfinite rejects both NaN and +/-inf; filtering on NaN alone would let a
# -inf target through to the model fit.
non_nan_indices = np.isfinite(y_train_premium_log)
# X_train_poly is a NumPy array, so index it with a plain boolean array (positional).
X_train_poly = X_train_poly[non_nan_indices.to_numpy()]
y_train_premium_log = y_train_premium_log[non_nan_indices]

# Enhanced Linear Regression Model
# Fit a linear model on the polynomial features against the log1p-transformed target.
model_premium_poly = LinearRegression().fit(X_train_poly, y_train_premium_log)

# Predict in log space, then undo the log1p transform with expm1 so the
# metrics below are computed against the untransformed TotalPremium values.
y_pred_premium_log = model_premium_poly.predict(X_test_poly)
y_pred_premium = np.expm1(y_pred_premium_log)

mse_premium_poly = mean_squared_error(y_true=y_test_premium, y_pred=y_pred_premium)
r2_premium_poly = r2_score(y_true=y_test_premium, y_pred=y_pred_premium)

print("Enhanced Linear Regression - Mean Squared Error for TotalPremium:", mse_premium_poly)
print("Enhanced Linear Regression - R-squared for TotalPremium:", r2_premium_poly)

# Random Forest with Hyperparameter Tuning
# Two candidate values per hyperparameter; the grid search tries every combination.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Seeded base estimator so repeated runs build the same forests.
rf_base_estimator = RandomForestRegressor(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf_base_estimator,
    param_grid=param_grid_rf,
    cv=3,
)
grid_search_rf.fit(X_train_poly, y_train_premium_log)

# Use the best configuration found by the search for the test-set predictions.
best_rf_model = grid_search_rf.best_estimator_
y_pred_rf_log = best_rf_model.predict(X_test_poly)

# Map the log-space predictions back with expm1 before scoring against
# the untransformed TotalPremium values.
y_pred_rf = np.expm1(y_pred_rf_log)
mse_rf_poly = mean_squared_error(y_true=y_test_premium, y_pred=y_pred_rf)
r2_rf_poly = r2_score(y_true=y_test_premium, y_pred=y_pred_rf)

print("Enhanced Random Forest - Mean Squared Error for TotalPremium:", mse_rf_poly)
print("Enhanced Random Forest - R-squared for TotalPremium:", r2_rf_poly)


  df = pd.read_csv(file_path, delimiter='|')
  result = getattr(ufunc, method)(*inputs, **kwargs)


Enhanced Linear Regression - Mean Squared Error for TotalPremium: 56243.334935469145
Enhanced Linear Regression - R-squared for TotalPremium: -0.06048970841964496
Enhanced Random Forest - Mean Squared Error for TotalPremium: 56165.11596915949
Enhanced Random Forest - R-squared for TotalPremium: -0.059014859730999625


In [4]:
# Enhanced Linear Regression Model
# NOTE: this cell defines none of its inputs. X_train_poly, X_test_poly,
# y_train_premium_log and y_test_premium (plus the imported names) must already
# exist in the kernel from a data-preparation cell.
model_premium_poly = LinearRegression().fit(X_train_poly, y_train_premium_log)

# Predictions come out on the log1p scale; expm1 converts them back before scoring.
y_pred_premium_log = model_premium_poly.predict(X_test_poly)
y_pred_premium = np.expm1(y_pred_premium_log)

mse_premium_poly = mean_squared_error(y_true=y_test_premium, y_pred=y_pred_premium)
r2_premium_poly = r2_score(y_true=y_test_premium, y_pred=y_pred_premium)

print("Enhanced Linear Regression - Mean Squared Error for TotalPremium:", mse_premium_poly)
print("Enhanced Linear Regression - R-squared for TotalPremium:", r2_premium_poly)

# Random Forest Model
# Default hyperparameters with a fixed seed. The inputs (X_train_poly,
# y_train_premium_log, X_test_poly, y_test_premium) are not defined in this
# cell and must already exist in the kernel.
forest_premium = RandomForestRegressor(random_state=42).fit(X_train_poly, y_train_premium_log)

# Predict on the log1p scale, then invert with expm1 so the metrics are
# computed against the untransformed TotalPremium values.
y_pred_rf_log = forest_premium.predict(X_test_poly)
y_pred_rf = np.expm1(y_pred_rf_log)

mse_rf_poly = mean_squared_error(y_true=y_test_premium, y_pred=y_pred_rf)
r2_rf_poly = r2_score(y_true=y_test_premium, y_pred=y_pred_rf)

print("Enhanced Random Forest - Mean Squared Error for TotalPremium:", mse_rf_poly)
print("Enhanced Random Forest - R-squared for TotalPremium:", r2_rf_poly)

Enhanced Linear Regression - Mean Squared Error for TotalPremium: 56243.334935469145
Enhanced Linear Regression - R-squared for TotalPremium: -0.06048970841964496
Enhanced Random Forest - Mean Squared Error for TotalPremium: 56140.73912087744
Enhanced Random Forest - R-squared for TotalPremium: -0.05855522488259535


In [5]:
# Import Libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# Path to the .txt file
file_path = 'D:/week3 data/MachineLearningRating_v3.txt'

# Load the data into a DataFrame with pipe delimiter.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The recorded output of this cell shows a warning
# raised at this call, presumably the mixed-dtype DtypeWarning that chunked
# inference produces -- TODO confirm against the full warning text.
df = pd.read_csv(file_path, delimiter='|', low_memory=False)

# Feature Engineering: Create a feature representing the age of the vehicle.
# The reference year is a fixed constant (not "today") so re-runs are reproducible.
REFERENCE_YEAR = 2024
df['VehicleAge'] = REFERENCE_YEAR - df['RegistrationYear']

# Select relevant features
selected_features = ['cubiccapacity', 'kilowatts', 'VehicleAge']

# Prepare Training and Test Sets
from sklearn.model_selection import train_test_split

# Define target variable and features
X = df[selected_features]
y_premium = df['TotalPremium']

# Drop rows where the target variable has NaN values.
# One shared mask keeps X and y aligned row for row.
target_present = y_premium.notna()
X = X[target_present]
y_premium = y_premium[target_present]

# Split the data (70% train / 30% test, fixed seed)
X_train, X_test, y_train_premium, y_test_premium = train_test_split(X, y_premium, test_size=0.3, random_state=42)

# Sample a smaller subset to fit into memory.
# The sample size is capped at the number of available training rows, because
# DataFrame.sample raises when n exceeds the population (without replacement).
n_train_samples = min(10000, len(X_train))
X_train_sampled = X_train.sample(n=n_train_samples, random_state=42)
y_train_premium_sampled = y_train_premium.loc[X_train_sampled.index]

# Impute Missing Values for Selected Features.
# The imputer is fitted on the training sample only and then applied to the test set.
imputer = SimpleImputer(strategy='mean')
X_train_sampled_small_imputed = imputer.fit_transform(X_train_sampled)
X_test_small_imputed = imputer.transform(X_test)

# Polynomial Features (degree 2, squares and pairwise interactions, no bias column)
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train_sampled_small_imputed)
X_test_poly = poly.transform(X_test_small_imputed)

# Log Transformation.
# log1p is undefined for values below -1 (NaN) and diverges at exactly -1 (-inf).
# Those cases are handled explicitly by the finite-value filter below, so the
# floating-point warnings are silenced here instead of leaking into the output.
with np.errstate(divide='ignore', invalid='ignore'):
    y_train_premium_log = np.log1p(y_train_premium_sampled)
    # Kept for parity with the training target; it may still contain NaN/-inf
    # and is not filtered here.
    y_test_premium_log = np.log1p(y_test_premium)

# Ensure the dimensions match: keep only training rows whose log target is finite.
# np.isfinite rejects both NaN and +/-inf; filtering on NaN alone would let a
# -inf target through to the model fit.
non_nan_indices = np.isfinite(y_train_premium_log)
# X_train_poly is a NumPy array, so index it with a plain boolean array (positional).
X_train_poly = X_train_poly[non_nan_indices.to_numpy()]
y_train_premium_log = y_train_premium_log[non_nan_indices]

# Train XGBoost Model
# Gradient-boosted regressor with library defaults and a fixed seed, trained on
# the polynomial features against the log1p-transformed target.
xgb_premium = xgb.XGBRegressor(random_state=42)
xgb_premium.fit(X_train_poly, y_train_premium_log)

# Predict on the test set: the model outputs log1p-scale values, so expm1 is
# applied to get predictions comparable with the untransformed TotalPremium.
y_pred_xgb_log = xgb_premium.predict(X_test_poly)
y_pred_xgb = np.expm1(y_pred_xgb_log)

# Evaluate the model against the untransformed test target
mse_xgb_poly = mean_squared_error(y_true=y_test_premium, y_pred=y_pred_xgb)
r2_xgb_poly = r2_score(y_true=y_test_premium, y_pred=y_pred_xgb)

print("XGBoost - Mean Squared Error for TotalPremium:", mse_xgb_poly)
print("XGBoost - R-squared for TotalPremium:", r2_xgb_poly)

  df = pd.read_csv(file_path, delimiter='|')
  result = getattr(ufunc, method)(*inputs, **kwargs)


XGBoost - Mean Squared Error for TotalPremium: 56275.01480341622
XGBoost - R-squared for TotalPremium: -0.06108704451218849
