In [8]:
# First we import the necessary libraries
import pandas as pd
pd.set_option("display.precision", 4)
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.formula.api import ols
import plotnine as p9
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:.4f}'.format
pd.options.mode.chained_assignment = None  # default='warn'
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))

---

### 1 Question [30 points]
#### The file eBayAuctions.csv contains information on 1972 auctions transacted on eBay.com during May–June 2004. The goal is to use these data to build a model that will distinguish competitive auctions from noncompetitive ones. A competitive auction is defined as an auction with at least two bids placed on the item being auctioned. The data include variables that describe the item (auction category), the seller (his or her eBay rating), and the auction terms that the seller selected (auction duration, opening price, currency, day of week of auction close). In addition, we have the price at which the auction closed. The goal is to predict whether or not an auction of interest will be competitive.

#### Pull the relevant data

In [4]:
data = pd.read_csv("eBayAuctions.csv")
data.head(5)

Unnamed: 0,Category,currency,sellerRating,Duration,endDay,ClosePrice,OpenPrice,Competitive?
0,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
1,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
2,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
3,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0
4,Music/Movie/Game,US,3249,5,Mon,0.01,0.01,0


#### Split the data into training and test sets

In [17]:
#Partition the data into training (60%) and validation (40%) set.
train_df, test_df = train_test_split(data, test_size=0.4, random_state=123)

### 1.1 Discuss if you can use all the variables to predict at the start of an auction whether it will be competitive.

To determine whether we can use all the given variables to predict whether an auction will be competitive at the start, we need to consider the relevance and relationship of each variable to the target variable (i.e., "Competitive?").

- Category: The type of product being sold could potentially have an impact on competitiveness, as some categories may attract more bidders than others.

- Currency: The currency used to price the item may impact the competitiveness, as different currencies can affect the price and thus bidding behavior.

- SellerRating: The rating of the seller could have an impact on the number of bidders and the competitiveness of the auction, as buyers may be more likely to bid on items from more reputable sellers.

- Duration: The length of the auction could impact competitiveness, as longer auctions may attract more bidders and lead to more competition.

- EndDay: The day of the week on which the auction ends could also impact competitiveness, as some days may attract more bidders than others.

- ClosePrice: The final price of the auction will directly reflect the level of competitiveness. However, it is not a variable that can be used to predict competitiveness at the start of the auction.

- OpenPrice: The starting price of the auction could have an impact on competitiveness, as a low starting price could attract more bidders and lead to more competition.

In [42]:
from boruta import BorutaPy
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold

# Encode the categorical variables using label encoding
le = LabelEncoder()

data['Category'] = le.fit_transform(data['Category'])
data['currency'] = le.fit_transform(data['currency'])
data['endDay'] = le.fit_transform(data['endDay'])
train_df, test_df = train_test_split(data, test_size=0.4, random_state=123)


# Separate the predictor variables and the target variable
X_train = train_df.drop('Competitive?', axis=1)
y_train = train_df['Competitive?']
X_test = test_df.drop('Competitive?', axis=1)
y_test = test_df['Competitive?']

# Standardize the numerical variables using z-score normalization
scaler = StandardScaler()
num_cols = ['sellerRating', 'Duration', 'ClosePrice', 'OpenPrice']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Use Boruta to select the relevant variables
rf = RandomForestClassifier(n_estimators=100, random_state=42)
boruta = BorutaPy(rf, n_estimators='auto', verbose=0, random_state=42)
boruta.fit(X_train.values, y_train.values)

# Print the selected variables
selected_vars = X_train.columns[boruta.support_]
print("=============================\nSelected variables by Boruta: ", selected_vars)

# Use forward step selection to select the relevant variables
lr = LinearRegression()
sfs = SequentialFeatureSelector(lr, k_features=4, forward=True, scoring='r2', cv=5)
sfs.fit(X_train, y_train)

# Print the selected variables
selected_vars = list(sfs.k_feature_names_)
print("==============================\nSelected variables by forward step selection: ", selected_vars)

# Use lasso regression to select the relevant variables
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

# Print the selected variables
selected_vars = list(X_train.columns[lasso.coef_ != 0])
print("==============================\nSelected variables by lasso regression: ", selected_vars)


# Use LassoCV to perform hyperparameter tuning
lasso_cv = LassoCV(cv=KFold(n_splits=5))
lasso_cv.fit(X_train, y_train)

# Print the selected regularization parameter
print("=============================\nSelected regularization parameter by LassoCV: ", lasso_cv.alpha_)

# Use RFECV to perform variable selection
estimator = Lasso(alpha=lasso_cv.alpha_)
rfe_cv = RFECV(estimator, cv=KFold(n_splits=5))
rfe_cv.fit(X_train, y_train)

# Print the selected variables
selected_vars = list(X_train.columns[rfe_cv.support_])
print("Selected variables by RFECV: ", selected_vars)

# Use forward step selection to perform variable selection
estimator = Lasso(alpha=lasso_cv.alpha_)
fss_cv = RFECV(estimator, step=1, cv=KFold(n_splits=5))
fss_cv.fit(X_train, y_train)

# Print the selected variables
selected_vars = list(X_train.columns[fss_cv.support_])
print("==============================\nSelected variables by forward step selection with lassoCV reg params: ", selected_vars)


Selected variables by Boruta:  Index(['sellerRating', 'ClosePrice', 'OpenPrice'], dtype='object')
Selected variables by forward step selection:  ['Category', 'sellerRating', 'ClosePrice', 'OpenPrice']
Selected variables by lasso regression:  ['Category', 'ClosePrice', 'OpenPrice']
Selected regularization parameter by LassoCV:  0.0030904479605003638
Selected variables by RFECV:  ['ClosePrice', 'OpenPrice']
Selected variables by forward step selection with lassoCV reg params:  ['ClosePrice', 'OpenPrice']


### 1.2 Build a classification tree, a boosted tree, a bagged tree, and a random forest model (with mtry = 4). Choose the tuning parameters for these models by optimizing the performance on the validation set. Report accuracies of these four models as well as their confusion matrices on the validation set.

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

# Split the training set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_df.drop('Competitive?', axis=1), train_df['Competitive?'], test_size=0.2, random_state=42)

# Define the parameter grid for the decision tree classifier
param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [5, 10, 15],
    'criterion': ['gini', 'entropy']
}

# Train the decision tree classifier using grid search cross-validation to find the best hyperparameters
dtc = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dtc, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and their corresponding accuracy score
print("Best hyperparameters: ", grid_search.best_params_)
print("Validation accuracy: ", grid_search.best_score_)

# Use the best model to make predictions on the validation set
y_pred = grid_search.predict(X_val)

# Print the accuracy score and confusion matrix of the best model on the validation set
print("Validation accuracy: ", accuracy_score(y_val, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_val, y_pred))

# Use the best model to make predictions on the test set
X_test = test_df.drop('Competitive?', axis=1)
y_test = test_df['Competitive?']
y_pred_test = grid_search.predict(X_test)

# Print the accuracy score and confusion matrix of the best model on the test set
print("Test accuracy: ", accuracy_score(y_test, y_pred_test))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_test))


ValueError: 
All the 90 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 969, in fit
    super().fit(
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 172, in fit
    X, y = self._validate_data(
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 591, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 2064, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Music/Movie/Game'

--------------------------------------------------------------------------------
72 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 969, in fit
    super().fit(
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 172, in fit
    X, y = self._validate_data(
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/base.py", line 591, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/Users/amankrishna/opt/anaconda3/lib/python3.9/site-packages/pandas/core/generic.py", line 2064, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'Coins/Stamps'
