In [1]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Load the dataset into a pandas DataFrame
df = pd.read_csv('car_listings.csv')

print("\nCounts before removing outliers and na values:")
print(df[['price', 'manufacturer', 'odometer', 'year']].count())


# Drop rows with missing values in the 'price', 'manufacturer', 'odometer', and 'year' columns
#df_filtered = df.dropna(subset=['price', 'manufacturer', 'odometer', 'year'])
df_filtered = df.dropna(subset=['price', 'manufacturer', 'odometer', 'year', 'type'])


# Calculate the mean of the odometer readings
average_odometer_mean = df_filtered['odometer'].mean()
print(f"The average odometer reading is: {average_odometer_mean}")

# Remove outliers for 'price' based on the z-score
# Calculate z-scores of 'price'
z_scores_price = np.abs(zscore(df_filtered['price']))
# Keep rows with z-scores less than 3
df_filtered = df_filtered[(z_scores_price < 3)]

# Remove negative prices, if any
df_filtered = df_filtered[df_filtered['price'] >= 0]

# Perform the same outlier removal for 'odometer' and 'year'
# Calculate z-scores for 'odometer' and 'year'
z_scores_odometer = np.abs(zscore(df_filtered['odometer']))
z_scores_year = np.abs(zscore(df_filtered['year']))

# Filter out the outliers
df_filtered = df_filtered[(z_scores_odometer < 3) & (z_scores_year < 3)]

# After removing outliers, you may want to check the mean values again
average_odometer_mean = df_filtered['odometer'].mean()
average_price_mean = df_filtered['price'].mean()
average_year_mean = df_filtered['year'].mean()

# Print out the cleaned mean values
print(f"Cleaned average odometer reading: {average_odometer_mean}")
print(f"Cleaned average price: {average_price_mean}")
print(f"Cleaned average year: {average_year_mean}")

# Display counts after removing outliers
print("\nCounts after removing outliers and na values:")
print(df_filtered[['price', 'manufacturer', 'odometer', 'year', 'type']].count())

print("\nCounts of NA values for 'price', 'year', 'odometer', 'manufacturer', 'type' before dropping missing values and outliers:")
print(df[['price', 'year', 'odometer', 'manufacturer', 'type']].isna().sum())


print("\nCounts of NA values for 'price', 'year', 'odometer', 'manufacturer', 'type' after dropping missing values and outliers:")
print(df_filtered[['price', 'year', 'odometer', 'manufacturer', 'type']].isna().sum())

# Encode the 'manufacturer' target variable
#label_encoder = LabelEncoder()
#y = label_encoder.fit_transform(df_filtered['manufacturer'])

# Apply one-hot encoding to categorical features including 'type'
#X = pd.get_dummies(df_filtered.drop(['manufacturer'], axis=1))

# Ready for further analysis or model training

label_encoder = LabelEncoder()
df_filtered['type'] = label_encoder.fit_transform(df_filtered['type'])

# The data is now cleaned and ready for further analysis or model training.



Counts before removing outliers and na values:
price           426853
manufacturer    409234
odometer        422480
year            425675
dtype: int64
The average odometer reading is: 90087.63958015172
Cleaned average odometer reading: 87101.48424280442
Cleaned average price: 18657.11817339443
Cleaned average year: 2012.9153152235176

Counts after removing outliers and na values:
price           314047
manufacturer    314047
odometer        314047
year            314047
type            314047
dtype: int64

Counts of NA values for 'price', 'year', 'odometer', 'manufacturer', 'type' before dropping missing values and outliers:
price               0
year             1178
odometer         4373
manufacturer    17619
type            92831
dtype: int64

Counts of NA values for 'price', 'year', 'odometer', 'manufacturer', 'type' after dropping missing values and outliers:
price           0
year            0
odometer        0
manufacturer    0
type            0
dtype: int64


In [2]:
features = df_filtered[['price', 'year', 'odometer', 'type']]
target = df_filtered['manufacturer']

from sklearn.model_selection import train_test_split

feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)


In [3]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feature_train = sc.fit_transform(feature_train)
feature_test = sc.transform(feature_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Logistic Regression Training

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression(solver='lbfgs')
logreg.fit(feature_train, target_train)

predictions = logreg.predict(feature_test)

print("\nPredictions:")
print(predictions)

accuracy = accuracy_score(target_test, predictions)
print("Accuracy:", accuracy)


Predictions:
['ford' 'ford' 'ford' ... 'ford' 'ford' 'ford']
Accuracy: 0.16540890516372128


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree Training

In [5]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=1)
decision_tree.fit(feature_train, target_train)

predictions = decision_tree.predict(feature_test)
accuracy = accuracy_score(target_test, predictions)
print(predictions)
print("Accuracy:", accuracy)

['chevrolet' 'ford' 'volkswagen' ... 'subaru' 'lexus' 'ram']
Accuracy: 0.6258239133895876


### Random Forest Training

In [6]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=1)
random_forest.fit(feature_train, target_train)

predictions = random_forest.predict(feature_test)
accuracy = accuracy_score(target_test, predictions)
print(predictions)
print("Accuracy:", accuracy)

['chevrolet' 'ford' 'volkswagen' ... 'subaru' 'honda' 'ram']
Accuracy: 0.6412354720585894


In [7]:
from sklearn.model_selection import GridSearchCV
import psutil

# Logistic Regression

# Assuming your dataset is a pandas DataFrame 'df'
sample_df = df_filtered.sample(frac=0.1, random_state=0)  # Sampling 10% of the data

# Now split this sampled data into features and target, and then into training and test sets
feature_sample = sample_df[['price', 'year', 'odometer', 'type']]
target_sample = sample_df['manufacturer']

feature_train_sample, feature_test_sample, target_train_sample, target_test_sample = train_test_split(feature_sample, target_sample, test_size=0.3, random_state=0)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
feature_train_sample = sc.fit_transform(feature_train_sample)
feature_test_sample = sc.transform(feature_test_sample)

param_grid = {
    #'C': [0.01, 0.1, 1, 10, 100],
    'C': [0.01, 0.1, 10],
    'penalty': ['l2', 'none'],
    'solver': ['lbfgs', 'saga']
    #'C': [0.1, 1, 10],
    #'penalty': ['l2'],
}

before_memory = psutil.virtual_memory().used / (1024 ** 2)

#grid_search = GridSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), param_grid, cv=5, scoring='accuracy')
grid_search = GridSearchCV(LogisticRegression(max_iter=500), param_grid, cv=3, scoring='accuracy')
grid_search.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

# Calculate memory usage increase
memory_increase = after_memory - before_memory

print(f"Memory Usage Increase: {memory_increase} MB")

print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)

best_grid = grid_search.best_estimator_
grid_predictions = best_grid.predict(feature_test_sample)
print("Grid Search Test Accuracy: ", accuracy_score(target_test_sample, grid_predictions))


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown 

Memory Usage Increase: -183.61328125 MB
Best Parameters:  {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}
Best Score:  0.1736796208873523
Grid Search Test Accuracy:  0.1686478454680535


In [8]:
from sklearn.model_selection import RandomizedSearchCV

#randomized_search = RandomizedSearchCV(LogisticRegression(solver='lbfgs', max_iter=1000), param_distributions=param_grid, n_iter=5, cv=3, scoring='accuracy', random_state=0)
before_memory = psutil.virtual_memory().used / (1024 ** 2)


randomized_search = RandomizedSearchCV(LogisticRegression(max_iter=500), param_distributions=param_grid, n_iter=5, cv=3, scoring='accuracy', random_state=0)
randomized_search.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

# Calculate memory usage increase
memory_increase = after_memory - before_memory

print(f"Memory Usage Increase: {memory_increase} MB")

print("Best Parameters: ", randomized_search.best_params_)
print("Best Score: ", randomized_search.best_score_)

best_random = randomized_search.best_estimator_
random_predictions = best_random.predict(feature_test_sample)
print("Randomized Search Test Accuracy: ", accuracy_score(target_test_sample, random_predictions))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Memory Usage Increase: 800.80859375 MB
Best Parameters:  {'solver': 'lbfgs', 'penalty': 'l2', 'C': 0.1}
Best Score:  0.17290639951256037
Randomized Search Test Accuracy:  0.17013372956909362


In [9]:
param_grid_dt = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

before_memory = psutil.virtual_memory().used / (1024 ** 2)

grid_search_dt = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

# Calculate memory usage increase
memory_increase = after_memory - before_memory


print(f"Memory Usage Increase: {memory_increase} MB")

print("Decision Tree Best Parameters: ", grid_search_dt.best_params_)
print("Decision Tree Best Score: ", grid_search_dt.best_score_)

best_dt = grid_search_dt.best_estimator_
dt_predictions = best_dt.predict(feature_test_sample)
print("Decision Tree Test Accuracy: ", accuracy_score(target_test_sample, dt_predictions))




Memory Usage Increase: -69.75 MB
Decision Tree Best Parameters:  {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2}
Decision Tree Best Score:  0.3375335114540624
Decision Tree Test Accuracy:  0.3593716832944173


In [10]:
before_memory = psutil.virtual_memory().used / (1024 ** 2)

randomized_search_dt = RandomizedSearchCV(DecisionTreeClassifier(random_state=0), param_distributions=param_grid_dt, n_iter=20, cv=5, scoring='accuracy', random_state=0)
randomized_search_dt.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

# Calculate memory usage increase
memory_increase = after_memory - before_memory

print(f"Memory Usage Increase: {memory_increase} MB")

# Print the best parameters found by Randomized Search
print("Decision Tree Best Parameters: ", randomized_search_dt.best_params_)
print("Decision Tree Best Score: ", randomized_search_dt.best_score_)

# Use the best estimator to make predictions
best_dt_random = randomized_search_dt.best_estimator_
dt_random_predictions = best_dt_random.predict(feature_test_sample)

# Evaluate the predictions
print("Decision Tree (Randomized Search) Test Accuracy: ", accuracy_score(target_test_sample, dt_random_predictions))




Memory Usage Increase: 17.3984375 MB
Decision Tree Best Parameters:  {'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 30}
Decision Tree Best Score:  0.3375335114540624
Decision Tree (Randomized Search) Test Accuracy:  0.3593716832944173


In [11]:
param_grid_rf = {
    'n_estimators': [10, 50 ,100],
    #'max_depth': [None, 20],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10]
}

before_memory = psutil.virtual_memory().used / (1024 ** 2)

grid_search_rf = GridSearchCV(RandomForestClassifier(random_state=0), param_grid_rf, cv=3, scoring='accuracy')
grid_search_rf.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

# Calculate memory usage increase
memory_increase = after_memory - before_memory


print(f"Memory Usage Increase: {memory_increase} MB")

print("Random Forest Best Parameters: ", grid_search_rf.best_params_)
print("Random Forest Best Score: ", grid_search_rf.best_score_)

best_rf = grid_search_rf.best_estimator_
rf_predictions = best_rf.predict(feature_test_sample)
print("Random Forest Test Accuracy: ", accuracy_score(target_test_sample, rf_predictions))




Memory Usage Increase: -36.98046875 MB
Random Forest Best Parameters:  {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Random Forest Best Score:  0.3570941776035635
Random Forest Test Accuracy:  0.38017406070897897


In [12]:
before_memory = psutil.virtual_memory().used / (1024 ** 2)

randomized_search_rf = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_distributions=param_grid_rf, n_iter=20, cv=3, scoring='accuracy', random_state=0)
randomized_search_rf.fit(feature_train_sample, target_train_sample)

after_memory = psutil.virtual_memory().used / (1024 ** 2)  # Convert to MB

memory_increase = after_memory - before_memory

print(f"Memory Usage Increase: {memory_increase} MB")

# Print the best parameters found by Randomized Search
print("Random Forest Best Parameters: ", randomized_search_rf.best_params_)
print("Random Forest Best Score: ", randomized_search_rf.best_score_)

# Use the best estimator to make predictions
best_rf_random = randomized_search_rf.best_estimator_
rf_random_predictions = best_rf_random.predict(feature_test_sample)

# Evaluate the predictions
print("Random Forest (Randomized Search) Test Accuracy: ", accuracy_score(target_test_sample, rf_random_predictions))




Memory Usage Increase: 289.34765625 MB
Random Forest Best Parameters:  {'n_estimators': 100, 'min_samples_split': 5, 'max_depth': 30}
Random Forest Best Score:  0.3549561722023129
Random Forest (Randomized Search) Test Accuracy:  0.38410104011887075


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate_classification_model(model_name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Basic Metrics
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, average='weighted')
    recall = recall_score(y_test, predictions, average='weighted')
    f1 = f1_score(y_test, predictions, average='weighted')
    cm = confusion_matrix(y_test, predictions)

    print(f"Model: {model_name}")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-Score:", f1)
    print("Confusion Matrix:\n", cm)
    print("\n")


evaluate_classification_model("Logistic Regression", logreg, feature_train, target_train, feature_test, target_test)
evaluate_classification_model("Decision Tree", decision_tree, feature_train, target_train, feature_test, target_test)
evaluate_classification_model("Random Forest", random_forest, feature_train, target_train, feature_test, target_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Model: Logistic Regression
Accuracy: 0.16540890516372128
Precision: 0.08048669033681075
Recall: 0.16540890516372128
F1-Score: 0.07483030341995761
Confusion Matrix:
 [[ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0 10  0]
 [ 0  0  0 ...  0  6  0]
 [ 0  0  0 ...  0  1  0]]


Model: Decision Tree
Accuracy: 0.6258239133895876
Precision: 0.6260713171156923
Recall: 0.6258239133895876
F1-Score: 0.625893204100852
Confusion Matrix:
 [[1115    3    0 ...   75    8    9]
 [   1  210    0 ...    0    1    1]
 [   0    0    2 ...    0    0    0]
 ...
 [  60    0    0 ... 4898  102   31]
 [   6    1    0 ...   88 1363    8]
 [   6    1    0 ...   26   13  512]]


Model: Random Forest
Accuracy: 0.6412354720585894
Precision: 0.6422372449535945
Recall: 0.6412354720585894
F1-Score: 0.6410339120880725
Confusion Matrix:
 [[1117    3    0 ...   79    6    9]
 [   0  211    0 ...    0    2    1]
 [   0    0    1 ...    0    0    0]
 ...
 [  43    1    0 ... 5