# Clean Used Cars Dataset (Craigslist)
Source: https://www.kaggle.com/datasets/austinreese/craigslist-carstrucks-data?select=vehicles.csv

In [None]:
import pandas as pd
import numpy as np

In [None]:
dataset_path = '/home/pico/code/car_mini_project/vehicles_craigslist.csv'
raw_df = pd.read_csv(dataset_path)

### Check stats of the features

In [None]:
raw_df.describe(include='object')

#### Notes:

* Some categories are unbalanced (`transmission`, `title_status`, `fuel`).
* `cylinders` must be converted to a number.

In [None]:
raw_df.describe()

### Remove unused columns

In [None]:
cl_df = raw_df.drop(columns=['url', 'region', 
                    'region_url', 'title_status', 
                    'VIN', 'image_url', 'description', 
                    'state', 'posting_date',
                    'id', 'county'])

### Check types in dataset

In [None]:
cl_df.dtypes

`cylinders` should be a number: extract the digits from its string values and store them in a numeric column.

In [None]:
cl_df.tail()

In [None]:
def extract_numeric(s):
    """Extract the integer encoded by the decimal digits found in ``s``.

    Args:
        s: Typically a string such as ``'8 cylinders'``. Missing values
            (anything ``pd.isna`` reports as missing) and plain numbers are
            accepted as well.

    Returns:
        The ``int`` formed by concatenating every decimal digit of ``s``
        (``'8 cylinders'`` gives ``8``); the truncated value when ``s`` is
        already a finite number; ``np.nan`` when ``s`` is missing, is a
        non-finite number, or contains no digits at all.
    """
    if pd.isna(s):
        return np.nan

    # A value that is already numeric cannot be iterated character by
    # character, so convert it directly instead of raising TypeError.
    if isinstance(s, (int, float, np.integer, np.floating)):
        return int(s) if np.isfinite(s) else np.nan

    # str.isdecimal, unlike str.isdigit, rejects characters such as '²'
    # that int() is unable to parse.
    numeric_str = ''.join(ch for ch in str(s) if ch.isdecimal())
    return int(numeric_str) if numeric_str else np.nan

In [None]:
# Apply the function to the column
cl_df['n_cylinders'] = cl_df['cylinders'].apply(extract_numeric)

In [None]:
# remove string column
cl_df.drop(columns=['cylinders'], inplace=True)

In [None]:
cl_df.tail()

### Show counts of categorical variables

In [None]:
# Show counts for all categorical features
for col in cl_df.select_dtypes(include=['category', 'object']).columns:
    print(f"Counts for {col}:")
    print(cl_df[col].value_counts())
    print()

### Check outliers

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def draw_boxplots(df, n_cols=3):
    """Draw one boxplot per numeric column of ``df`` on a grid of subplots.

    Args:
        df: DataFrame whose numeric columns are plotted.
        n_cols: Number of subplot columns in the grid (defaults to 3).

    Returns:
        None. The figure is built through the pyplot interface; nothing is
        drawn when ``df`` has no numeric columns.
    """
    # Select only numeric features
    numeric_features = df.select_dtypes(include=['number']).columns
    n_features = len(numeric_features)
    if n_features == 0:
        # A grid with zero rows cannot be created, so there is nothing to draw.
        return

    # Set the style of seaborn
    sns.set(style="whitegrid")

    # Number of grid rows needed to fit every feature
    n_rows = int(np.ceil(n_features / n_cols))

    # squeeze=False always yields a 2-D array of axes, so flatten() is valid
    # for every grid shape (including a 1x1 grid).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    # One boxplot per numeric feature, titled with the column name
    for ax, column in zip(axes, numeric_features):
        sns.boxplot(y=df[column], ax=ax)
        ax.set_title(column)

    # Remove the grid cells left over when the grid is larger than needed
    for ax in axes[n_features:]:
        fig.delaxes(ax)

    plt.tight_layout()  # Adjust layout to prevent overlap

In [None]:
cl_df.shape

In [None]:
draw_boxplots(cl_df)

In [None]:
def count_outliers(df):
    """Count the IQR outliers in every numeric column of ``df``.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``, where Q1 and Q3 are the column's 25th
    and 75th percentiles.

    Args:
        df: DataFrame to inspect; non-numeric columns are ignored.

    Returns:
        A dict mapping each numeric column name to its number of outliers.
    """
    counts = {}

    for name in df.select_dtypes(include=['number']).columns:
        values = df[name]

        # Quartiles and the spread between them
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1

        # Fences beyond which a value is considered an outlier
        low = q1 - 1.5 * spread
        high = q3 + 1.5 * spread

        is_outlier = (values < low) | (values > high)
        counts[name] = int(is_outlier.sum())

    return counts

# Count outliers
outliers = count_outliers(cl_df)

# Display the number of outliers for each feature
print("Number of outliers in each feature:")
for feature, count in outliers.items():
    print(f"{feature}: {count}")

### Remove Outliers

In [None]:
def remove_outliers(df):
    """Return a copy of ``df`` without the rows holding IQR outliers.

    Numeric columns are processed one after another. For each column the
    fences ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` are computed on the rows
    that survived the previous columns, rows with a value outside the fences
    are removed, and the number of rows kept is printed.

    Missing values are not outliers and are kept, which matches how
    ``count_outliers`` counts; handling NaNs is left to the caller.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame containing only the rows that were kept.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    df_cleaned = df.copy()

    for column in df_cleaned.select_dtypes(include=['number']).columns:
        values = df_cleaned[column]

        # Quartiles and interquartile range of the rows still present
        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Comparisons with NaN are False, so missing values must be kept
        # explicitly or they would be dropped as if they were outliers.
        keep = ((values >= lower_bound) & (values <= upper_bound)) | values.isna()
        print(f'{column}: {keep.sum()}')
        df_cleaned = df_cleaned[keep]

    return df_cleaned

In [None]:
no_df = remove_outliers(cl_df)

In [None]:
no_df.shape

In [None]:
draw_boxplots(no_df)

### Check NaNs

In [None]:
cl_df.isna().sum().sort_values(ascending=False)

In [None]:
# First fast approach
# Remove rows that contain any NaN values
cl_df = cl_df.dropna()

# Remove model column (it has too many distinct values)
cl_df = cl_df.drop(columns=['model'])

In [None]:
cl_df.shape

## Create first random forest model to check feature importance

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

### One-hot encoding for categorical features

In [None]:
# Identify categorical columns
categorical_cols = cl_df.select_dtypes(include=['object', 'category']).columns

In [None]:
# Convert categorical columns to one-hot encoding
encoded_df = pd.get_dummies(cl_df, columns=categorical_cols)

In [None]:
encoded_df.shape

In [None]:
# Separate the predictors (X) from the target column 'price' (y)
X = encoded_df.drop(columns=['price'])
y = encoded_df['price']

# Random (non-stratified) split: 20% of the rows are held out for testing,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)

# Calculate errors
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'MAE: {mae}, MSE: {mse}, RMSE: {rmse}')

In [None]:
y_pred

In [None]:
abs(y_test - y_pred).sum() / y_test.shape[0]

In [None]:
y_test.shape

In [None]:
cl_df['drive'].value_counts()

In [None]:
# 'model' was dropped from cl_df during NaN handling, so read it from raw_df
raw_df['model'].value_counts()

In [None]:
#for i in cl_df['model'].unique():
#    print(i)

In [None]:
v_df = pd.read_csv('/home/pico/code/car_mini_project/all-vehicles-model.csv', delimiter=';')
v_df.columns

In [None]:
v_df.describe(include='all')

In [None]:
cl_df['manufacturer'].unique()

In [None]:
cl_df.isna().sum()

In [None]:
cl_df[cl_df['manufacturer'].isna()]

In [None]:
v_df['Make'].unique()

In [None]:
cl_df[ cl_df['manufacturer'] == 'ford']

In [None]:
v_df[ v_df['Make'] == 'Ford']