In [1]:
#Imports
from sklearn.feature_extraction import FeatureHasher
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd

In [3]:
# Load the listings, discard the 'Unnamed: 0' and 'model' columns, and expose
# the row index as an explicit 'id' column.
df = (
    pd.read_csv('filtered_cars.csv')
    .drop(columns=['Unnamed: 0', 'model'])
    .rename_axis('id')
    .reset_index()
)

# Preview the first rows
print(df.head())


   id  price    year manufacturer  condition    cylinders   fuel  odometer  \
0   0  24990  2018.0        acura       good  6 cylinders  other   45464.0   
1   1   2399  2003.0        acura       fair  6 cylinders    gas  143000.0   
2   2  10950  2012.0        acura       good  4 cylinders    gas  115162.0   
3   3  30995  2018.0        acura   like new  6 cylinders    gas   48743.0   
4   4  15995  2012.0        acura  excellent  6 cylinders    gas   58265.0   

  title_status transmission drive   type paint_color state standard_model  
0        clean        other   fwd  sedan        blue    oh            tlx  
1        clean    automatic   4wd    SUV        grey    nj            mdx  
2        clean    automatic   fwd  sedan       white    ks            tsx  
3        clean    automatic   4wd    SUV        grey    nj            mdx  
4        clean    automatic   4wd    SUV       white    ok            mdx  


In [4]:
#Feature hashing
# Every categorical column is hashed on its own into a fixed-width block of
# numeric columns named "<column>_hash_<i>"; the original categorical columns
# are then removed.
categorical_columns = ['manufacturer', 'condition', 'cylinders', 'fuel', 'title_status', 
                       'transmission', 'drive', 'type', 'paint_color', 'state', 'standard_model']

# Width of each hashed block. Shared by the hasher and by the generated column
# names so the two cannot drift apart.
n_hash_features = 32

# Build one hashed frame per categorical column
hashed_blocks = []
for col in categorical_columns:
    fh = FeatureHasher(n_features=n_hash_features, input_type='string')
    # Each row is handed to the hasher as a one-element sequence holding the
    # value's string form.
    hashed_features = fh.fit_transform(df[[col]].astype(str).values)
    hashed_blocks.append(
        pd.DataFrame(
            hashed_features.toarray(),
            columns=[f"{col}_hash_{i}" for i in range(n_hash_features)],
            index=df.index,  # keep rows aligned with df during the column-wise concat
        )
    )

# Concatenate once (instead of regrowing the frame on every iteration) and
# drop the original categorical columns. `df` itself is left untouched.
df_hashed = pd.concat([df, *hashed_blocks], axis=1).drop(columns=categorical_columns)

# Display the resulting DataFrame
print(df_hashed)

            id  price    year  odometer  manufacturer_hash_0  \
0            0  24990  2018.0   45464.0                  0.0   
1            1   2399  2003.0  143000.0                  0.0   
2            2  10950  2012.0  115162.0                  0.0   
3            3  30995  2018.0   48743.0                  0.0   
4            4  15995  2012.0   58265.0                  0.0   
...        ...    ...     ...       ...                  ...   
109262  109262   6500  2007.0  110000.0                  0.0   
109263  109263   8600  2012.0   99813.0                  0.0   
109264  109264   1000  2004.0  140254.0                  0.0   
109265  109265   3995  2004.0  254484.0                  0.0   
109266  109266   4995  2008.0   91008.0                  0.0   

        manufacturer_hash_1  manufacturer_hash_2  manufacturer_hash_3  \
0                       0.0                  0.0                  0.0   
1                       0.0                  0.0                  0.0   
2           

In [5]:
#Scale Numerical Data
# Standardize the numeric columns of df_hashed in place.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed in a later cell - confirm this is acceptable for the model used.
numerical_features = ['odometer','year']
scaler = StandardScaler().fit(df_hashed[numerical_features])
df_hashed[numerical_features] = scaler.transform(df_hashed[numerical_features])

In [6]:
# Assemble the feature matrix and target without mutating df_hashed.
# 'price' (the target) and 'id' (a row identifier) must exist and are removed;
# 'predicted_price' is only present when the prediction cell has already been
# run in this kernel, so it is removed if present and ignored otherwise.
X = (
    df_hashed
    .drop(columns=['price', 'id'])
    .drop(columns=['predicted_price'], errors='ignore')
)
y = df_hashed['price']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit a 100-tree random forest on the training portion
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Score the model on the held-out rows
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

Mean Squared Error: 16887487.942956354
R-squared: 0.903905051708404


In [7]:
# Score every row with the fitted random forest and store the result next to
# the actual price. X holds the rows the model was trained on as well as the
# held-out rows, so predictions for training rows are in-sample.
all_predictions = rf.predict(X)
df_hashed['predicted_price'] = all_predictions

# Show actual vs. predicted prices side by side
print(df_hashed.loc[:, ['price', 'predicted_price']])

        price  predicted_price
0       24990     24797.200000
1        2399      2264.280000
2       10950     11002.473214
3       30995     32844.340000
4       15995     15942.640000
...       ...              ...
109262   6500      7802.530000
109263   8600      7787.510000
109264   1000      1364.000000
109265   3995      4155.112667
109266   4995      4995.000000

[109267 rows x 2 columns]


In [8]:
# Read the per-feature importances from the fitted forest
feature_importances = rf.feature_importances_

# Pair each training column with its importance, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

# Show the ranked table
print(feature_importance_df)

                  Feature  Importance
0                    year    0.398194
1                odometer    0.145763
91      cylinders_hash_25    0.109074
126          fuel_hash_28    0.047981
204         drive_hash_10    0.043215
..                    ...         ...
197          drive_hash_3    0.000000
198          drive_hash_4    0.000000
199          drive_hash_5    0.000000
200          drive_hash_6    0.000000
177  transmission_hash_15    0.000000

[354 rows x 2 columns]


In [7]:
# Keep only the features whose importance is strictly above 0.005
is_important = feature_importance_df['Importance'].gt(0.005)
important_features_df = feature_importance_df.loc[is_important]
print(important_features_df)

                    Feature  Importance
0                      year    0.398194
1                  odometer    0.145763
91        cylinders_hash_25    0.109074
126            fuel_hash_28    0.047981
204           drive_hash_10    0.043215
77        cylinders_hash_11    0.041257
339  standard_model_hash_17    0.007837
186    transmission_hash_24    0.006603
19     manufacturer_hash_17    0.006440
224           drive_hash_30    0.006272
11      manufacturer_hash_9    0.006191
15     manufacturer_hash_13    0.005769
35         condition_hash_1    0.005354
239            type_hash_13    0.005214


In [8]:
#Create new dataframe with Random Forest predicted prices
# Left-join the predictions onto the un-hashed frame `df` using the shared 'id' column.
predictions_by_id = df_hashed[['id', 'predicted_price']]
df_merged = df.merge(predictions_by_id, how='left', on='id')
df_merged.head()

Unnamed: 0,id,price,year,manufacturer,condition,cylinders,fuel,odometer,title_status,transmission,drive,type,paint_color,state,standard_model,predicted_price
0,0,24990,2018.0,acura,good,6 cylinders,other,45464.0,clean,other,fwd,sedan,blue,oh,tlx,24797.2
1,1,2399,2003.0,acura,fair,6 cylinders,gas,143000.0,clean,automatic,4wd,SUV,grey,nj,mdx,2264.28
2,2,10950,2012.0,acura,good,4 cylinders,gas,115162.0,clean,automatic,fwd,sedan,white,ks,tsx,11002.473214
3,3,30995,2018.0,acura,like new,6 cylinders,gas,48743.0,clean,automatic,4wd,SUV,grey,nj,mdx,32844.34
4,4,15995,2012.0,acura,excellent,6 cylinders,gas,58265.0,clean,automatic,4wd,SUV,white,ok,mdx,15942.64


In [9]:
# Write the three result tables to CSV, omitting the DataFrame index in each file.
for csv_path, frame in (
    ('predicted_prices.csv', df_merged),
    ('important_features.csv', important_features_df),
    ('df_hashed.csv', df_hashed),
):
    frame.to_csv(csv_path, index=False)