In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import tensorflow as tf

In [53]:
# Check if a GPU is available and set TensorFlow to use it
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

# If GPUs are available, ensure TensorFlow uses them
if tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(tf.config.experimental.list_physical_devices('GPU')[0], True)

Num GPUs Available:  0


In [54]:
# Load the datasets
train_data = pd.read_csv('content/train.csv')
test_data = pd.read_csv('content/test.csv')

In [55]:
# Separate features and target variable from training data
X_train = train_data.drop(columns=['price'])
y_train = train_data['price']

In [56]:
# Identify categorical and numerical columns
categorical_columns = ['brand', 'model', 'fuel_type', 'transmission', 'accident','engine', 'milage', 'ext_col','int_col','clean_title']
numerical_columns = ['model_year']

In [57]:
# Ensure all categorical columns are of type string
X_train[categorical_columns] = X_train[categorical_columns].astype(str)
test_data[categorical_columns] = test_data[categorical_columns].astype(str)
print(X_train[categorical_columns])

         brand                             model      fuel_type  \
0         Ford                      F-150 Lariat       Gasoline   
1          BMW                             335 i       Gasoline   
2       Jaguar                         XF Luxury       Gasoline   
3          BMW                      X7 xDrive40i         Hybrid   
4      Pontiac                     Firebird Base       Gasoline   
...        ...                               ...            ...   
54268      BMW                      X6 xDrive50i       Gasoline   
54269     Audi                   A4 2.0T Premium  E85 Flex Fuel   
54270  Porsche                         Cayenne S       Gasoline   
54271  Porsche                 911 Carrera 4 GTS       Gasoline   
54272     Audi  A5 Sportback S line Premium Plus         Hybrid   

                            transmission  \
0                           10-Speed A/T   
1                            6-Speed M/T   
2                            6-Speed A/T   
3         Transmiss

In [58]:
# Handle missing values for categorical columns
categorical_imputer = SimpleImputer(strategy='most_frequent')
X_train[categorical_columns] = categorical_imputer.fit_transform(X_train[categorical_columns])
test_data[categorical_columns] = categorical_imputer.transform(test_data[categorical_columns])

In [59]:
# Handle missing values for numerical columns
numerical_imputer = SimpleImputer(strategy='mean')
X_train[numerical_columns] = numerical_imputer.fit_transform(X_train[numerical_columns].astype(int))
test_data[numerical_columns] = numerical_imputer.transform(test_data[numerical_columns].astype(int))
print(X_train[numerical_columns])

       model_year
0          2018.0
1          2007.0
2          2009.0
3          2022.0
4          2001.0
...           ...
54268      2017.0
54269      2015.0
54270      2013.0
54271      2023.0
54272      2021.0

[54273 rows x 1 columns]


In [60]:
# Combine training and test data for label encoding
combined_data = pd.concat([X_train[categorical_columns], test_data[categorical_columns]])
print(combined_data)

           brand                            model      fuel_type  \
0           Ford                     F-150 Lariat       Gasoline   
1            BMW                            335 i       Gasoline   
2         Jaguar                        XF Luxury       Gasoline   
3            BMW                     X7 xDrive40i         Hybrid   
4        Pontiac                    Firebird Base       Gasoline   
...          ...                              ...            ...   
36178        GMC                     Yukon Denali       Gasoline   
36179  Chevrolet  Silverado 1500 Z71 Extended Cab       Gasoline   
36180     Toyota                       Corolla LE       Gasoline   
36181    Lincoln                Navigator Reserve       Gasoline   
36182  Chevrolet                         Tahoe LT  E85 Flex Fuel   

                         transmission                                accident  \
0                        10-Speed A/T                           None reported   
1                    

In [61]:
# Encode categorical variables
label_encoders = {}
for column in categorical_columns:
    label_encoders[column] = LabelEncoder()
    combined_data[column] = label_encoders[column].fit_transform(combined_data[column])

In [62]:
# Separate back the encoded data
X_train[categorical_columns] = combined_data.iloc[:X_train.shape[0], :].values
test_data[categorical_columns] = combined_data.iloc[X_train.shape[0]:, :].values

In [63]:
# Combine encoded categorical columns and numerical columns for scaling
X_train_combined = np.hstack((X_train[categorical_columns].astype(float), X_train[numerical_columns]))
X_test_combined = np.hstack((test_data[categorical_columns].astype(float), test_data[numerical_columns]))
print(X_train_combined)

[[1.400e+01 6.490e+02 2.000e+00 ... 6.300e+01 0.000e+00 2.018e+03]
 [4.000e+00 4.900e+01 2.000e+00 ... 1.200e+01 0.000e+00 2.007e+03]
 [2.100e+01 1.803e+03 2.000e+00 ... 9.000e+00 0.000e+00 2.009e+03]
 ...
 [4.100e+01 4.240e+02 2.000e+00 ... 1.200e+01 0.000e+00 2.013e+03]
 [4.100e+01 1.440e+02 2.000e+00 ... 2.800e+01 0.000e+00 2.023e+03]
 [3.000e+00 1.780e+02 3.000e+00 ... 1.200e+01 0.000e+00 2.021e+03]]


In [64]:
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_combined)

In [65]:
X_test_scaled = scaler.transform(X_test_combined)

In [66]:
# Prepare the data for the LSTM model
n_input = 1
n_features = X_train_scaled.shape[1]

In [67]:
train_generator = TimeseriesGenerator(X_train_scaled, y_train, length=n_input, batch_size=32)
#test_generator = TimeseriesGenerator(X_test_scaled, np.zeros(len(X_test_scaled)), length=n_input, batch_size=32)

# Build the LSTM model
model = Sequential()
model.add(LSTM(30, activation='relu', input_shape=(n_input, n_features)))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

  super().__init__(**kwargs)


In [68]:
# Train the model
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(train_generator, epochs=170, callbacks=[early_stopping])

#model.fit(train_generator, epochs=20)

Epoch 1/170
[1m 118/1696[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 1ms/step - loss: 9087170560.0000

  self._warn_if_super_not_called()


[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - loss: 7879025664.0000
Epoch 2/170
[1m  99/1696[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 2ms/step - loss: 8515620864.0000

  current = self.get_monitor_value(logs)


[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 6736046592.0000
Epoch 3/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 6004651520.0000
Epoch 4/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 7627243008.0000
Epoch 5/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 6573514240.0000
Epoch 6/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 7266580992.0000
Epoch 7/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 5786790912.0000
Epoch 8/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 6147581440.0000
Epoch 9/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 6422343168.0000
Epoch 10/170
[1m1696/1696[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - loss: 5696284672

In [69]:
# Make predictions
#predictions = model.predict(test_generator)
#predictions = predictions.flatten()
X_test_expanded = np.expand_dims(X_test_scaled, axis=1)
predictions = model.predict(X_test_expanded)
predictions = predictions.flatten()


[1m1131/1131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step


In [70]:
print(len(predictions))
print(len(test_data))
print(predictions)
predictions_clean = test_data.dropna()
print(len(test_data))
preds=test_data
#preds=preds[:-1]
print(len(preds))

36183
36183
[38976.03  38768.46  36386.133 ... 38099.504 33706.11  36513.273]
36183
36183


In [71]:
preds=test_data
#preds=preds[:-1]
output = pd.DataFrame({'id': preds['id'], 'predicted_price': predictions})
output = output.iloc[:len(predictions)]
if len(predictions) < len(preds):
    output = output.iloc[:len(predictions)]
    
output.to_csv('predicted_prices_new.csv', index=False)


print("Predictions saved to predicted_prices.csv")

Predictions saved to predicted_prices.csv
