In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
# Report the visible GPUs and enable on-demand memory allocation on each of them.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

# Memory growth has to be configured identically on every GPU, so loop over
# all of them instead of touching only the first device.
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Raised when the GPUs were already initialized (e.g. the cell is re-run
        # in a live kernel); the existing setting stays in effect.
        print("Could not enable GPU memory growth:", err)

In [None]:
# Read the training and test splits (train first, then test) from the content directory
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('content/train.csv', 'content/test.csv')
)

In [None]:
# Pull the regression target out of the training frame; everything else is a feature
y_train = train_data['price']
X_train = train_data.drop(labels=['price'], axis='columns')

In [None]:
# Column groups that drive the preprocessing steps.
# NOTE(review): 'engine' and 'milage' are listed as categorical, so they are
# label-encoded rather than treated as numbers — confirm this is intended.
numerical_columns = ['model_year']
categorical_columns = [
    'brand',
    'model',
    'fuel_type',
    'transmission',
    'accident',
    'engine',
    'milage',
    'ext_col',
    'int_col',
    'clean_title',
]

In [None]:
# Ensure all categorical columns hold strings while keeping missing entries missing.
# A plain astype(str) turns NaN into the literal text 'nan', which hides the gaps
# from the most-frequent imputer; masking the originally-missing cells back to NaN
# keeps them visible as missing values.
for frame in (X_train, test_data):
    missing_mask = frame[categorical_columns].isna()
    frame[categorical_columns] = frame[categorical_columns].astype(str).mask(missing_mask)
print(X_train[categorical_columns])

In [None]:
# Replace missing (NaN) categorical entries with each column's most frequent value.
# The modes are learned on the training features and reused unchanged for the test set.
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_categoricals_filled = categorical_imputer.fit_transform(X_train[categorical_columns])
test_categoricals_filled = categorical_imputer.transform(test_data[categorical_columns])
X_train[categorical_columns] = train_categoricals_filled
test_data[categorical_columns] = test_categoricals_filled

In [None]:
# Handle missing values for numerical columns by filling them with the training mean.
# The columns are cast to float (not int): an integer cast raises as soon as a NaN
# is present, which would make the imputation step impossible to reach.
numerical_imputer = SimpleImputer(strategy='mean')
X_train[numerical_columns] = numerical_imputer.fit_transform(X_train[numerical_columns].astype(float))
# Reuse the training-set means for the test set
test_data[numerical_columns] = numerical_imputer.transform(test_data[numerical_columns].astype(float))
print(X_train[numerical_columns])

In [None]:
# Stack the train and test categoricals (train rows first) so that each label
# encoder is fitted on every value occurring in either split
combined_data = pd.concat(
    [frame[categorical_columns] for frame in (X_train, test_data)]
)
print(combined_data)

In [None]:
# Integer-encode each categorical column, keeping its fitted encoder by column name
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    combined_data[column] = encoder.fit_transform(combined_data[column])

In [None]:
# Split the encoded block back by position: the first len(X_train) rows belong
# to the training set, the remainder to the test set
n_train_rows = len(X_train)
X_train[categorical_columns] = combined_data.iloc[:n_train_rows].to_numpy()
test_data[categorical_columns] = combined_data.iloc[n_train_rows:].to_numpy()

In [None]:
# Assemble the feature matrices: encoded categoricals (cast to float) on the left,
# numerical columns on the right — same column layout for train and test
X_train_combined, X_test_combined = (
    np.hstack((frame[categorical_columns].astype(float), frame[numerical_columns]))
    for frame in (X_train, test_data)
)
print(X_train_combined)

In [None]:
# Standardize the features; the scaling statistics come from the training matrix
scaler = StandardScaler()
scaler.fit(X_train_combined)
X_train_scaled = scaler.transform(X_train_combined)

In [None]:
# Scale the test matrix with the statistics already fitted on the training matrix
X_test_scaled = scaler.transform(X=X_test_combined)

In [None]:
# LSTM input dimensions: one timestep per sample, one feature per scaled column
n_input, n_features = 1, X_train_scaled.shape[1]

In [None]:
# Build the training set for the LSTM.
# Every row is an independent sample, so each one becomes a length-1 sequence
# paired with its OWN price. TimeseriesGenerator(length=1) would instead pair the
# features of row i-1 with the target of row i (and drop one sample), which
# misaligns features and prices on this non-sequential data.
X_train_seq = np.expand_dims(X_train_scaled, axis=1).astype('float32')  # (rows, 1, n_features), i.e. n_input == 1
y_train_values = np.asarray(y_train, dtype='float32')
# The name train_generator is kept because the training step passes it to model.fit
train_generator = tf.data.Dataset.from_tensor_slices((X_train_seq, y_train_values)).batch(32)

# Build the LSTM model: a single LSTM layer, dropout for regularization, and one
# linear output unit for the price regression (trained with mean squared error)
model = Sequential()
model.add(LSTM(30, activation='relu', input_shape=(n_input, n_features)))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

In [None]:
# Train the model with early stopping.
# No validation data is passed to fit(), so a 'val_loss' metric is never produced;
# monitoring it would leave early stopping (and best-weight restoring) inactive.
# The training loss is monitored instead. To stop on held-out performance, pass
# validation_data to fit() and switch the monitor back to 'val_loss'.
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
history = model.fit(train_generator, epochs=170, callbacks=[early_stopping])

In [None]:
# Make predictions: insert a length-1 time axis so the test matrix has the
# (rows, 1, features) layout, then flatten the model output to a 1-D array
X_test_expanded = np.expand_dims(X_test_scaled, 1)
predictions = model.predict(X_test_expanded).flatten()


In [None]:
# Sanity checks: number of predictions vs. number of test rows, then the raw predictions
print(len(predictions), len(test_data), predictions, sep='\n')
# dropna() returns a new frame, so test_data itself keeps all of its rows
predictions_clean = test_data.dropna()
preds = test_data
print(len(test_data), len(preds), sep='\n')

In [None]:
# Write one predicted price per test row, keyed by the test set's 'id' column.
# A length mismatch would otherwise surface as an opaque pandas constructor error
# (truncating the frame afterwards cannot help), so check it explicitly first.
if len(predictions) != len(test_data):
    raise ValueError(
        f"Got {len(predictions)} predictions for {len(test_data)} test rows; "
        "they must match one-to-one."
    )

output = pd.DataFrame({'id': test_data['id'], 'predicted_price': predictions})
output.to_csv('predicted_prices.csv', index=False)

print("Predictions saved to predicted_prices.csv")