In [1]:
import pandas as pd
import numpy as np 
import optuna

from keras import layers
from keras import models
from keras.losses import mean_absolute_percentage_error

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import  r2_score

import lightgbm as lgb
from nltk.corpus import stopwords as nltk_stopwords

In [2]:
# чтение данных 
train = pd.read_csv('train_pr.csv')
test = pd.read_csv('test_pr.csv')

In [3]:
features_train = train.drop(columns=['sellingprice'])
target_train = train['sellingprice']

In [4]:
# создание pipeline
num_transform = Pipeline(steps=[('imputer', SimpleImputer(strategy = 'constant')), 
                                ('scaler', StandardScaler())])

cat_transform_1 = Pipeline(steps=[('inputer', SimpleImputer(strategy = 'constant')), 
                                  ('onehot', OneHotEncoder(drop = 'first', handle_unknown = 'ignore'))])

cat_transform_2 = Pipeline(steps=[('inputer', SimpleImputer(strategy = 'constant')), 
                                  ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

text_transform = Pipeline(steps=[('CV', TfidfVectorizer(stop_words=list(nltk_stopwords.words('english')), 
                                                        max_features=50))])

In [5]:
# обрабатываемые колонки
num_features = ['condition', 'odometer', 'years', 'test', 'odometer_2']
cat_features = ['make', 'model', 'body', 'transmission', 'state', 'color', 'interior']
text_features = 'lema'

### поиск параметров на lightgbm при помощи optuna

In [6]:
#финальный обработчик для lightgbm
preprocessor = ColumnTransformer(transformers = [
    ('num', num_transform, num_features), 
    ('cat', cat_transform_2, cat_features),
    ('text', text_transform, text_features)
], remainder="passthrough")

In [7]:
#обработанные признаки
train_lgbm = preprocessor.fit_transform(features_train)
test_lgbm = preprocessor.transform(test)

In [8]:
def objective_lgbm(trial):
    max_depth = trial.suggest_int('max_depth', 2, 20)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 0.1, log=True)
    n_estimators = trial.suggest_int('n_estimators', 2000, 5500)
    
    score = cross_val_score(
        lgb.LGBMRegressor(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators), 
        train_lgbm, 
        target_train, 
        cv=3, 
        scoring='neg_mean_absolute_percentage_error', 
        n_jobs=-1).mean()
    
    return score

In [9]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_lgbm, n_trials=15)

[I 2023-08-30 19:07:49,836] A new study created in memory with name: no-name-827a300f-4696-4ca7-9c96-33e6d142ad75
[I 2023-08-30 19:15:29,771] Trial 0 finished with value: -0.27382897362796993 and parameters: {'max_depth': 18, 'learning_rate': 0.0024750904035885472, 'n_estimators': 4867}. Best is trial 0 with value: -0.27382897362796993.
[I 2023-08-30 19:20:45,499] Trial 1 finished with value: -0.9780651962205135 and parameters: {'max_depth': 7, 'learning_rate': 0.00030005532622769864, 'n_estimators': 3327}. Best is trial 0 with value: -0.27382897362796993.
[I 2023-08-30 19:28:58,574] Trial 2 finished with value: -0.35389820231890284 and parameters: {'max_depth': 6, 'learning_rate': 0.001142865072309764, 'n_estimators': 5259}. Best is trial 0 with value: -0.27382897362796993.
[I 2023-08-30 19:32:07,581] Trial 3 finished with value: -1.077341974925589 and parameters: {'max_depth': 12, 'learning_rate': 0.0003788157372829839, 'n_estimators': 2206}. Best is trial 0 with value: -0.2738289736

In [10]:
study.best_params

{'max_depth': 10, 'learning_rate': 0.0739257099781428, 'n_estimators': 2998}

In [13]:
#обучение на лучших параметрах
model = lgb.LGBMRegressor(**study.best_params)
model.fit(train_lgbm, target_train)

In [16]:
answer = pd.read_csv('sample_submission.csv')

In [21]:
#сохранение результатов
answer['sellingprice'] = model.predict(test_lgbm)
answer.to_csv('answer_1.csv', index=False)

### создание нейронной сети

In [22]:
# препроцессор для нейронной сети
preprocessor = ColumnTransformer(transformers = [
    ('num', num_transform, num_features), 
    ('cat', cat_transform_1, cat_features),
    ('text', text_transform, text_features)
], remainder="passthrough")

In [25]:
train_nn = preprocessor.fit_transform(features_train)
test_nn = preprocessor.transform(test)



In [31]:
train_nn = train_nn.toarray()
test_nn = test_nn.toarray()

In [32]:
model = models.Sequential([
    layers.Dense(2048, activation='relu', input_shape=(train_nn.shape[1],)),
    layers.Dense(1024, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(1024, activation='relu'),
    layers.Dense(512, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(512, activation='relu'),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1)
])

In [33]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_10 (Dense)            (None, 2048)              2568192   
                                                                 
 dense_11 (Dense)            (None, 1024)              2098176   
                                                                 
 dropout_2 (Dropout)         (None, 1024)              0         
                                                                 
 dense_12 (Dense)            (None, 1024)              1049600   
                                                                 
 dense_13 (Dense)            (None, 512)               524800    
                                                                 
 batch_normalization_1 (Batc  (None, 512)              2048      
 hNormalization)                                                 
                                                      

In [34]:
model.compile(optimizer='adam', loss=mean_absolute_percentage_error, metrics=['mae'])

In [35]:
his = model.fit(train_nn, target_train, epochs=5, batch_size=64, validation_split=0.2)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [36]:
answer['sellingprice'] = model.predict(test_nn).reshape(-1)
answer.to_csv('answer_2.csv', index=False)

