In [1]:
from datetime import datetime
import os
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
# Seed passed as random_state to both train_test_split calls further down.
RAND = 10

In [None]:
# --- Load the freshly scraped listings and normalise their raw text columns ---
file_path = 'csv_data/mashina_kg/'

# Only regular, non-hidden files count as input: os.listdir() also returns
# sub-directories (e.g. Jupyter's .ipynb_checkpoints), which would break both
# pd.read_csv here and the os.remove clean-up at the end of this cell.
# Sorting makes the row order of the concatenated frame deterministic.
file_list = sorted(
    f for f in os.listdir(file_path)
    if not f.startswith('.') and os.path.isfile(os.path.join(file_path, f))
)
if not file_list:
    # pd.concat([]) would only report "No objects to concatenate".
    raise FileNotFoundError(f"No listing files to process in {file_path!r}")

df = pd.concat(
    [pd.read_csv(os.path.join(file_path, f)) for f in file_list],
    ignore_index=True,
)

# regex=True turns this into a substring replacement applied to every string cell.
df = df.replace('Land Rover', 'Land-Rover', regex=True)

# `year` is text; its first space-separated token is the numeric year.
df['year'] = df['year'].str.split(' ').str[0].astype(int)
# Age in whole years relative to the year in which the notebook is executed.
df['age'] = datetime.now().year - df['year']

# Missing or empty mileage is stored as the literal '0 км'.
df['mileage'] = df['mileage'].fillna('0 км').replace('', '0 км')
df = df.drop_duplicates()

# Order matters: rows without a transmission take the value found in
# engine_cap first, and only then are those labels in engine_cap mapped to
# numbers. NOTE(review): presumably such rows had the transmission label
# scraped into the engine_cap column - confirm against the scraper.
df['transmission'] = df['transmission'].fillna(df['engine_cap'])
df['engine_cap'] = (
    df['engine_cap']
    .replace('автомат', 2)
    .replace('вариатор', 1.5)
    .astype(float)
)

# price_usd: drop the first character, trim, remove inner spaces, cast to int.
df['price_usd'] = (
    df['price_usd'].str.slice(1).str.strip()
    .replace(' ', '', regex=True)
    .astype(int)
)
# price_som: drop the ' сом' suffix and inner spaces, cast to int.
df['price_som'] = (
    df['price_som']
    .replace(' сом', '', regex=True)
    .replace(' ', '', regex=True)
    .astype(int)
)
# Put the new batch in front of the accumulated history and overwrite the
# history file with the combined, re-indexed table.
# NOTE(review): rows already present in the history file are not de-duplicated
# against the new batch here - confirm that is intended.
history_csv = 'csv_data/sorted_dfs/df_sorted_upd.csv'
last_sorted = pd.read_csv(history_csv)
df_c = pd.concat([df, last_sorted], ignore_index=True)
df_c.to_csv(history_csv, index=False)
# --- Drop manual-gearbox listings and sparsely populated categories ---
# A value must occur MORE than this many times in its column to be kept.
MIN_ROWS_PER_VALUE = 5

# 'механика' (manual transmission) rows are excluded.
df_c = df_c[df_c["transmission"] != 'механика']

# The filters run one column after another in this order; each count is taken
# on the frame left by the previous filter, so the order is significant.
for col in ["color", "body_type", "age", "engine_cap", "make_model"]:
    counts = df_c[col].value_counts()
    frequent_values = counts[counts > MIN_ROWS_PER_VALUE].index
    df_c = df_c[df_c[col].isin(frequent_values)]

df_c = df_c.reset_index(drop=True)
# Summary statistics per (make_model, year). describe() produces a two-level
# column index, flattened here into "<column>__<statistic>" names.
df_group = df_c.groupby(['make_model', 'year']).describe()
df_group.columns = ['__'.join(pair).strip() for pair in df_group.columns.values]

# Keep the price_usd count and mean for groups whose count exceeds 3, turn the
# group keys back into ordinary columns, and export the table.
price_stats = df_group[['price_usd__count', 'price_usd__mean']]
dfn = (
    price_stats[price_stats['price_usd__count'] > 3]
    .reset_index()
    .rename(columns={'price_usd__count': 'count', 'price_usd__mean': 'mean_price'})
)
dfn.to_csv('csv_data/sorted_dfs/mean_prices.csv', index=False)
# Delete every input file that was read at the top of this cell; by this point
# their rows have been written to the history CSV.
for processed_name in file_list:
    os.remove(os.path.join(file_path, processed_name))

In [11]:
import numpy as np

In [34]:
# --- Build the modelling table from a saved snapshot of the listings ---
# A value must occur MORE than this many times in its column to be kept.
MIN_ROWS_PER_VALUE = 5

df_m = pd.read_csv('csv_data/sorted_dfs/df_sorted_4487.csv')

# '0 км' becomes NaN so those rows are removed by dropna() below; the ' км'
# suffix and inner spaces are stripped from the remaining values.
mileage_text = (
    df_m['mileage']
    .replace('0 км', np.nan)
    .replace(' км', '', regex=True)
    .replace(' ', '', regex=True)
)
# Parse to a real numeric dtype: left as text, the column would be caught by
# the object -> category cast at the end of this cell even though it is fed to
# StandardScaler as a numeric feature.
df_m['mileage'] = pd.to_numeric(mileage_text)
df_m = df_m.dropna()

# 'механика' (manual transmission) rows are excluded.
df_m = df_m[df_m["transmission"] != 'механика']

# The filters run one column after another in this order; each count is taken
# on the frame left by the previous filter, so the order is significant.
for col in ["color", "body_type", "age", "engine_cap", "make_model"]:
    counts = df_m[col].value_counts()
    frequent_values = counts[counts > MIN_ROWS_PER_VALUE].index
    df_m = df_m[df_m[col].isin(frequent_values)]

df_m = df_m.reset_index(drop=True)

# Remaining text columns become pandas categoricals (done after filtering, so
# only values that survived are registered as categories).
cols_cat = df_m.select_dtypes('object').columns
df_m[cols_cat] = df_m[cols_cat].astype('category')

In [35]:
# Target is the USD price; every other column stays in the feature frame.
X = df_m.drop(columns=['price_usd'])
y = df_m['price_usd']

# Hold out 25 % of the rows as the test set (shuffled, seeded with RAND).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=RAND
)

# Set aside a further 16 % of the training rows as a validation set.
X_train_, X_val, y_train_, y_val = train_test_split(
    X_train, y_train, test_size=0.16, random_state=RAND
)

# (features, target) pair built from the validation rows.
eval_set = [(X_val, y_val)]

In [36]:
from sklearn.compose import ColumnTransformer


# Columns to one-hot encode and columns to standardise. Columns named in
# neither list are not forwarded (ColumnTransformer's default remainder).
categorical_features = [
    'make_model', 'engine_type', 'transmission',
    'wheel_pos', 'body_type', 'color',
]
numeric_features = ['age', 'engine_cap', 'mileage']

# NOTE(review): the encoder keeps its default handling of categories that were
# not seen during fit (an error at transform time) - confirm this is acceptable
# for make_model values that are rare in the data.
transformers_list = [
    ('encode', OneHotEncoder(dtype='int', drop='first'), categorical_features),
    ('scale', StandardScaler(), numeric_features),
]

column_transformer = ColumnTransformer(transformers_list)

In [37]:
# Preprocess with the column transformer, then fit a linear regression on the
# training split (X_train / y_train) and predict the test rows.
pipe = Pipeline(steps=[
    ('columnTransformer', column_transformer),
    ('lr', LinearRegression()),
])
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [38]:
# Mean absolute error of the fitted pipeline on the test split, in price_usd units.
mean_absolute_error(y_test, y_pred)

926.3271583161986

In [21]:
# Actual price_usd values of the test split.
y_test

1400    10400
2547    12500
813      7000
1989    13500
2499     8300
        ...  
1764    11900
384     11800
1112     4614
281      6800
1688     8800
Name: price_usd, Length: 668, dtype: int64

In [22]:
# Pipeline predictions for X_test, in the same row order as y_test.
y_pred

array([10369.14482781, 10717.26313152,  7001.28299474, 12180.81695207,
        9072.41992455, 14210.01039208,  8574.83625138, 13635.87384182,
       11600.26973283, 12531.448183  ,  6172.88989304, 10852.65902213,
       14434.62101908,  8070.11502513, 11928.74291299,  9982.58774879,
        9130.37131022,  9799.79343396, 13855.56198553,  8365.23485012,
        9439.54030944,  5684.9586792 ,  9049.42226231, 12662.10209736,
       10165.97800349,  8180.94813023, 13713.75071107, 10470.26255816,
       10117.12540501, 12908.54347668,  9276.92869895,  9586.63003783,
       11926.63323368, 10120.74074973, 11972.178356  , 10703.96493786,
        9409.28977061, 10370.15257213, 12330.40070705, 13950.39222146,
       13113.35422512,  7700.33878537, 12563.97745908,  8904.02446276,
        8194.55219588, 11918.78612131,  7308.52373529, 14172.50509382,
       12636.86574437,  2986.99753389,  6901.80238866, 13063.19250296,
        7128.7181131 , 11269.21902321,  8081.41845094, 13981.96600564,
      