In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

def preprocess_data(data_path):
    df = pd.read_csv(data_path)
    df.drop(columns=['Unnamed: 0', 'New_Price'], inplace=True)
    df['Mileage'] = df['Mileage'].str.split(expand=True)[0].astype(float)
    df['Engine'] = df['Engine'].str.split(expand=True)[0].astype(float)
    df['Power'] = df['Power'].replace('null bhp', None)
    df['Power'] = df['Power'].str.split(expand=True)[0].astype(float)
    df['Mileage'] = df['Mileage'].fillna(df['Mileage'].mean())
    df['Engine'] = df['Engine'].fillna(df['Engine'].mean())
    df['Power'] = df['Power'].fillna(df['Power'].mean())
    df['Seats'] = df['Seats'].fillna(df['Seats'].mean())
    df['Company'] = df['Name'].str.split(expand=True)[0].str.lower()
    df['Model'] = df['Name'].str.split().str[0:2].str.join(' ').str.lower()
    df.drop(columns=['Name'], inplace=True)
    CatCols = ['Company', 'Model', 'Location', 'Owner_Type', 'Fuel_Type', 'Transmission']
    df = pd.get_dummies(df, columns=CatCols, drop_first=True)
    return df

def train_random_forest(df):
    X = df.drop(columns=['Price'])
    y = df['Price']
    random_forest = RandomForestRegressor(n_estimators=50, max_depth=10, min_samples_split=5)
    random_forest.fit(X, y)
    return random_forest

def predict_car_price(model, sample_features):
    sample_df = pd.DataFrame([sample_features])
    sample_df_processed = preprocess_sample_data(sample_df, processed_df.columns)
    predicted_price = model.predict(sample_df_processed)
    return predicted_price[0]

def preprocess_sample_data(sample_df, column_names):
    processed_column_names = column_names.drop('Price')
    sample_df_processed = pd.get_dummies(sample_df, columns=sample_df.columns)
    sample_df_processed = sample_df_processed.reindex(columns=processed_column_names, fill_value=0)
    return sample_df_processed

if __name__ == "__main__":
    data_path = "C:/Users/ASUS/Desktop/set/train-data.csv"
    processed_df = preprocess_data(data_path)
    model = train_random_forest(processed_df)

    sample_features = {}
    input_labels = ['Company', 'Model', 'Location', 'Owner_Type', 'Fuel_Type', 'Transmission',
                    'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats']
    for label in input_labels:
        sample_features[label] = input(f"{label}: ")

    predicted_price = predict_car_price(model, sample_features)
    # Fiyatı 1 milyonla çarpıp tam sayı kısmını alarak, her üç basamakta bir nokta ekleyerek görüntüleme
    predicted_price *= 1000000
    predicted_price = "{:,.0f}".format(predicted_price)
    print(f"Tahmin Edilen Araba Fiyatı: {predicted_price}")


KeyboardInterrupt: Interrupted by user

In [8]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy import stats


def preprocess_data(data_path):
    df = pd.read_csv(data_path)
    df.drop(columns=['Unnamed: 0', 'New_Price'], inplace=True)
    df['Mileage'] = df['Mileage'].str.split(expand=True)[0].astype(float)
    df['Engine'] = df['Engine'].str.split(expand=True)[0].astype(float)
    df['Power'] = df['Power'].replace('null bhp', None)
    df['Power'] = df['Power'].str.split(expand=True)[0].astype(float)
    df['Mileage'] = df['Mileage'].fillna(df['Mileage'].mean())
    df['Engine'] = df['Engine'].fillna(df['Engine'].mean())
    df['Power'] = df['Power'].fillna(df['Power'].mean())
    df['Seats'] = df['Seats'].fillna(df['Seats'].mean())
    Z_scores = stats.zscore(df["Kilometers_Driven"])
    threshold = 2.5
    outliers = df['Kilometers_Driven'][abs(Z_scores) > threshold]
    df = df[df["Kilometers_Driven"] <= 300000]
    minimum_seats = df["Seats"].min()
    df = df[df['Seats'] != 0]
    df['Company'] = df['Name'].str.split(expand=True)[0].str.lower()
    df['Model'] = df['Name'].str.split().str[0:2].str.join(' ').str.lower()
    df.drop(columns=['Name'], inplace=True)
    new_order = ['Company', 'Model', 'Location', 'Owner_Type', 'Fuel_Type', 'Transmission',
                 'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price']
    df = df.reindex(columns=new_order)
    CatCols = ['Company', 'Model', 'Location', 'Owner_Type', 'Fuel_Type', 'Transmission']
    df = pd.get_dummies(df, columns=CatCols, drop_first=True)
    return df

def train_random_forest(df):
    X = df.drop(columns=['Price'])
    y = df['Price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    random_forest = RandomForestRegressor(n_estimators=200, max_depth=20, min_samples_split=10)
    random_forest.fit(X_train, y_train)
    return random_forest, X_test, y_test

def predict_price(model, sample_df):
    predicted_price = model.predict(sample_df)
    return predicted_price

if __name__ == "__main__":
    data_path = "C:/Users/ASUS/Desktop/set/train-data.csv"
    df = preprocess_data(data_path)
    model, X_test, y_test = train_random_forest(df)
    predictions = predict_price(model, X_test)
    
    mae = mean_absolute_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    rmse = mse ** 0.5
    
    print("MAE (Mean Absolute Error):", mae)
    print("MSE (Mean Squared Error):", mse)
    print("RMSE (Root Mean Squared Error):", rmse)


MAE (Mean Absolute Error): 1.40758056347473
MSE (Mean Squared Error): 9.635950811417509
RMSE (Root Mean Squared Error): 3.104182792848628
