In [67]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
import warnings
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor


In [68]:
data = pd.read_excel(r"C:\Users\baran\OneDrive\Masaüstü\Zeynep Baran\Veri Toplama\SUN CREAM-ENCODED.xlsx")
df = pd.DataFrame(data)
rmse_values=[]


In [69]:
def xgboost_1(df):
    selected_columns = df[['TYPE','PRODUCT QUANTITY','BRAND',"SPF","COLOR"]]
    X = selected_columns
    y = df['PRICE']


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # model oluştur
    xgb_model = XGBRegressor(n_estimators=150, learning_rate=0.1, max_depth=5)
    xgb_model.fit(X_train_scaled, y_train)

    y_pred = xgb_model.predict(X_test_scaled)

    # Performans
    mse = mean_squared_error(y_test, y_pred)
    rmse = (mse)**0.5
    r2 = r2_score(y_test, y_pred)
    print(rmse)
    return rmse,xgb_model


In [70]:
xgboost_1(df)

327.7413157547584


(327.7413157547584,
 XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=150, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...))

In [71]:
def decision_tree(df):
    X = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    y = df['PRICE']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # scale yap
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # model oluştur
    model = DecisionTreeRegressor()
    model.fit(X_train_scaled, y_train)

    # Train seti için R-kare
    y_train_pred = model.predict(X_train_scaled)
    r2_train = r2_score(y_train, y_train_pred)

    # Test seti için R-kare
    y_test_pred = model.predict(X_test_scaled)
    r2_test = r2_score(y_test, y_test_pred)

    # performans
    mse = mean_squared_error(y_test, y_test_pred)
    print(np.sqrt(mse))
    return np.sqrt(mse),model


In [72]:
def granding_boosting(df):
    selected_columns = df[['TYPE', 'COLOR', 'PRODUCT QUANTITY',"SPF","BRAND"]]
    X = selected_columns
    y = df['PRICE']

    #test eğitim olarak ayır
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # model oluştur
    model = GradientBoostingClassifier(random_state=42)

    # Parametre uzayını belirle
    param_grid = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5],
    }

    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=skf)

    # GridSearchCV'yi eğit
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid_search.fit(X_train, y_train)

    # En iyi parametreleri ve modeli al
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # tahmin et
    y_pred = best_model.predict(X_test)
    print(np.sqrt(mean_squared_error(y_test,y_pred)))
    return np.sqrt(mean_squared_error(y_test,y_pred)),best_model
    

In [73]:
def knn(df):
    selected_columns = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    X = selected_columns
    y = df['PRICE']

    # Veriyi eğitim ve test setlerine ayırın
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Veriyi ölçeklendirme (scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # KNeighborsRegressor modelini oluştur
    knn_model = KNeighborsRegressor(n_neighbors=5)  # n_neighbors, komşuluk sayısını belirten bir parametredir

    # KNeighborsRegressor modelini eğit
    knn_model.fit(X_train_scaled, y_train)

    # Test seti üzerinde tahmin yap
    y_pred = knn_model.predict(X_test_scaled)

    # performans
    mse = mean_squared_error(y_test, y_pred)
    rmse = (mse)**0.5
    print(rmse)
    return rmse,knn_model

In [74]:
def lightgbm(df):
    # Belirli sütunları seç
    selected_columns = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    X = selected_columns
    y = df['PRICE']

    # test eğitim verilerini ayır
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # moel oluştur
    lgbm_model = LGBMRegressor(boosting_type='gbdt', num_leaves=31, learning_rate=0.05)

    # modeli eğit
    lgbm_model.fit(X_train_scaled, y_train)

    # tahmin et
    y_pred = lgbm_model.predict(X_test_scaled)

    # performans
    mse = mean_squared_error(y_test, y_pred)
    rmse = (mse)**0.5
    r2 = r2_score(y_test, y_pred)
    print(rmse)
    return rmse,lgbm_model

In [75]:
lightgbm(df)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000051 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 412, number of used features: 4
[LightGBM] [Info] Start training from score 396.966019
319.8456874411688


(319.8456874411688, LGBMRegressor(learning_rate=0.05))

In [76]:
def lineer_reg(df):
    # Belirli sütunları seç
    selected_columns = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    X = selected_columns
    y = df['PRICE']

    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    #model oluştur
    linear_model = LinearRegression()

    linear_model.fit(X_train_scaled, y_train)

    #tahmin yap
    y_pred = linear_model.predict(X_test_scaled)

    # performans
    mse = mean_squared_error(y_test, y_pred)
    rmse = (mse) ** 0.5
    r2 = r2_score(y_test, y_pred)
    print(rmse)
    return rmse, linear_model

In [77]:
def random_forest(df):
    def evaluate_model(y_true, y_pred):
        # Precision (Hassasiyet) ve Recall (Duyarlılık) değerlerini hesapla
        precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
        recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)

        return precision, recall


    # değişkenleri belirle
    X = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    y = df['PRICE']

    # veriyi ayır
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # model oluşturma
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

    # train
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)

    #performans
    accuracy = accuracy_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(np.sqrt(mse))
    return np.sqrt(mse),rf_model

In [78]:
def svr(df):
    selected_columns = df[['TYPE', 'COLOR', 'BRAND',"SPF","PRODUCT QUANTITY"]]
    X = selected_columns
    y = df['PRICE']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # (scaling)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

  
    svr_model = SVR(kernel='linear', C=1.0)


    svr_model.fit(X_train_scaled, y_train)

   
    y_pred = svr_model.predict(X_test_scaled)

    mse = mean_squared_error(y_test, y_pred)
    rmse = (mse)**0.5
    r2 = r2_score(y_test, y_pred)
    print(rmse)
    return rmse,svr_model

In [79]:
svr(df)

355.42354963393694


(355.42354963393694, SVR(kernel='linear'))

In [80]:
random_forest(df)

367.106026852538


(367.106026852538, RandomForestClassifier(random_state=42))

In [81]:
lineer_reg(df)

348.86923919003436


(348.86923919003436, LinearRegression())

In [82]:
results={"xgboost_1":xgboost_1(df),"decision_tree":decision_tree(df),"granding_boosting":granding_boosting(df),"knn":knn(df),"lightgbm":lightgbm(df),"lineer_reg":lineer_reg(df),"random_forest":random_forest(df),"svr":svr(df)
}

327.7413157547584
323.0629532217036
362.51903197444585
325.0355589210306
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 30
[LightGBM] [Info] Number of data points in the train set: 412, number of used features: 4
[LightGBM] [Info] Start training from score 396.966019
319.8456874411688
348.86923919003436
367.106026852538
355.42354963393694


In [83]:
for keys,values in results.items():
    print(keys)
    print(values[0])
best_model_name = min(results, key=lambda k: results[k][0])
best_model_rmse, best_model = results[best_model_name]
print(best_model_name," ",best_model_rmse)



xgboost_1
327.7413157547584
decision_tree
323.0629532217036
granding_boosting
362.51903197444585
knn
325.0355589210306
lightgbm
319.8456874411688
lineer_reg
348.86923919003436
random_forest
367.106026852538
svr
355.42354963393694
lightgbm   319.8456874411688


In [84]:
with open(f'best_model_{best_model_name}.pkl', 'wb') as file:
    pickle.dump(best_model, file)