In [None]:
!pip install --upgrade numpy
!pip install --force-reinstall catboost


Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is i

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor, BayesianRidge, PassiveAggressiveRegressor, TheilSenRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor, VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

# --- Load data ---------------------------------------------------------------
# Combined 100 m segment dataset; the absolute /content path is environment-specific.
data_path = "/content/03_NewSegmentBasedData_100m_Combined/CombinedDataWithNewSegments_100m.csv"
df = pd.read_csv(data_path)

# Alternative feature set that additionally includes the measured torque:
#features = ['segment_length', 'slope', 'avg_vehicle_speed', 'avg_Acceleration', 'avg_Total_Mass', 'avg_Torque_Measured']
features = ['segment_length','slope', 'avg_vehicle_speed', 'avg_Acceleration', 'avg_Total_Mass']
target = 'Total_Energy_Consumption'

X = df[features]
y = df[target]

# 80/20 hold-out split with a fixed seed so every model sees the same rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Candidate regressors ----------------------------------------------------
# Keys are the display names used in the results table; values are unfitted
# estimators (fitting happens in the training loop over `models`).
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0, random_state=42),
    "Lasso Regression": Lasso(alpha=0.1, random_state=42),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Extra Trees": ExtraTreesRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "AdaBoost": AdaBoostRegressor(random_state=42),
    "Support Vector Regressor (SVR)": make_pipeline(StandardScaler(), SVR()),
    # NOTE(review): KNN, the MLP and the Gaussian Process are also scale-sensitive
    # but are fitted on raw features here - consider wrapping them like SVR.
    "K-Nearest Neighbors (KNN)": KNeighborsRegressor(n_neighbors=5),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
    # Standardise before the cubic expansion: on raw features the expanded design
    # matrix is badly conditioned (the Ridge solver emitted a linalg.solve warning).
    # The final step is still named 'ridge'.
    "Polynomial Regression (Degree 3)": make_pipeline(
        StandardScaler(), PolynomialFeatures(degree=3), Ridge()
    ),
    # Standardise so lbfgs converges within the default iteration budget (on raw
    # features it stopped at the iteration limit and recommended scaling the data).
    "Huber Regressor": make_pipeline(StandardScaler(), HuberRegressor()),
    "Theil-Sen Regressor": TheilSenRegressor(random_state=42),
    "Passive Aggressive Regressor": PassiveAggressiveRegressor(random_state=42),
    "Bayesian Ridge": BayesianRidge(),
    "Gaussian Process": GaussianProcessRegressor(),
    "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(100, 50), random_state=42, max_iter=1000),
    "Stacking Regressor": StackingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(random_state=42)),
            ('gb', GradientBoostingRegressor(random_state=42))
        ],
        final_estimator=Ridge()
    ),
    "Voting Regressor": VotingRegressor(
        estimators=[
            ('rf', RandomForestRegressor(random_state=42)),
            ('xgb', XGBRegressor(random_state=42)),
            ('lgbm', LGBMRegressor(random_state=42))
        ]
    )
}

# Model used for the downstream prediction / export cells. This is the same
# object as models["CatBoost"], so it is only fitted once the training loop over
# `models` has run. NOTE(review): the choice is hard-coded before any evaluation.
best_model = models["CatBoost"]



In [None]:
results = []
# Labels for each entry of a model's weight vector, keyed by model name. Kept
# outside `results` so the results table keeps its original columns.
weight_labels = {}

for name, model in models.items():
    # Fit on the training split, predict on the held-out split
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Performance metrics
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)

    # Collect coefficients / importances for the models that are inspected here;
    # every other model (e.g. SVR) gets no weights.
    weights = None
    labels = list(features)
    if name == "Linear Regression" or "Polynomial Regression" in name:
        if hasattr(model, "named_steps"):  # model is a Pipeline
            steps = model.named_steps
            regressor = steps['ridge'] if 'ridge' in steps else steps['linearregression']
            weights = regressor.coef_
            # The final regressor is fitted on the *transformed* columns (bias,
            # powers and interaction terms), so its coefficients must be labelled
            # with the transformer output names, not with the raw feature list.
            labels = list(model[:-1].get_feature_names_out(features))
        else:
            weights = model.coef_
    elif name in ["Random Forest", "Extra Trees", "XGBoost"]:
        weights = model.feature_importances_

    if weights is not None:
        weight_labels[name] = labels

    # Record the results for this model
    results.append({"Model": name, "R^2 Score": r2, "MAE": mae, "RMSE": rmse, "Weights": weights})

# Results table, best R^2 first
results_df = pd.DataFrame(results).sort_values(by="R^2 Score", ascending=False)

# Print each collected weight next to the column it actually belongs to
for result in results:
    if result["Weights"] is not None:
        print(f"\n{result['Model']} - Feature Weights:")
        for feature, weight in zip(weight_labels[result["Model"]], result["Weights"]):
            print(f"  {feature}: {weight}")

# Performance summary (weights omitted)
print(results_df.drop(columns=["Weights"]))


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 5
[LightGBM] [Info] Start training from score 11.632347


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000040 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 5
[LightGBM] [Info] Start training from score 11.632347

Linear Regression - Feature Weights:
  segment_length: -0.1822509183005963
  slope: 2.7312114719599934
  avg_vehicle_speed: 0.04258689007967753
  avg_Acceleration: 5.094333746829139
  avg_Total_Mass: 0.009959902576865964

Random Forest - Feature Weights:
  segment_length: 0.045900151017446524
  slope: 0.6465693071400667
  avg_vehicle_speed: 0.01831517501371602
  avg_Acceleration: 0.24969275043007574
  avg_Total_Mass: 0.03952261639869505

Extra Trees - Feature Weights:
  segment_length: 0.06957369477300959
  slope: 0.5941694316638949
  avg_vehicle_speed: 0.027634074155114634
  avg_Acceleration: 0.2505926750356008
  avg_Total_Mass: 0.058030124372380185



In [None]:
import os

# Folder containing the CSV files to score
new_data_folder = "03_NewSegmentBasedData_100m_Combined"

# Folder that receives the CSV files with predictions appended
output_folder = "04_PredictedData_100m"

# Create the output folder if needed; exist_ok replaces the separate
# exists()-then-create check.
os.makedirs(output_folder, exist_ok=True)

# Score every CSV in the input folder. sorted() makes the processing and log
# order deterministic (os.listdir returns entries in arbitrary order).
for filename in sorted(os.listdir(new_data_folder)):
    if not filename.endswith(".csv"):
        continue

    file_path = os.path.join(new_data_folder, filename)
    df_new = pd.read_csv(file_path)

    # Skip files that do not provide every feature column
    missing_features = set(features) - set(df_new.columns)
    if missing_features:
        print(f"{filename} dosyasında eksik özellikler var: {missing_features}")
        continue

    # Select the feature columns in the order stored in `features`
    X_new = df_new[features]

    # Predict with the selected model (it must already have been fitted)
    y_pred_new = best_model.predict(X_new)

    # Append the predictions as a new column
    df_new['Predicted_Total_Energy_Consumption'] = y_pred_new

    # Write the augmented frame to the output folder as predicted_<original name>
    output_path = os.path.join(output_folder, f"predicted_{filename}")
    df_new.to_csv(output_path, index=False)

    print(f"{filename} dosyası işlendi ve tahminler eklendi.")



CombinedDataWithNewSegments_100m.csv dosyası işlendi ve tahminler eklendi.


In [None]:
import joblib
import json

# Persist the selected model to disk
joblib.dump(best_model, "best_model.pkl")
print("Model başarıyla 'best_model.pkl' olarak kaydedildi.")

# Persist the feature list as JSON (per the original note: intended for
# validating input on the Flask side)
with open("features.json", "w") as features_file:
    features_file.write(json.dumps(features))
print("Kullanılan feature listesi 'features.json' olarak kaydedildi.")


Model başarıyla 'best_model.pkl' olarak kaydedildi.
Kullanılan feature listesi 'features.json' olarak kaydedildi.
