In [1]:
# Connect database and pre-processing data

# Import necessary libraries
from pymongo import MongoClient
import pandas as pd
import urllib.parse
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
import mlflow
import mlflow.sklearn

# Database connection setup
username = "alextran21211"
password = urllib.parse.quote("database12345")
host = "cluster0.zsych.mongodb.net"
url = f"mongodb+srv://{username}:{password}@{host}/?retryWrites=true&w=majority"

# Connect to the MongoDB database and retrieve data
client = MongoClient(url)
db = client['group_5_project']
collection = db["co2_emission"]

documents = collection.find()

# Data extraction and preprocessing
data = []
for doc in documents:
    for country, country_data in doc.items():
        if country == "_id":
            continue  
        iso_code = country_data.get('iso_code')
        for entry in country_data.get('data', []):
            year = entry.get('year')
            population = entry.get('population')
            co2 = entry.get('cumulative_luc_co2')
           
            data.append({
                'Country': country,
                'ISO_Code': iso_code,
                'Year': year,
                'Population': population,
                'CO2': co2
            })

# Create DataFrame
df = pd.DataFrame(data)

# Fill missing values
df['Population'] = df['Population'].fillna(df['Population'].mean()) 
df['CO2'] = df['CO2'].fillna(df['CO2'].mean())  
df['CO2_per_capita'] = df['CO2'] / df['Population']
df.drop_duplicates(inplace=True)

# Drop 'ISO_Code' as it's not needed for modeling
df = df.drop(columns=['ISO_Code'])

# Print to verify data
print(df.head())

       Country  Year  Population        CO2  CO2_per_capita
0  Afghanistan  1850   3752993.0   2.979601    7.939267e-07
1  Afghanistan  1851   3767956.0   5.981443    1.587450e-06
2  Afghanistan  1852   3783940.0   9.002998    2.379266e-06
3  Afghanistan  1853   3800954.0  12.041333    3.167977e-06
4  Afghanistan  1854   3818038.0  15.094068    3.953357e-06


In [None]:
# start mlflow ui in terminal
# type: mflow ui

In [4]:
# set mlflow ui
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

In [5]:
# ML models with MLflow tracking

# Define features and target
X = df[['Year', 'Population']]
y = df['CO2']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model configurations
model_configs = {
    "LinearRegression": {
        "model": LinearRegression,
        "param_name": "fit_intercept",
        "param_values": [True, False, None],  # Simulated variations
        "additional_params": {}
    },
    "RandomForestRegressor": {
        "model": RandomForestRegressor,
        "param_name": "n_estimators",
        "param_values": [50, 100, 200],
        "additional_params": {"random_state": 42}
    },
    "KNeighborsRegressor": {
        "model": KNeighborsRegressor,
        "param_name": "n_neighbors",
        "param_values": [3, 5, 7],
        "additional_params": {}
    }
}

# Start MLflow autologging
mlflow.sklearn.autolog()

# Model training and MLflow tracking
for model_name, config in model_configs.items():
    param_name = config["param_name"]
    param_values = config["param_values"]
    additional_params = config["additional_params"]
    
    for param_value in param_values:
        with mlflow.start_run(run_name=f"{model_name}_{param_name}={param_value}"):
            model_params = {param_name: param_value} if param_value is not None else {}
            model = config["model"](**model_params, **additional_params)
            
            # Train model
            model.fit(X_train, y_train)
            
            # Predict and evaluate
            predictions = model.predict(X_test)
            mse = mean_squared_error(y_test, predictions)
            r2 = r2_score(y_test, predictions)
            
            # Log metrics
            mlflow.log_metric("mse", mse)
            mlflow.log_metric("r2_score", r2)

            print(f"{model_name} with {param_name}={param_value} | MSE: {mse:.4f} | R^2 Score: {r2:.4f}")

# Check MLflow UI for experiment tracking results


2024/11/05 16:10:54 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression_fit_intercept=True at: http://127.0.0.1:5000/#/experiments/0/runs/20fd90bd4faf4a5fb8b1963646ac135d.
2024/11/05 16:10:54 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


LinearRegression with fit_intercept=True | MSE: 390295237.8799 | R^2 Score: 0.6900


2024/11/05 16:10:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression_fit_intercept=False at: http://127.0.0.1:5000/#/experiments/0/runs/f902283685c74d5ca31feb81323c9882.
2024/11/05 16:10:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


LinearRegression with fit_intercept=False | MSE: 394582169.7664 | R^2 Score: 0.6866


2024/11/05 16:10:57 INFO mlflow.tracking._tracking_service.client: 🏃 View run LinearRegression_fit_intercept=None at: http://127.0.0.1:5000/#/experiments/0/runs/0d83667bb0074016b896e825f423789b.
2024/11/05 16:10:57 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


LinearRegression with fit_intercept=None | MSE: 390295237.8799 | R^2 Score: 0.6900


2024/11/05 16:11:03 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestRegressor_n_estimators=50 at: http://127.0.0.1:5000/#/experiments/0/runs/7600faff5ae14625999405edb2c3638d.
2024/11/05 16:11:03 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


RandomForestRegressor with n_estimators=50 | MSE: 161059678.0350 | R^2 Score: 0.8721


2024/11/05 16:11:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestRegressor_n_estimators=100 at: http://127.0.0.1:5000/#/experiments/0/runs/7d2f66b03c714e9ba5c6360282acdb63.
2024/11/05 16:11:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


RandomForestRegressor with n_estimators=100 | MSE: 158126532.6932 | R^2 Score: 0.8744


2024/11/05 16:11:33 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForestRegressor_n_estimators=200 at: http://127.0.0.1:5000/#/experiments/0/runs/276ae900bedc4c069b29f4e5762bc5b0.
2024/11/05 16:11:33 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


RandomForestRegressor with n_estimators=200 | MSE: 156577394.8043 | R^2 Score: 0.8756


2024/11/05 16:11:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNeighborsRegressor_n_neighbors=3 at: http://127.0.0.1:5000/#/experiments/0/runs/9bf392aec8964194ad25ccb020c06b4a.
2024/11/05 16:11:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


KNeighborsRegressor with n_neighbors=3 | MSE: 521494662.0817 | R^2 Score: 0.5857


2024/11/05 16:11:37 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNeighborsRegressor_n_neighbors=5 at: http://127.0.0.1:5000/#/experiments/0/runs/5881b1ca879a441e83c0ddbae53edd1e.
2024/11/05 16:11:37 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


KNeighborsRegressor with n_neighbors=5 | MSE: 451813521.0611 | R^2 Score: 0.6411


2024/11/05 16:11:39 INFO mlflow.tracking._tracking_service.client: 🏃 View run KNeighborsRegressor_n_neighbors=7 at: http://127.0.0.1:5000/#/experiments/0/runs/af0d7563b33b4777a5829378da866fbb.
2024/11/05 16:11:39 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


KNeighborsRegressor with n_neighbors=7 | MSE: 420598037.1949 | R^2 Score: 0.6659
