Dataset Making

In [10]:
import pandas as pd
import numpy as np
import pickle

# Define the number of samples
num_samples = 1000

# Define the possible values for categorical features
offset_methods = ['Afforestation', 'Renewable Energy', 'Energy Efficiency', 'Reforestation']
project_locations = ['India', 'USA', 'China', 'Brazil']
verification_statuses = ['Verified', 'Pending', 'Rejected']
technologies_used = ['Solar', 'Wind', 'Hydro', 'Biomass']

# Generate random data
np.random.seed(42)
data = {
    'OffsetMethod': np.random.choice(offset_methods, num_samples),
    'ProjectLocation': np.random.choice(project_locations, num_samples),
    'VerificationStatus': np.random.choice(verification_statuses, num_samples),
    'TechnologyUsed': np.random.choice(technologies_used, num_samples),
    'EmissionReduction': np.random.uniform(100, 10000, num_samples),
    'ProjectSize': np.random.uniform(1, 1000, num_samples)
}

# Calculate the carbon credit price based on a linear combination of the factors
# Coefficients are assumed for demonstration purposes and should be adjusted based on domain knowledge
coefficients = {
    'EmissionReduction': 0.005,
    'ProjectSize': 0.01,
    'OffsetMethod': {'Afforestation': 10, 'Renewable Energy': 20, 'Energy Efficiency': 15, 'Reforestation': 12},
    'ProjectLocation': {'India': 5, 'USA': 10, 'China': 8, 'Brazil': 6},
    'VerificationStatus': {'Verified': 10, 'Pending': 5, 'Rejected': -5},
    'TechnologyUsed': {'Solar': 8, 'Wind': 7, 'Hydro': 6, 'Biomass': 5}
}

# Apply the coefficients to calculate the carbon credit price
carbon_credit_price = (
    coefficients['EmissionReduction'] * data['EmissionReduction'] +
    coefficients['ProjectSize'] * data['ProjectSize'] +
    [coefficients['OffsetMethod'][method] for method in data['OffsetMethod']] +
    [coefficients['ProjectLocation'][location] for location in data['ProjectLocation']] +
    [coefficients['VerificationStatus'][status] for status in data['VerificationStatus']] +
    [coefficients['TechnologyUsed'][tech] for tech in data['TechnologyUsed']]
)

data['CarbonCreditPrice'] = carbon_credit_price

# Create a DataFrame
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('carbon_credit_data_combined.csv', index=False)

print("Dataset generated and saved as 'carbon_credit_data_combined.csv'")


Dataset generated and saved as 'carbon_credit_data_combined.csv'


Model Training

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Read back the synthetic dataset written by the generation step
df = pd.read_csv('carbon_credit_data_combined.csv')

# The price column is the regression target; all remaining columns are inputs
y = df['CarbonCreditPrice']
X = df.drop(columns=['CarbonCreditPrice'])

# Columns holding string categories; every other column in X is treated as numeric
categorical_cols = ['OffsetMethod', 'ProjectLocation', 'VerificationStatus', 'TechnologyUsed']
numerical_cols = [name for name in X.columns if name not in categorical_cols]

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Per-column-type preprocessing: standardise numeric columns and one-hot encode
# categorical ones. Categories unseen during fitting are ignored at predict time.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest regressor with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the regressor so both are fitted and saved together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit on the training rows, then score on the held-out rows
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# Persist the whole fitted pipeline (preprocessing included)
joblib.dump(pipeline, 'carbon_credit_price_model.pkl')

print("Model trained and saved as 'carbon_credit_price_model.pkl'")


Mean Squared Error: 8.146227519967926
R^2 Score: 0.9703614118663575
Model trained and saved as 'carbon_credit_price_model.pkl'


Predictions

In [12]:
import joblib
import pandas as pd

# Restore the fitted pipeline saved by the training step.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
MODEL_PATH = 'carbon_credit_price_model.pkl'
model = joblib.load(MODEL_PATH)

# Define a function to predict carbon credit price based on user inputs
def predict_carbon_credit_price(offset_method, project_location, verification_status, technology_used, emission_reduction, project_size, estimator=None):
    """Predict the carbon credit price for a single project.

    Parameters
    ----------
    offset_method, project_location, verification_status, technology_used
        Category values for the corresponding feature columns.
    emission_reduction, project_size
        Numeric feature values.
    estimator : optional
        Any object exposing ``predict(DataFrame)``. When omitted, the
        notebook-level ``model`` is used, which keeps existing call sites
        working while letting callers (and tests) supply the model explicitly
        instead of depending on earlier cells having run.

    Returns
    -------
    The first element of the array-like returned by ``predict`` for the
    one-row input frame.
    """
    # Resolve the predictor lazily so the global is only touched when needed
    predictor = model if estimator is None else estimator

    # One-row frame; column names must match those used when the model was fitted
    features = pd.DataFrame({
        'OffsetMethod': [offset_method],
        'ProjectLocation': [project_location],
        'VerificationStatus': [verification_status],
        'TechnologyUsed': [technology_used],
        'EmissionReduction': [emission_reduction],
        'ProjectSize': [project_size],
    })

    return predictor.predict(features)[0]

# Example usage: inputs for one sample project
offset_method = 'Afforestation'
project_location = 'India'
verification_status = 'Verified'
technology_used = 'Solar'
emission_reduction = 5000
project_size = 100

# Multiplier applied to the raw model output before display.
# NOTE(review): presumably a currency conversion into rupees, given the symbol
# in the printed message; confirm the intended rate and its source.
PRICE_MULTIPLIER = 81

predicted_price = predict_carbon_credit_price(
    offset_method=offset_method,
    project_location=project_location,
    verification_status=verification_status,
    technology_used=technology_used,
    emission_reduction=emission_reduction,
    project_size=project_size,
)
predicted_price *= PRICE_MULTIPLIER
print(f'Predicted Carbon Credit Price: ₹{predicted_price:.2f}')

Predicted Carbon Credit Price: ₹4953.96


In [14]:
# Write the object currently bound to `model` to a second file using the
# standard-library pickle module. No separate label encoder is saved: the
# categorical encoding is a step inside the trained pipeline.
with open('carbonCreditPrice.pkl', mode='wb') as pkl_out:
    pickle.dump(model, pkl_out)
