Linear Regression to Predict Forest Carbon Gross Emissions

In [1]:
import pandas as pd

# Load the data from the Excel file.
# The path is relative, so the workbook has to sit in the notebook's working
# directory; otherwise read_excel raises FileNotFoundError.
tree_cover_data = pd.read_excel('tree_cover_1.xlsx')

# Show the first few rows to inspect the structure and contents.
# A bare expression is only rendered when it is the last statement of a cell,
# so display() is called explicitly here.
display(tree_cover_data.head())
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Predictor columns and target column used for the regression
features = [
    'umd_tree_cover_extent_2000__ha',
    'gfw_aboveground_carbon_stocks_2000__Mg_C',
    'avg_gfw_aboveground_carbon_stocks_2000__Mg_C_ha.1',
    'Land_data_arable',
    'GDP_value',
    'Population'
]
target = 'gfw_forest_carbon_gross_emissions__Mg_CO2e_yr.1'

# Design matrix and response vector
X = tree_cover_data[features]
y = tree_cover_data[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ordinary least squares baseline, fitted on the training split only
linear_regressor = LinearRegression()
linear_regressor.fit(X_train, y_train)

# Predict on the held-out rows
y_pred = linear_regressor.predict(X_test)

# Evaluate the model performance.
# RMSE is taken as the square root of the MSE: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

rmse, r2


FileNotFoundError: ignored

Clustering Analysis for Different Countries

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Features selected for clustering
cluster_features = [
    'umd_tree_cover_extent_2000__ha',
    'gfw_aboveground_carbon_stocks_2000__Mg_C',
    'GDP_value',
    'Population'
]

# Standardize the columns (zero mean, unit variance) so that no single feature
# dominates the Euclidean distances used by KMeans
scaler = StandardScaler()
X_scaled = scaler.fit_transform(tree_cover_data[cluster_features])

# Elbow method: fit KMeans for k = 1..10 and record the inertia of each fit
inertia = []
k_values = range(1, 11)

for k in k_values:
    # n_init is set explicitly: its default changed from 10 to 'auto' across
    # scikit-learn releases, so relying on the default makes the inertia
    # values version-dependent even with a fixed random_state.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

# Plot the elbow curve
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(k_values, inertia, 'o-')
ax.set_title('Elbow Method For Optimal k')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
ax.set_xticks(list(k_values))
ax.grid(True)
plt.show()


NameError: ignored

In [None]:
# seaborn is only needed for the pairplot at the end of this cell
import seaborn as sns

# Choosing 3 clusters based on the Elbow method.
# n_init is set explicitly because its default differs between scikit-learn
# releases (10 vs. 'auto'); pinning it keeps the cluster assignment
# reproducible together with random_state.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Fit the KMeans model to the scaled data
kmeans.fit(X_scaled)

# Assign each row of the scaled data to its nearest cluster centre
clusters = kmeans.predict(X_scaled)

# Attach the cluster label to the original dataframe for visualization.
# Note: this adds (or overwrites) the 'Cluster' column on tree_cover_data itself.
tree_cover_data['Cluster'] = clusters

# Subset of columns to plot, plus the cluster label used for colouring
plot_features = ['GDP_value', 'Population', 'Cluster']
pairplot_data = tree_cover_data[plot_features]

# Pairplot coloured by cluster assignment; the trailing semicolon suppresses
# the PairGrid repr so only the figure is rendered
sns.pairplot(pairplot_data, hue='Cluster', palette='viridis', diag_kind='kde');


Evaluate the Prediction Accuracy of Different Models

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Candidate regressors, all seeded identically so the comparison is reproducible
ridge_regressor = Ridge(random_state=42)
lasso_regressor = Lasso(random_state=42)
decision_tree_regressor = DecisionTreeRegressor(random_state=42)
random_forest_regressor = RandomForestRegressor(random_state=42)
gradient_boosting_regressor = GradientBoostingRegressor(random_state=42)

# (display name, estimator) pairs to iterate over
models = [
    ('Ridge', ridge_regressor),
    ('Lasso', lasso_regressor),
    ('Decision Tree', decision_tree_regressor),
    ('Random Forest', random_forest_regressor),
    ('Gradient Boosting', gradient_boosting_regressor)
]

# Maps model name -> {'RMSE': ..., 'R2': ...} measured on the test split
model_performance = {}

# Train each model on the training split and score it on the held-out split.
# Note: y_pred, rmse and r2 are notebook-level names and are overwritten on
# every iteration; after the loop they hold the values of the last model.
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # RMSE is taken as the square root of the MSE: the `squared=False` keyword
    # of mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    model_performance[name] = {'RMSE': rmse, 'R2': r2}

# One row per model, one column per metric, for easier viewing
model_performance_df = pd.DataFrame(model_performance).T
model_performance_df
