# Sweet windowed deep learning example from chatgpt

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Example data (30 years of scores)
data = np.array([80, 85, 90, 92, 95, 96, 100, 104, 108, 112, 115, 118, 120, 123, 125,
                 128, 130, 133, 135, 138, 140, 145, 150, 155, 160, 162, 165, 168, 170, 175])

# Function to create sliding windows
def create_windowed_data(data, window_size):
    X = []
    y = []
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])
        y.append(data[i + window_size])  # Predict the next year's score
    return np.array(X), np.array(y)

# Test different window sizes (5, 10, 15, 20 years)
window_sizes = [5, 10, 15, 20]

for window_size in window_sizes:
    X, y = create_windowed_data(data, window_size)
    
    # Split into training and testing data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    
    # Build the model
    model = Sequential()
    model.add(Dense(10, input_dim=window_size, activation='relu'))  # Window size as input dimension
    model.add(Dense(1))  # Output prediction
    
    # Compile and train the model
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs=100, verbose=0)
    
    # Evaluate the model on the test data
    loss = model.evaluate(X_test, y_test, verbose=0)
    print(f"Window Size {window_size}: Test Loss = {loss}")


2025-03-20 16:31:40.728822: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-20 16:31:40.755144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1742484700.789280    7885 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1742484700.799851    7885 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-20 16:31:40.833842: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Window Size 5: Test Loss = 157.21214294433594
Window Size 10: Test Loss = 1304.747802734375
Window Size 15: Test Loss = 30258.431640625
Window Size 20: Test Loss = 1.4807991981506348


In [10]:
import pandas as pd
df = pd.read_csv("../data/data-ml.csv")
df.head(5)

unique_combinations = df[['id', 'country']].drop_duplicates()
unique_combinations_list = [tuple(x) for x in unique_combinations.to_records(index=False)]
print(unique_combinations_list)

[('AFG', 'Afghanistan'), ('ALB', 'Albania'), ('DZA', 'Algeria'), ('AGO', 'Angola'), ('ARG', 'Argentina'), ('ARM', 'Armenia'), ('AUS', 'Australia'), ('AUT', 'Austria'), ('AZE', 'Azerbaijan'), ('BHS', 'Bahamas, The'), ('BHR', 'Bahrain'), ('BGD', 'Bangladesh'), ('BRB', 'Barbados'), ('BLR', 'Belarus'), ('BEL', 'Belgium'), ('BLZ', 'Belize'), ('BEN', 'Benin'), ('BTN', 'Bhutan'), ('BOL', 'Bolivia'), ('BIH', 'Bosnia and Herzegovina'), ('BWA', 'Botswana'), ('BRA', 'Brazil'), ('BRN', 'Brunei Darussalam'), ('BGR', 'Bulgaria'), ('BFA', 'Burkina Faso'), ('BDI', 'Burundi'), ('CPV', 'Cabo Verde'), ('KHM', 'Cambodia'), ('CMR', 'Cameroon'), ('CAN', 'Canada'), ('CAF', 'Central African Republic'), ('TCD', 'Chad'), ('CHL', 'Chile'), ('CHN', 'China'), ('COL', 'Colombia'), ('COM', 'Comoros'), ('COD', 'Congo, Dem. Rep.'), ('COG', 'Congo, Rep.'), ('CRI', 'Costa Rica'), ('CIV', "Cote d'Ivoire"), ('HRV', 'Croatia'), ('CUB', 'Cuba'), ('CYP', 'Cyprus'), ('CZE', 'Czechia'), ('DNK', 'Denmark'), ('DJI', 'Djibouti'),

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Load dataset
df = pd.read_csv("../data/data-ml.csv")

# Clean yearly_score (replace comma with dot and convert to float)
df["yearly_score"] = df["yearly_score"].astype(str).str.replace(",", ".").astype(float)

# Select data for a specific country (e.g., Afghanistan)
country_code = "BRA"  # Change as needed
df_country = df[df["id"] == country_code].sort_values(by="year")

# Extract relevant data
data = df_country["yearly_score"].values  # Convert to numpy array

# Function to create sliding windows
def create_windowed_data(data, window_size):
    X, y = [], []
    for i in range(len(data) - window_size):
        X.append(data[i:i + window_size])
        y.append(data[i + window_size])  # Predict the next year's score
    return np.array(X), np.array(y)

# Test different window sizes
window_sizes = [5, 10, 15, 20, 21, 22]

for window_size in window_sizes:
    X, y = create_windowed_data(data, window_size)

    # Split into training and testing data (chronologically)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Build the model
    model = Sequential()
    model.add(Dense(10, input_dim=window_size, activation='relu'))
    model.add(Dense(1))  # Output prediction

    # Compile and train the model
    model.compile(loss='mean_squared_error', optimizer='adam')
    model.fit(X_train, y_train, epochs=100, verbose=0)

    # Evaluate the model
    loss = model.evaluate(X_test, y_test, verbose=0)
    print(f"Window Size {window_size}: Test Loss = {loss}")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Window Size 5: Test Loss = 743.5593872070312
Window Size 10: Test Loss = 3.5897271633148193
Window Size 15: Test Loss = 1.6910756826400757
Window Size 20: Test Loss = 0.9291428923606873
Window Size 21: Test Loss = 7788.5712890625
Window Size 22: Test Loss = 0.21291375160217285


# Sweet deep learning example from chatgpt

In [None]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

# Example data (5 years of scores)
X = np.array([[80, 85, 90, 92, 95]])  # shape (1, 5) for 5 years of input
y = np.array([98])  # The target score for Year 6

# Define the model 
model = Sequential()
model.add(Dense(10, input_dim=5, activation='relu'))  # 5 input features (years)
model.add(Dense(1))  # 1 output (predicted score)

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Train the model
model.fit(X, y, epochs=100, verbose=1)

# Make a prediction for the next year (e.g., Year 6)
prediction = model.predict(X)
print("Predicted score for Year 6:", prediction)

2025-03-12 15:41:30.674279: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-12 15:41:30.679144: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-12 15:41:30.733738: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-12 15:41:30.770684: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1741790490.809295   39758 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1741790490.82

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-03-12 15:41:32.807632: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 567ms/step - loss: 3376.4109
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 3303.0483
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 3230.3059
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 3158.2021
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 3086.7539
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 3015.9783
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2945.8892
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 2876.5049
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - loss: 2807.8376
Epoch 10/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - loss: 

# Another try

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
import matplotlib.pyplot as plt

# Load dataset
df = pd.read_csv("../data/data.csv")
df.head(5)
# Pivot the data to have a time series per country
df = df.pivot(index=['Country', 'year'], columns='goal', values='Attainment').reset_index()

# Normalize attainment scores (0-1)
scaler = MinMaxScaler()
df.iloc[:, 2:] = scaler.fit_transform(df.iloc[:, 2:])

# Prepare Data for LSTM
countries = df['Country'].unique()
n_sdg = 17  # Number of SDGs
sequence_length = 10  # Use last 10 years to predict the next year



KeyError: 'goal'

In [None]:
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i : i + seq_length, :])  # Input sequence
        y.append(data[i + seq_length, :])  # Target (next year)
    return np.array(X), np.array(y)

# Split into training/testing sets per country
train_X, train_y, test_X, test_y = [], [], [], []

for country in countries:
    country_data = df[df['Country'] == country].iloc[:, 2:].values
    X, y = create_sequences(country_data, sequence_length)
    
    split_index = int(0.8 * len(X))  # 80% Train, 20% Test
    train_X.append(X[:split_index])
    train_y.append(y[:split_index])
    test_X.append(X[split_index:])
    test_y.append(y[split_index:])

# Convert lists to numpy arrays
train_X = np.vstack(train_X)
train_y = np.vstack(train_y)
test_X = np.vstack(test_X)
test_y = np.vstack(test_y)

print("Training Data Shape:", train_X.shape)  # (samples, 10 years, 17 SDGs)
print("Testing Data Shape:", test_X.shape)    # (samples, 10 years, 17 SDGs)


NameError: name 'countries' is not defined

In [None]:
# Define LSTM model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(sequence_length, n_sdg)),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(n_sdg, activation='sigmoid')  # Output 17 SDGs scaled 0-1
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.summary()


In [None]:
history = model.fit(train_X, train_y, epochs=50, batch_size=32, validation_data=(test_X, test_y))

# Plot loss
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.show()


In [None]:
future_predictions = {}

for country in countries:
    country_data = df[df['Country'] == country].iloc[:, 2:].values
    last_sequence = country_data[-sequence_length:].reshape(1, sequence_length, n_sdg)
    predicted_2030 = model.predict(last_sequence)
    
    # Rescale to original percentages
    predicted_2030 = scaler.inverse_transform(predicted_2030)[0]
    
    future_predictions[country] = predicted_2030

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(future_predictions, index=[f'SDG_{i+1}' for i in range(n_sdg)])
print(predictions_df)


In [None]:
predictions_df.to_csv("SDG_predictions_2030.csv")
