In [None]:
# Updated function to create sequences
def create_sequences(data):
    grouped = data.groupby('Codigo_IBGE')
    sequences = []
    labels = []
    for _, group in grouped:
        # Sort the group by year ('Ano')
        group = group.sort_values('Ano')
        
        # Drop the 'Ano' and 'Codigo_IBGE' columns as they are not features for the LSTM
        group_data = group.drop(['Ano', 'Codigo_IBGE'], axis=1).values
        
        # Create sequences and labels
        for i in range(len(group_data) - 1):
            # Use all data up to year 'i' as the sequence to predict educational funding for year 'i+1'
            sequence = group_data[:i+1]
            label = group_data[i+1, -1]  # Assuming 'adjusted_funding' is the last column
            sequences.append(sequence)
            labels.append(label)
            
    return sequences, labels

In [None]:
# Generate sequences for training and test data
train_sequences, train_labels = create_sequences(train_data)
test_sequences, test_labels = create_sequences(test_data)

In [None]:
# Find the minimum and maximum sequence lengths for the training set
min_len_train = min(len(seq) for seq in train_sequences)
max_len_train = max(len(seq) for seq in train_sequences)


# Number of samples in the training and test sets
num_samples_train = len(train_sequences)
num_samples_test = len(test_sequences)

# Number of time steps (This would be variable in this case, so we take the maximum sequence length)
num_time_steps_train = max_len_train
num_time_steps_test = max(len(seq) for seq in test_sequences)

# Number of features (excluding 'Ano' and 'Codigo_IBGE')
num_features = train_data.shape[1] - 2

num_samples_train, num_time_steps_train, num_features, num_samples_test, num_time_steps_test


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad sequences using a constant value of 1000
train_sequences_padded = pad_sequences(train_sequences, padding='post', value=1000, dtype='float32')
test_sequences_padded = pad_sequences(test_sequences, padding='post', value=1000, dtype='float32')

# The resulting `train_sequences_padded` and `test_sequences_padded` will be numpy arrays
# that you can directly use for training your LSTM model.

In [None]:
from tensorflow.keras import Sequential, layers

# Model
model = Sequential()

# Masking layer to ignore padding
model.add(layers.Masking(mask_value=1000, input_shape=(num_time_steps_train, num_features)))

# Simple RNN layer
model.add(layers.SimpleRNN(units=2, activation='tanh'))

# Additional dense layers
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='linear'))

# Compilation
model.compile(loss='mae', optimizer='adam')

# Fit the model
# Convert train_labels to a NumPy array with dtype 'float32'
train_labels_array = np.array(train_labels, dtype='float32')

# Fit the model
model.fit(train_sequences_padded, train_labels_array)


In [None]:
model.summary()

In [None]:
# Import necessary libraries
import numpy as np

# Sample future data for prediction
future_data = {
    'Ano': [2021],
    'Codigo_IBGE': [1100015],
    'Aprovacao': [98.4],
    'Reprovacao': [1.6],
    'Abandono': [0],
    'Matriculas': [749],
    'Docentes': [71],
    'Estabelecimentos': [3],
    'Turmas': [45],
    'PIB': [28722.45],
    'Poverty_%': [8895900.54],
    'Unemployed_%': [19318.8],
    'Acesso a internet %': [0],
    'adjusted_population': [0]
}

# Convert the sample data to a DataFrame
future_df = pd.DataFrame(future_data)

# Scale the features using the same scaler object used for training data
future_df_scaled = future_df.copy()
future_df_scaled[columns_to_scale] = scaler.transform(future_df[columns_to_scale])

# Drop the 'Ano' and 'Codigo_IBGE' columns as they are not features for the LSTM
future_sequence = future_df_scaled.drop(['Ano', 'Codigo_IBGE'], axis=1).values

# Since the LSTM expects input shape [samples, time_steps, features], reshape the sequence accordingly
future_sequence_reshaped = np.expand_dims(future_sequence, axis=0)

# Use the LSTM model to make the prediction
predicted_value = model.predict(future_sequence_reshaped)

# The 'predicted_value' will contain the predicted 'adjusted_funding' for 2021 for the municipality with code 1100015


## RNN

### Class implementation

In [None]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import pandas as pd

class LSTMPipeline:

    def __init__(self):
        self.feature_scaler = None
        self.target_scaler = None
        self.model = None
        self.sequences_padded = None
        self.scaled_df = None

    def preprocess_data(self, df, target_column):
        df_cleaned = df.dropna().drop_duplicates()
        columns_to_scale = df_cleaned.columns.difference(['Ano', 'Codigo_IBGE', target_column])

        if self.feature_scaler is None:
            self.feature_scaler = MinMaxScaler()
        df_scaled = df_cleaned.copy()
        df_scaled[columns_to_scale] = self.feature_scaler.fit_transform(df_cleaned[columns_to_scale])
        
        if target_column in df_cleaned.columns:
            if self.target_scaler is None:
                self.target_scaler = MinMaxScaler()
            df_scaled[[target_column]] = self.target_scaler.fit_transform(df_cleaned[[target_column]])

        return df_scaled

    def create_sequences(self, data, target_column):
        grouped = data.groupby('Codigo_IBGE')
        sequences = []
        labels = []
        for _, group in grouped:
            group = group.sort_values('Ano')
            group_data = group.drop(['Ano', 'Codigo_IBGE', target_column], axis=1).values  # Drop the target column here
            #print(f"Shape of group_data for municipality {group['Codigo_IBGE'].iloc[0]}: {group_data.shape}")  # Debugging line
            group_labels = group[target_column].values  # Extract the labels (target column)
            for i in range(len(group_data) - 1):
                sequence = group_data[:i+1]
                label = group_labels[i+1]  # Use the label corresponding to the next timestep
                sequences.append(sequence)
                labels.append(label)
        return sequences, labels


    def pad_sequences(self, sequences, pad_value=1000):
        return pad_sequences(sequences, dtype='float32', padding='post', value=pad_value)

    def build_and_train_model(self, X, y):
        input_shape = (None, X.shape[-1])
        print(input_shape)
        model = Sequential()
        model.add(layers.Masking(mask_value=1000, input_shape=input_shape))
        model.add(layers.SimpleRNN(units=2, activation='tanh'))
        model.add(layers.Dense(10, activation='relu'))
        model.add(layers.Dense(1, activation='linear'))
        model.compile(loss='mae', optimizer='adam')
        y_array = np.array(y, dtype='float32')
        model.fit(X, y_array)
        self.sequences_padded = X
        self.model = model

    
    def get_last_sequence_for_municipality(self, municipality_code, sequences, data):
        data_filtered = data[data['Codigo_IBGE'] == municipality_code].sort_values('Ano')
        last_year = data_filtered['Ano'].max()
        municipality_sequences = [seq for seq, (_, group) in zip(sequences, data.groupby('Codigo_IBGE')) if group['Codigo_IBGE'].iloc[0] == municipality_code]
        if not municipality_sequences:
            return None  
        last_sequence = municipality_sequences[-1]
        return last_sequence



    def inverse_transform_prediction(self, prediction):
        if self.target_scaler is None:
            raise AttributeError("target_scaler has not been initialized. Make sure to preprocess the data with the target column first.")
        prediction = np.array(prediction).reshape(-1, 1)
        prediction_inverse_transformed = self.target_scaler.inverse_transform(prediction)
        return prediction_inverse_transformed[0][0]

    def predict(self, future_data):
        # Preprocess the future data
        scaled_future_data = self.preprocess_data(future_data, 'adjusted_funding')
        
        # Fetch the last sequence for each municipality in the future data
        predictions = {}
        for code in scaled_future_data['Codigo_IBGE'].unique():

            last_sequence = self.get_last_sequence_for_municipality(code, self.sequences_padded, self.scaled_df)
            
            # Extend the last sequence with the new data point
            future_point = scaled_future_data[scaled_future_data['Codigo_IBGE'] == code].drop(['Ano', 'Codigo_IBGE'], axis=1).values
    
            #print("Shape of future_point:", future_point.shape)
            
            extended_sequence = np.vstack([last_sequence, future_point])
            
            # Pad the sequence
            extended_sequence_padded = self.pad_sequences([extended_sequence])
            #print(f"Extended sequence shape: {extended_sequence.shape}")  # Debugging line
            #print(f"Padded sequence shape: {extended_sequence_padded.shape}")  # Debugging line
            
            # Make the prediction
            prediction = self.model.predict(extended_sequence_padded)
            prediction_inverse_transformed = self.inverse_transform_prediction(prediction)
            predictions[code] = prediction_inverse_transformed
            
        return predictions


    def save_model(self, filepath):
        if self.model:
            self.model.save(filepath)
        else:
            print("Model is not trained yet.")

    def load_saved_model(self, filepath):
        try:
            return tf.keras.models.load_model(filepath)
        except Exception as e:
            print(f"An error occurred while loading the model: {e}")


In [None]:
df = pd.read_csv("../raw_data/all_urban_ML2.csv")

# Example usage:
pipeline = LSTMPipeline()

# # Preprocessing and 
pipeline.scaled_df = pipeline.preprocess_data(df, 'adjusted_funding')

# #sequence creation
sequences, labels = pipeline.create_sequences(pipeline.scaled_df, 'adjusted_funding')

# # Sequence padding 
sequences_padded = pipeline.pad_sequences(sequences)

# # Model building and training
pipeline.build_and_train_model(sequences_padded, labels)

In [None]:
future_data = {
    'Ano': [2022],
    'Codigo_IBGE': [1100015],
    'Aprovacao': [98.4],
    'Reprovacao': [1.6],
    'Abandono': [0.0],
    'Matriculas': [749],
    'Docentes': [71],
    'Estabelecimentos': [3],
    'Turmas': [45],
    'PIB': [28722.45],
    'Poverty_%': [19.7],
    'Unemployed_%': [10.43],
    'Acesso a internet %': [81],
    'adjusted_population': [19318.8]
}

# Convert the sample data to a DataFrame
future_df = pd.DataFrame(future_data)

In [None]:
prediction = pipeline.predict(future_df)
prediction

In [None]:
filepath = "../models/model.h5"
pipeline.save_model("../models/model.h5")

In [None]:
mymodel = pipeline.load_saved_model(filepath)

In [None]:
mymodel.summary()

### Function implementaion

In [8]:
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras import Sequential, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import pandas as pd

# Global Variables
feature_scaler = None
target_scaler = None
model = None
sequences_padded = None
scaled_df = None

def preprocess_data(df, target_column):
    global feature_scaler
    global target_scaler

    df_cleaned = df.dropna().drop_duplicates()
    columns_to_scale = df_cleaned.columns.difference(['Ano', 'Codigo_IBGE', target_column])

    if feature_scaler is None:
        feature_scaler = MinMaxScaler()
    
    df_scaled = df_cleaned.copy()
    df_scaled[columns_to_scale] = feature_scaler.fit_transform(df_cleaned[columns_to_scale])
    
    if target_column in df_cleaned.columns:
        if target_scaler is None:
            target_scaler = MinMaxScaler()
        df_scaled[[target_column]] = target_scaler.fit_transform(df_cleaned[[target_column]])

    return df_scaled


def create_sequences(data, target_column):
    grouped = data.groupby('Codigo_IBGE')
    sequences = []
    labels = []

    for _, group in grouped:
        group = group.sort_values('Ano')
        group_data = group.drop(['Ano', 'Codigo_IBGE', target_column], axis=1).values
        group_labels = group[target_column].values
        for i in range(len(group_data) - 1):
            sequence = group_data[:i+1]
            label = group_labels[i+1]
            sequences.append(sequence)
            labels.append(label)

    return sequences, labels


def add_padding(sequences, pad_value=1000):
    return pad_sequences(sequences, dtype='float32', padding='post', value=pad_value)


def build_and_train_model(X, y):
    global model
    global sequences_padded

    input_shape = (None, X.shape[-1])
    model = Sequential()
    model.add(layers.Masking(mask_value=1000, input_shape=input_shape))
    model.add(layers.SimpleRNN(units=2, activation='tanh'))
    model.add(layers.Dense(10, activation='relu'))
    model.add(layers.Dense(1, activation='linear'))
    model.compile(loss='mae', optimizer='adam')
    y_array = np.array(y, dtype='float32')
    model.fit(X, y_array)
    sequences_padded = X


def get_last_sequence_for_municipality(municipality_code, sequences, data):
    data_filtered = data[data['Codigo_IBGE'] == municipality_code].sort_values('Ano')
    last_year = data_filtered['Ano'].max()
    municipality_sequences = [seq for seq, (_, group) in zip(sequences, data.groupby('Codigo_IBGE')) if group['Codigo_IBGE'].iloc[0] == municipality_code]
    if not municipality_sequences:
        return None
    last_sequence = municipality_sequences[-1]
    return last_sequence

def inverse_transform_prediction(prediction):
    global target_scaler
    if target_scaler is None:
        raise AttributeError("target_scaler has not been initialized. Make sure to preprocess the data with the target column first.")
    prediction = np.array(prediction).reshape(-1, 1)
    prediction_inverse_transformed = target_scaler.inverse_transform(prediction)
    return prediction_inverse_transformed[0][0]


def predict(future_data):
    global sequences_padded
    global scaled_df
    global model

    # Preprocess the future data
    scaled_future_data = preprocess_data(future_data, 'adjusted_funding')
    
    # Fetch the last sequence for each municipality in the future data
    predictions = {}
    for code in scaled_future_data['Codigo_IBGE'].unique():
        last_sequence = get_last_sequence_for_municipality(code, sequences_padded, scaled_df)
        
        # Extend the last sequence with the new data point
        future_point = scaled_future_data[scaled_future_data['Codigo_IBGE'] == code].drop(['Ano', 'Codigo_IBGE'], axis=1).values

        extended_sequence = np.vstack([last_sequence, future_point])
        
        # Pad the sequence
        extended_sequence_padded = pad_sequences([extended_sequence])
        
        # Make the prediction
        prediction = model.predict(extended_sequence_padded)
        prediction_inverse_transformed = inverse_transform_prediction(prediction)
        predictions[code] = prediction_inverse_transformed
        
    return predictions


def save_model(filepath):
    global model
    if model:
        model.save(filepath)
    else:
        print("Model is not trained yet.")


def load_saved_model(filepath):
    try:
        loaded_model = tf.keras.models.load_model(filepath)
        return loaded_model
    except Exception as e:
        print(f"An error occurred while loading the model: {e}")



In [13]:
df = pd.read_csv("../raw_data/all_urban_ML2.csv")

In [14]:
# # Preprocessing and 
scaled_df = preprocess_data(df, 'adjusted_funding')
# # Create sequences
sequences, labels = create_sequences(scaled_df, 'adjusted_funding')
# # Sequence padding 
sequences_padded = add_padding(sequences)
# # Model building and training
build_and_train_model(sequences_padded, labels)



In [15]:
future_data = {
    'Ano': [2022],
    'Codigo_IBGE': [1100015],
    'Aprovacao': [98.4],
    'Reprovacao': [1.6],
    'Abandono': [0.0],
    'Matriculas': [749],
    'Docentes': [71],
    'Estabelecimentos': [3],
    'Turmas': [45],
    'PIB': [28722.45],
    'Poverty_%': [19.7],
    'Unemployed_%': [10.43],
    'Acesso a internet %': [81],
    'adjusted_population': [19318.8]
}

# Convert the sample data to a DataFrame
future_df = pd.DataFrame(future_data)

In [16]:
prediction = predict(future_df)
prediction



{1100015: -165759360.0}

In [17]:
filepath = "../models/model.h5"
save_model("../models/model.h5")

  saving_api.save_model(


In [20]:
mymodel = load_saved_model(filepath)
mymodel.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_1 (Masking)         (None, None, 12)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 2)                 30        
                                                                 
 dense_2 (Dense)             (None, 10)                30        
                                                                 
 dense_3 (Dense)             (None, 1)                 11        
                                                                 
Total params: 71 (284.00 Byte)
Trainable params: 71 (284.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [22]:
# Preprocess the future data
scaled_future_data = preprocess_data(future_df, 'adjusted_funding')

# Fetch the last sequence for each municipality in the future data
predictions = {}
for code in scaled_future_data['Codigo_IBGE'].unique():
    last_sequence = get_last_sequence_for_municipality(code, sequences_padded, scaled_df)
    
    # Extend the last sequence with the new data point
    future_point = scaled_future_data[scaled_future_data['Codigo_IBGE'] == code].drop(['Ano', 'Codigo_IBGE'], axis=1).values

    extended_sequence = np.vstack([last_sequence, future_point])
    
    # Pad the sequence
    extended_sequence_padded = pad_sequences([extended_sequence])
    
    # Make the prediction
    prediction = mymodel.predict(extended_sequence_padded)
    prediction_inverse_transformed = inverse_transform_prediction(prediction)
    predictions[code] = prediction_inverse_transformed
    
predictions




{1100015: -165759360.0}