In [1]:
#Importing libraries 
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential, layers
from tensorflow.keras.layers import Dense, SimpleRNN
from tensorflow.keras.callbacks import EarlyStopping

In [72]:
data = pd.read_csv('./raw_data/treated.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,0,Afghanistan,2007,6.56,AFG,2.9
1,1,Afghanistan,1995,7.61,AFG,1.9
2,2,Afghanistan,2008,6.37,AFG,3.0
3,3,Afghanistan,2017,4.63,AFG,3.8
4,4,Afghanistan,1994,7.57,AFG,1.8


In [3]:
data

Unnamed: 0.1,Unnamed: 0,Country,Year,fertility,Code,avg_years_of_schooling
0,0,Afghanistan,2007,6.56,AFG,2.9
1,1,Afghanistan,1995,7.61,AFG,1.9
2,2,Afghanistan,2008,6.37,AFG,3.0
3,3,Afghanistan,2017,4.63,AFG,3.8
4,4,Afghanistan,1994,7.57,AFG,1.8
...,...,...,...,...,...,...
5121,5121,Zimbabwe,2010,4.03,ZWE,7.3
5122,5122,Zimbabwe,2000,3.75,ZWE,6.5
5123,5123,Zimbabwe,2017,3.71,ZWE,8.2
5124,5124,Zimbabwe,2002,3.72,ZWE,6.9


In [42]:
def preproc(data:pd.DataFrame) -> pd.DataFrame:
    '''
    Fines adjustments on dataset
    '''
    #Removing columns
    data.drop(columns=['Unnamed: 0', 'Code'], inplace=True)

    #Ordering by year and set it as index
    data.sort_values('Year', inplace=True)
    data.set_index('Year', inplace=True)
    
    return data

In [43]:
df = preproc(data)
df.head()

Unnamed: 0_level_0,Country,fertility,avg_years_of_schooling
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Germany,2.37,7.53
1960,Thailand,6.15,2.07
1960,Kazakhstan,4.56,3.07
1960,Vietnam,6.35,2.01
1960,Kenya,7.95,1.21


In [103]:
def list_X_y(data:pd.DataFrame) -> list:
    '''
    Given a countries dataset, this function returns
    two list of dataframes, ie., lists containing one dataframe per country.
    '''
    countries = data.Country.unique().tolist()

    X = []
    y = []
    
    new_df = pd.DataFrame()

    for country in countries:
        new_df = data[data['Country']==country][['fertility', 'avg_years_of_schooling']]
        
        if new_df.shape[0] == 34:
            X.append(new_df.head(33))
            y.append(new_df['avg_years_of_schooling'].tail(1))
        else:
            pass
         
    return X, y

In [130]:
X, y = list_X_y(data)

In [131]:
#X_new = np.array([np.array(X[0]), np.array(X[1]), np.array(X[2]), 
#                  np.array(X[3]), np.array(X[4])]).astype(np.float32)

In [132]:
X_new = np.array(X)

In [134]:
y_new = np.array(y)

In [135]:
y_new = np.expand_dims(y_new.astype(np.float32), axis=-1)
y_new.shape

(132, 1, 1)

In [136]:
#Splits into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, y_new, test_size=0.20)

In [137]:
model = Sequential()
model.add(SimpleRNN(units=20, activation='tanh'))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="linear"))

In [151]:
model.compile(loss='mse', 
              optimizer='adam',
                metrics=['mae']) #change to adam

In [152]:
es = EarlyStopping(patience=10)

model.fit(X_train, y_train,
          validation_split = 0.2,
          callbacks=[es],
          epochs=100, 
          verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


<keras.callbacks.History at 0x1619a76d0>

In [155]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_5 (SimpleRNN)    (None, 20)                460       
                                                                 
 dense_10 (Dense)            (None, 10)                210       
                                                                 
 dense_11 (Dense)            (None, 1)                 11        
                                                                 
Total params: 681
Trainable params: 681
Non-trainable params: 0
_________________________________________________________________


In [153]:
history = model.evaluate(X_test, y_test)



In [150]:
history[0]

TypeError: 'float' object is not subscriptable

# Testing with the dataframe all in once

In [None]:
df.drop(columns='Country', inplace=True)

In [None]:
X = df[['fertility','avg_years_of_schooling']]
y = df['avg_years_of_schooling']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

es = EarlyStopping(patience=5)

model = Sequential()
model.add(SimpleRNN(units=20, activation='tanh'))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="linear"))

model.compile(loss='mse', 
              optimizer='adam') #change to adam

model.fit(X_train, y_train,
          epochs=10, 
          verbose=0)

In [None]:
history = model.evaluate(X_)

In [29]:
df[df['Country']=='Japan'].head(5)

Unnamed: 0_level_0,Country,fertility,avg_years_of_schooling
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Japan,2.0,6.97
1965,Japan,2.14,6.94
1970,Japan,2.14,7.08
1975,Japan,1.91,7.9
1980,Japan,1.75,8.71


In [58]:
# --- SEQUENCE A (brazil)

day_1 = [6.06, 2.49]  # OBSERVATION 1 [fertility, avg_years_of_schooling, pollution]
day_2 = [5.70, 2.77]  # OBSERVATION 2 [temperature, speed, pollution]
day_3 = [ 4.97,  3.09]  # OBSERVATION 3 [temperature, speed, pollution]
day_4 = [ 4.42,  2.82]  # OBSERVATION 4 [temperature, speed, pollution]

sequence_a = [day_1, day_2, day_3, day_4]

y_a = [2.93]

day_1b = [2, 6.97]  # OBSERVATION 1 [fertility, avg_years_of_schooling, pollution]
day_2b = [2.14, 6.94]  # OBSERVATION 2 [temperature, speed, pollution]
day_3b = [2.14, 7.08]  # OBSERVATION 3 [temperature, speed, pollution]
day_4b = [1.91, 7.9]  # OBSERVATION 4 [temperature, speed, pollution]

sequence_b = [day_1b, day_2b, day_3b, day_4b]

y_b = [8.71]

In [63]:
df.head()

Unnamed: 0_level_0,Country,fertility,avg_years_of_schooling
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Germany,2.37,7.53
1960,Thailand,6.15,2.07
1960,Kazakhstan,4.56,3.07
1960,Vietnam,6.35,2.01
1960,Kenya,7.95,1.21


In [60]:
X = np.array([sequence_a, sequence_b]).astype(np.float32)
y = np.expand_dims(np.array([y_a, y_b]).astype(np.float32), axis=-1)

In [61]:
fert

array([[2.37, 2.5 , 2.03, ..., 1.5 , 1.6 , 1.57],
       [6.15, 6.13, 5.6 , ..., 1.54, 1.54, 1.53],
       [4.56, 4.05, 3.61, ..., 2.74, 2.77, 2.75],
       ...,
       [6.49, 5.77, 4.91, ..., 1.68, 1.66, 1.64],
       [2.23, 2.25, 2.4 , ..., 1.33, 1.38, 1.35],
       [4.03, 3.54, 3.16, ..., 1.99, 1.87, 1.81]])

In [62]:
X.shape

(2, 4, 2)

In [33]:
y.shape

(2, 1, 1)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

es = EarlyStopping(patience=5)

model = Sequential()
model.add(SimpleRNN(units=20, activation='tanh'))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="linear"))

model.compile(loss='mse', 
              optimizer='adam') #change to adam

model.fit(X_train, y_train,
          epochs=10, 
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x28e6299f0>

In [73]:
df = preproc(data)
df.head()

Unnamed: 0_level_0,Country,fertility,avg_years_of_schooling
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1960,Germany,2.37,7.53
1960,Thailand,6.15,2.07
1960,Kazakhstan,4.56,3.07
1960,Vietnam,6.35,2.01
1960,Kenya,7.95,1.21


In [104]:
X, y = list_X_y(data)

In [105]:
np.array(X[10])

array([[6.52, 0.35],
       [6.79, 0.42],
       [7.08, 0.48],
       [7.28, 1.02],
       [7.22, 1.72],
       [6.81, 2.37],
       [6.19, 2.99],
       [6.07, 3.1 ],
       [5.96, 3.2 ],
       [5.86, 3.3 ],
       [5.77, 3.4 ],
       [5.7 , 3.5 ],
       [5.63, 3.6 ],
       [5.57, 3.7 ],
       [5.52, 3.8 ],
       [5.47, 3.9 ],
       [5.42, 4.  ],
       [5.37, 4.  ],
       [5.33, 4.1 ],
       [5.28, 4.1 ],
       [5.23, 4.2 ],
       [5.18, 4.2 ],
       [5.12, 4.3 ],
       [5.06, 4.3 ],
       [5.  , 4.3 ],
       [4.94, 4.3 ],
       [4.87, 4.3 ],
       [4.8 , 4.4 ],
       [4.73, 4.4 ],
       [4.66, 4.5 ],
       [4.59, 4.7 ],
       [4.52, 4.7 ],
       [4.45, 4.8 ]])

In [78]:
sequence_a

[[6.06, 2.49], [5.7, 2.77], [4.97, 3.09], [4.42, 2.82]]

In [102]:
X[0].head(33)

Unnamed: 0_level_0,fertility,avg_years_of_schooling
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1960,2.37,7.53
1965,2.5,7.68
1970,2.03,7.71
1975,1.45,7.58
1980,1.44,7.63
1985,1.37,7.55
1990,1.45,8.77
1991,1.33,8.9
1992,1.29,9.1
1993,1.28,9.3


In [106]:
X_new = np.array([np.array(X[0]), np.array(X[1]), np.array(X[2]), np.array(X[3]), np.array(X[4])]).astype(np.float32)

In [107]:
X_new.shape

(5, 33, 2)

In [111]:
np.array(y).shape

(132, 1)

In [112]:
y_new = np.expand_dims(
    np.array([np.array(y[0]), np.array(y[1]), np.array(y[2]), 
              np.array(y[3]), np.array(y[4])]).astype(np.float32), axis=-1)

In [113]:
y_new.shape

(5, 1, 1)

In [115]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

es = EarlyStopping(patience=5)

model = Sequential()
model.add(SimpleRNN(units=20, activation='tanh'))
model.add(Dense(10, activation="relu"))
model.add(Dense(1, activation="linear"))

model.compile(loss='mse', 
              optimizer='adam') #change to adam

model.fit(X_new, y_new,
          epochs=10, 
          verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x160547fd0>