# BTP PROJECT

In [1]:
#Import library

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
import keras
import random

In [2]:

# Load the records (columns: time, lat, lon, sst) from 'output.csv' into a DataFrame.
data_1 = pd.read_csv('output.csv')

# Print the DataFrame; for a frame this large pandas shows a truncated preview
# (first and last rows) followed by the overall shape.
print(data_1)

                time     lat     lon        sst
0         2013-01-01  10.125  51.125  25.390000
1         2013-01-01  10.125  51.375  25.519999
2         2013-01-01  10.125  51.625  25.670000
3         2013-01-01  10.125  51.875  25.800000
4         2013-01-01  10.125  52.125  25.830000
...              ...     ...     ...        ...
14381206  2023-09-14  24.875  65.875  26.500000
14381207  2023-09-14  24.875  66.125  26.490000
14381208  2023-09-14  24.875  66.375  26.800000
14381209  2023-09-14  24.875  66.625  27.150000
14381210  2023-09-14  24.875  66.875  27.480000

[14381211 rows x 4 columns]


In [3]:
def create_sequences(data, target, sequence_length):
    """Build sliding-window samples for one-step-ahead prediction.

    Window ``i`` is ``data[i : i + sequence_length]`` and is paired with
    ``target[i + sequence_length]``, i.e. the target value located right
    after the window.

    Parameters
    ----------
    data : array-like, shape (n_rows, ...)
        Rows to cut into windows, in the order they should be consumed.
    target : array-like with at least ``n_rows`` entries
        Values to predict, positioned like the rows of ``data``.
    sequence_length : int
        Number of consecutive rows per window; must be >= 1.

    Returns
    -------
    X : np.ndarray, shape (n_windows, sequence_length, ...)
    y : np.ndarray, shape (n_windows, ...)
        ``n_windows`` equals ``max(len(data) - sequence_length, 0)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    IndexError
        If windows can be formed but ``target`` has fewer entries than ``data``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    data = np.asarray(data)
    target = np.asarray(target)
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Not enough rows for a single window: return empty arrays that keep
        # the expected rank, so callers can still read X.shape[1] / X.shape[2].
        empty_X = np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    if len(target) < len(data):
        # The last label is read at position len(data) - 1.
        raise IndexError("target must contain at least as many entries as data")

    X = np.stack([data[start:start + sequence_length] for start in range(n_windows)])
    # Labels are the contiguous run that starts right after the first window.
    y = target[sequence_length:sequence_length + n_windows].copy()
    return X, y

In [4]:


# Seed Python's RNG so the sampled coordinates are the same on every run.
random.seed(42)

# Distinct latitude and longitude values present in the dataset.
original_list = data_1['lat'].unique().tolist()
original_list_1 = data_1['lon'].unique().tolist()

# Number of values to draw per axis: 13/15 of the distinct values,
# truncated to an integer.
num_to_select = int((13/15) * len(original_list))
num_to_select_1 = int((13/15) * len(original_list_1))

# Draw without replacement: latitudes first, then longitudes (the order of the
# two calls matters for reproducibility under the fixed seed).
random_selection = random.sample(original_list, k=num_to_select)
random_selection_1 = random.sample(original_list_1, k=num_to_select_1)

In [5]:
# Show the latitude values drawn by random.sample in the previous cell.
random_selection

[20.125,
 11.875,
 10.375,
 21.875,
 14.375,
 13.875,
 13.625,
 12.125,
 24.125,
 11.625,
 20.875,
 22.875,
 18.625,
 11.375,
 19.375,
 16.875,
 10.625,
 24.375,
 21.625,
 13.375,
 23.375,
 18.125,
 20.625,
 18.875,
 13.125,
 22.125,
 16.625,
 19.875,
 17.125,
 14.625,
 23.125,
 16.375,
 21.125,
 10.125,
 16.125,
 20.375,
 15.625,
 24.875,
 12.625,
 17.375,
 11.125,
 22.625,
 15.375,
 10.875,
 22.375,
 17.625,
 21.375,
 17.875,
 19.125,
 12.375,
 14.875,
 18.375]

In [6]:
# Show the longitude values drawn by random.sample.
random_selection_1

[65.625,
 68.125,
 54.875,
 63.125,
 53.625,
 68.625,
 60.375,
 62.625,
 57.125,
 53.125,
 52.375,
 58.375,
 69.375,
 69.875,
 50.625,
 54.125,
 50.125,
 55.375,
 67.375,
 61.125,
 64.375,
 56.875,
 67.625,
 50.875,
 56.625,
 54.375,
 61.625,
 66.625,
 62.125,
 61.875,
 61.375,
 52.125,
 60.625,
 66.125,
 65.375,
 59.625,
 50.375,
 62.375,
 66.375,
 68.875,
 64.125,
 59.875,
 54.625,
 56.125,
 51.875,
 67.875,
 51.625,
 60.125,
 57.375,
 69.625,
 59.375,
 52.625,
 55.625,
 64.875,
 65.875,
 58.125,
 59.125,
 63.625,
 67.125,
 65.125,
 63.875,
 66.875,
 58.625,
 57.875,
 52.875,
 53.875,
 60.875,
 64.625,
 55.125]

In [7]:
# Order the rows by latitude, then longitude, so that all rows of one grid
# cell are contiguous.
df_sorted = data_1.sort_values(by=['lat', 'lon'])

In [8]:
# Remove every row whose latitude is one of the sampled values.
# The membership test is applied to the 'lat' column only: calling isin() on
# the whole DataFrame would also compare time / lon / sst cells against the
# latitude list and could drop rows on a coincidental match in another column.
# dropna() is kept so rows with missing values are still discarded.
df_sorted = df_sorted[~df_sorted['lat'].isin(random_selection)].dropna()

In [9]:
# Remove every row whose longitude is one of the sampled values.
# The membership test is applied to the 'lon' column only: calling isin() on
# the whole DataFrame would also compare time / lat / sst cells against the
# longitude list and could drop rows on a coincidental match in another column.
# dropna() is kept so rows with missing values are still discarded.
df_sorted = df_sorted[~df_sorted['lon'].isin(random_selection_1)].dropna()

In [10]:
# Group the remaining rows by grid cell; each group key is a (lat, lon) pair.
# NOTE(review): this groupby is created before a later cell adds the
# year/month/day columns and rescales lat/lon on df_sorted — confirm the groups
# are meant to be built from the frame in its current state.
grouped_data = df_sorted.groupby(['lat', 'lon'])

In [11]:
# Display the groupby object (only its repr is shown, not the grouped rows).
grouped_data

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002687D247610>

In [12]:
def mim_model(input_shape):
    """Create and compile a stacked two-layer LSTM network.

    Parameters
    ----------
    input_shape : tuple
        Forwarded unchanged as ``input_shape`` of the first LSTM layer
        (presumably ``(timesteps, n_features)`` — confirm with the caller).

    Returns
    -------
    A compiled ``Sequential`` model: LSTM(64) -> LSTM(32) -> Dense(1),
    optimised with Adam on mean-squared-error loss.

    NOTE(review): both LSTM layers use ``return_sequences=True``, so the final
    Dense layer receives the full sequence rather than only the last step. The
    training loop elsewhere in this notebook builds its model with a plain
    ``LSTM(32)`` instead — confirm which output shape is intended here.
    """
    layers = [
        LSTM(64, input_shape=input_shape, return_sequences=True),
        LSTM(32, return_sequences=True),
        Dense(1),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='mse')
    return network

In [13]:
# Copy of df_sorted without rows at the sampled latitudes (rows with missing
# values are dropped as well). The membership test targets the 'lat' column
# only, so values in other columns can never trigger a drop.
df = df_sorted[~df_sorted['lat'].isin(random_selection)].dropna()

In [15]:


# Parse the 'time' column once, then read the calendar components straight
# from the parsed column.
df_sorted['time'] = pd.to_datetime(df_sorted['time'])
df_sorted['year'] = df_sorted['time'].dt.year
df_sorted['month'] = df_sorted['time'].dt.month
df_sorted['day'] = df_sorted['time'].dt.day

# Min-max scale the coordinate and calendar columns; the scaler is fitted on
# every row currently in df_sorted. 'sst' is left unscaled.
scaler = MinMaxScaler()
df_sorted[['lon', 'lat', 'year', 'month', 'day']] = scaler.fit_transform(
    df_sorted[['lon', 'lat', 'year', 'month', 'day']]
)

# Model input columns and the number of consecutive rows per input window.
features = ['lat', 'lon', 'year', 'month', 'day', 'sst']
sequence_length = 10

# Train one LSTM per (lat, lon) grid cell.
# NOTE: `model`, `X_test` and `y_test` are rebound on every iteration, so after
# the loop they refer to the last trained group only.
for group, data_group in grouped_data:

    # Derive the calendar columns from this group's own timestamps instead of
    # re-parsing the full df_sorted['time'] column on every iteration.
    # assign() returns a new frame, so nothing is written into the groupby
    # slice, and the stable sort guarantees chronological windows.
    # The raw (unscaled) calendar values are needed for the month filter below
    # and, as before, replace any year/month/day columns already in the group.
    timestamps = pd.to_datetime(data_group['time'])
    data_group = data_group.assign(
        year=timestamps.dt.year,
        month=timestamps.dt.month,
        day=timestamps.dt.day,
    ).sort_values(['year', 'month', 'day'], kind='stable')

    # Months 1-10 of every year are used for training, months 11-12 for testing.
    data_train = data_group[data_group['month'].between(1, 10)]
    data_test = data_group[data_group['month'] >= 11]

    # Create sequences; the arrays are already (samples, sequence_length,
    # len(features)), so no reshape is required.
    X_train, y_train = create_sequences(data_train[features].values, data_train['sst'].values, sequence_length)
    X_test, y_test = create_sequences(data_test[features].values, data_test['sst'].values, sequence_length)

    # A group with too few rows yields no window at all; skip it instead of
    # failing inside fit().
    if len(X_train) == 0 or len(X_test) == 0:
        print("Skipping group with too few rows:", group)
        continue

    # Define and compile model
    model = Sequential()
    model.add(LSTM(64, input_shape=(sequence_length, len(features)), return_sequences=True))
    model.add(LSTM(32))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')

    # Train model with early stopping.
    # NOTE(review): the test windows double as validation data, so early
    # stopping is tuned on the same samples later used for evaluation.
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), callbacks=[early_stopping])

    # Print information about the group
    print("Latitude:", group[0])
    print("Longitude:", group[1])

# Number of (lat, lon) groups that were iterated over.
print(len(grouped_data))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Latitude: 12.875
Longitude: 51.125
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Latitude: 12.875
Longitude: 51.375
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Latitude: 12.875
Longitude: 53.375
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Latitude: 12.875
Longitude: 55.875
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Ep

In [16]:
# Predict with the `model` / `X_test` left over from the training loop, i.e.
# those of the last group that was processed.
predicted_sst = model.predict(X_test)



In [17]:
from sklearn.metrics import mean_squared_error

# Ground-truth targets and the corresponding model predictions for the
# evaluated test windows.
y_true = y_test
y_pred = predicted_sst

# Mean squared error between targets and predictions.
mse = mean_squared_error(y_true, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 1.3730192657892717


In [31]:
# Count test predictions whose squared error is at most 3**2, i.e. whose
# absolute error is at most 3 in the units of 'sst'.
# Both arrays are flattened so an (n, 1) prediction array lines up with the
# (n,) target array element by element.
squared_error = (np.ravel(y_pred) - np.ravel(y_test)) ** 2

# Number of evaluated samples, derived from the data rather than hard-coded.
no_of_test = squared_error.size
count = int(np.count_nonzero(squared_error <= 3 ** 2))
non_count = no_of_test - count

print(count)
print(non_count)

596
4


In [32]:
# Count test predictions whose squared error is at most 2**2, i.e. whose
# absolute error is at most 2 in the units of 'sst'.
# Both arrays are flattened so an (n, 1) prediction array lines up with the
# (n,) target array element by element.
squared_error = (np.ravel(y_pred) - np.ravel(y_test)) ** 2

# Number of evaluated samples, derived from the data rather than hard-coded.
no_of_test = squared_error.size
count = int(np.count_nonzero(squared_error <= 2 ** 2))
non_count = no_of_test - count

print(count)
print(non_count)

557
43


In [33]:
# Count test predictions whose squared error is at most 1**2, i.e. whose
# absolute error is at most 1 in the units of 'sst'.
# Both arrays are flattened so an (n, 1) prediction array lines up with the
# (n,) target array element by element.
squared_error = (np.ravel(y_pred) - np.ravel(y_test)) ** 2

# Number of evaluated samples, derived from the data rather than hard-coded.
no_of_test = squared_error.size
count = int(np.count_nonzero(squared_error <= 1 ** 2))
non_count = no_of_test - count

print(count)
print(non_count)

353
247


In [19]:
# Add integer year / month / day columns derived from the 'time' column of `data`.
# NOTE(review): no earlier cell assigns `data`; the only binding visible in this
# notebook is the loop variable of the `for group, data in grouped_data` cell —
# confirm which frame this cell is meant to operate on.
data['year'] = pd.to_datetime(data['time']).dt.year
data['month'] = pd.to_datetime(data['time']).dt.month
data['day'] = pd.to_datetime(data['time']).dt.day

In [32]:
def create_sequences(data, target, sequence_length):
    """Build sliding-window samples for one-step-ahead prediction.

    Window ``i`` is ``data[i : i + sequence_length]`` and is paired with
    ``target[i + sequence_length]``, i.e. the target value located right
    after the window.

    NOTE: a function with this name is also defined in an earlier cell of this
    notebook; whichever cell ran last is the definition in effect.

    Parameters
    ----------
    data : array-like, shape (n_rows, ...)
        Rows to cut into windows, in the order they should be consumed.
    target : array-like with at least ``n_rows`` entries
        Values to predict, positioned like the rows of ``data``.
    sequence_length : int
        Number of consecutive rows per window; must be >= 1.

    Returns
    -------
    X : np.ndarray, shape (n_windows, sequence_length, ...)
    y : np.ndarray, shape (n_windows, ...)
        ``n_windows`` equals ``max(len(data) - sequence_length, 0)``.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    IndexError
        If windows can be formed but ``target`` has fewer entries than ``data``.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    data = np.asarray(data)
    target = np.asarray(target)
    n_windows = len(data) - sequence_length

    if n_windows <= 0:
        # Not enough rows for a single window: return empty arrays that keep
        # the expected rank, so callers can still read X.shape[1] / X.shape[2].
        empty_X = np.empty((0, sequence_length) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + target.shape[1:], dtype=target.dtype)
        return empty_X, empty_y

    if len(target) < len(data):
        # The last label is read at position len(data) - 1.
        raise IndexError("target must contain at least as many entries as data")

    X = np.stack([data[start:start + sequence_length] for start in range(n_windows)])
    # Labels are the contiguous run that starts right after the first window.
    y = target[sequence_length:sequence_length + n_windows].copy()
    return X, y

sequence_length = 10  # number of consecutive rows per input window; adjust to the model's requirements
features = ['lat', 'lon', 'year', 'month', 'day', 'sst']

# Labels must come from the same frame as the feature windows. Taking them
# from the unfiltered `data` frame pairs each window with the 'sst' value at
# the same position of a different (longer) frame, i.e. with the wrong row.
X_train, y_train = create_sequences(data_train[features].values, data_train['sst'].values, sequence_length)
X_test, y_test = create_sequences(data_test[features].values, data_test['sst'].values, sequence_length)

In [33]:
# Inspect one training window (index 110): sequence_length rows, one column per feature.
X_train[110]

array([[  24.625   ,   66.625   , 2013.      ,    4.      ,   21.      ,
          27.42    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   22.      ,
          27.63    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   23.      ,
          27.99    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   24.      ,
          27.71    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   25.      ,
          27.109999],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   26.      ,
          27.289999],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   27.      ,
          27.67    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   28.      ,
          27.96    ],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   29.      ,
          27.689999],
       [  24.625   ,   66.625   , 2013.      ,    4.      ,   30.      ,
          27.84    ]])

In [34]:
# Make the (samples, timesteps, features) layout explicit; the last axis is
# sized by the number of feature columns.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], len(features))

In [None]:
# Make the (samples, timesteps, features) layout explicit; the last axis is
# sized by the number of feature columns.
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], len(features))

In [None]:
# Inspect training window 110 again after the reshape.
X_train[110]

array([[2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.1000000e+01, 2.1960000e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.2000000e+01, 2.1859999e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.3000000e+01, 2.2279999e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.4000000e+01, 2.2300000e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.5000000e+01, 2.2619999e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.6000000e+01, 2.2779999e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.7000000e+01, 2.2820000e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0140000e+03, 2.0000000e+00,
        2.8000000e+01, 2.2869999e+01],
       [2.4625000e+01, 6.6625000e+01, 2.0150000e+03, 1.0000000e+00,
        1.0000000e+00, 2.4109999e+01],
       [2.4625000e+01, 6.6625000e+01,

In [None]:
# Label-based row slices of `data`: '2018-01-01'..'2022-01-31' becomes
# data_train and '2022-02-01'..'2022-12-31' becomes data_os1.
# NOTE(review): slicing rows by date strings presumably expects `data` to be
# indexed by date; no cell visible in this notebook sets such an index — confirm.
# NOTE(review): this rebinds `data_train`, which other cells define as the
# month 1-10 subset of a group — confirm the intended definition.
data_train = data.loc['2018-01-01':'2022-01-31',:]
data_os1 = data.loc['2022-02-01':'2022-12-31',:]

In [None]:
for group, data in grouped_data:
    # 'group' contains the latitude and longitude values
    # 'data' contains the subset of DataFrame corresponding to that latitude and longitude
    print("Latitude:", group[0])
    print("Longitude:", group[1])


    data['year'] = pd.to_datetime(data['time']).dt.year
    data['month'] = pd.to_datetime(data['time']).dt.month
    data['day'] = pd.to_datetime(data['time']).dt.day


    data_train = data[(1 <= data['month']) & (data['month'] <= 10)]
    data_test = data[11 <= data['month']]