<a href="https://colab.research.google.com/github/basugautam/Reproducibility-Challenge-Project/blob/main/19_Pre_existing_Implementation_or_Custom_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive to access the file
from google.colab import drive
drive.mount('/content/drive')

# Import necessary library
import pandas as pd

# Provide the path to the file in Google Drive
file_path = '/content/drive/My Drive/timeseries_data.csv.csv'

# Read the CSV file
df = pd.read_csv(file_path)

# Display the first few rows of the data
df.head()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Dependency Ratio,Unnamed: 11,Unnamed: 12,Median Age,Unnamed: 14,Unnamed: 15
0,Name,GENC,Year,Total Population,Growth Rate,Population Density (per sq km),Total Fertility Rate,Life Expectancy at Birth,Under-5 Mortality Rate,Sex Ratio of the Population,Youth and Old Age (0-14 and 65+),Youth (0-14),Old Age (65+),Both Sexes,Male,Female
1,-> 2024,,,--,--,--,--,--,--,--,--,--,--,--,--,--
2,Canada,CA,2024,38904514,0.72,4.3,1.44,83.9,4.8,0.99,56.8,23.9,32.9,42.5,43.9,41.2
3,-> 2025,,,--,--,--,--,--,--,--,--,--,--,--,--,--
4,Canada,CA,2025,39187155,0.73,4.3,1.43,84.8,4.4,0.99,57.7,23.8,33.9,42.8,44.1,41.4


In [2]:
# ---------------------------------------------------------------
# a) WHY: This step is to understand the basic structure of the data
# b) HOW: Using .info(), .describe(), and .columns functions
# c) TERMS: .info() gives us column types and missing values
#           .describe() gives statistical summary of numerical data
#           .columns gives list of all column names
# d) GOAL: Helps us identify what kind of analysis or preprocessing is needed
# ---------------------------------------------------------------
print("DATAFRAME INFO:")
df.info()
print("\nDATAFRAME DESCRIPTION:")
print(df.describe())
print("\nCOLUMN NAMES:")
print(df.columns)

# ---------------------------------------------------------------
# a) WHY: To identify if there are any missing values
# b) HOW: We use the isnull().sum() function which returns number of missing entries per column
# c) TERMS: isnull() returns a boolean mask where values are missing, sum() aggregates those
# d) GOAL: So we can handle or clean the data before further analysis
# ---------------------------------------------------------------
print("\nMISSING VALUES:")
print(df.isnull().sum())

# ---------------------------------------------------------------
# a) WHY: To view time series trends and patterns
# b) HOW: Using line plots for visualizing changes over time
# c) TERMS: We use matplotlib and seaborn for plotting
# d) GOAL: Visual insights to spot patterns, seasonality or anomalies
# ---------------------------------------------------------------
# ---------------------------------------------------------------
# a) WHY: To visualize relationships between numerical variables
# b) HOW: Use seaborn heatmap on correlation matrix
# c) TERMS: corr() gives pairwise Pearson correlation of numerical columns
# d) GOAL: Spot highly correlated features that may impact modeling
# ---------------------------------------------------------------

import numpy as np       # ← FIXED: Now numpy is properly imported
import seaborn as sns
import matplotlib.pyplot as plt

# Filter only numeric columns to avoid conversion errors
numeric_df = df.select_dtypes(include=[np.number])

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix (Numeric Features Only)")
plt.tight_layout()
plt.show()

import matplotlib.pyplot as plt
import seaborn as sns

# Automatically adjust style and layout
sns.set(style='whitegrid')

# Assuming 'Date' column exists and is a time-related feature
# Convert 'Date' column to datetime format if present
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'])

    # Set 'Date' as index for time-series plotting
    df.set_index('Date', inplace=True)

    # Plot all columns over time
    df.plot(figsize=(12, 6), title="Time Series Data Over Time")
    plt.xlabel("Date")
    plt.ylabel("Values")
    plt.tight_layout()
    plt.show()
else:
    print("No 'Date' column found. Skipping time series plotting.")

# ---------------------------------------------------------------
# a) WHY: To get correlation between different columns (e.g., city temperatures)
# b) HOW: Using .corr() and heatmap to visualize relation
# c) TERMS: Correlation tells how two variables move together, heatmap is a visual matrix
# d) GOAL: Understand which factors are strongly related
# ---------------------------------------------------------------
plt.figure(figsize=(10, 6))
# ---------------------------------------------------------------
# a) WHY: We want to see how numerical features are related
# b) HOW: Using heatmap to show pairwise correlations
# c) TERMS: Correlation ranges [-1, 1] — closer to ±1 = stronger linear relationship
# d) GOAL: Understand feature relationships for modeling or EDA
# ---------------------------------------------------------------

import seaborn as sns
import matplotlib.pyplot as plt

# Filter only numeric columns to avoid conversion errors
numeric_df = df.select_dtypes(include=[np.number])

plt.figure(figsize=(10, 6))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix (Numeric Features Only)")
plt.tight_layout()
plt.show()

sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Matrix")
plt.show()


DATAFRAME INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        55 non-null     object
 1   Unnamed: 1        28 non-null     object
 2   Unnamed: 2        28 non-null     object
 3   Unnamed: 3        55 non-null     object
 4   Unnamed: 4        55 non-null     object
 5   Unnamed: 5        55 non-null     object
 6   Unnamed: 6        55 non-null     object
 7   Unnamed: 7        55 non-null     object
 8   Unnamed: 8        55 non-null     object
 9   Unnamed: 9        55 non-null     object
 10  Dependency Ratio  55 non-null     object
 11  Unnamed: 11       55 non-null     object
 12  Unnamed: 12       55 non-null     object
 13  Median Age        55 non-null     object
 14  Unnamed: 14       55 non-null     object
 15  Unnamed: 15       55 non-null     object
dtypes: object(16)
memory usage: 7.0+ KB

DATAFRAME D

ValueError: zero-size array to reduction operation fmin which has no identity

<Figure size 1000x600 with 0 Axes>

In [None]:
# --------------------------------------------
# a) WHY: To analyze patterns in time-series data using neural networks
# b) HOW: We will try two approaches:
#    (1) Pre-built Keras LSTM model
#    (2) Custom model using Keras Sequential API
# c) TERMS:
#   - LSTM: Long Short-Term Memory (a type of RNN good for time data)
#   - Sequential: A linear stack of layers in Keras
#   - Dense: Fully connected layer
# d) GOAL: Forecast future values based on time-series input
# --------------------------------------------

# Import necessary libraries
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# --------------------------------------------
# a) WHY: LSTM models need data normalized between 0 and 1
# b) HOW: Using MinMaxScaler
# c) TERMS: MinMaxScaler scales each feature to range [0, 1]
# d) GOAL: Make model training faster and more stable
# --------------------------------------------
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df.values)

# --------------------------------------------
# a) WHY: Time series needs past data to predict future
# b) HOW: Create sequences of 10 steps to predict the next value
# c) TERMS: X (input sequence), y (next time step value)
# d) GOAL: Prepare data for LSTM input format
# --------------------------------------------
def create_sequences(data, step=10):
    X, y = [], []
    for i in range(len(data) - step):
        X.append(data[i:i+step])
        y.append(data[i+step])
    return np.array(X), np.array(y)

X, y = create_sequences(scaled_data, step=10)

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# ---------------------------------------------------------------
# 1. USING PRE-BUILT MODEL ARCHITECTURE (FAST AND CONVENIENT)
# ---------------------------------------------------------------
# a) WHY: Quick to implement and often optimized already
# b) HOW: Use Sequential model with standard LSTM layers
# c) TERMS: LSTM(50) means 50 memory units, return_sequences=False means output single step
# d) GOAL: Forecast next values with minimal tuning
# ---------------------------------------------------------------
model_prebuilt = Sequential([
    LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dense(y.shape[1] if len(y.shape) > 1 else 1)
])
model_prebuilt.compile(optimizer='adam', loss='mse')
model_prebuilt.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# ---------------------------------------------------------------
# 2. CUSTOM MODEL (FOR FULL CONTROL OVER ARCHITECTURE)
# ---------------------------------------------------------------
# a) WHY: Allows tuning for special features (e.g., extra layers, activation)
# b) HOW: Stack LSTM layers, change units, activation, add Dropout etc.
# c) TERMS: Dropout prevents overfitting, multiple layers help learn complexity
# d) GOAL: Optimize the model for performance on our specific data
# ---------------------------------------------------------------
model_custom = Sequential()
model_custom.add(LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model_custom.add(LSTM(32))
model_custom.add(Dense(y.shape[1] if len(y.shape) > 1 else 1))
model_custom.compile(optimizer='adam', loss='mae')  # Mean Absolute Error
model_custom.fit(X_train, y_train, epochs=15, batch_size=32, validation_split=0.1)

# ---------------------------------------------------------------
# EVALUATION OF MODELS
# ---------------------------------------------------------------
# a) WHY: To compare predictions vs true values
# b) HOW: Evaluate MSE on test data
# c) TERMS: Mean Squared Error (loss function)
# d) GOAL: Determine which model performs better
# ---------------------------------------------------------------
from sklearn.metrics import mean_squared_error

pred_prebuilt = model_prebuilt.predict(X_test)
pred_custom = model_custom.predict(X_test)

mse_prebuilt = mean_squared_error(y_test, pred_prebuilt)
mse_custom = mean_squared_error(y_test, pred_custom)

print(f"Prebuilt Model MSE: {mse_prebuilt}")
print(f"Custom Model MSE: {mse_custom}")


In [None]:
# ---------------------------------------------------------------
# a) WHY: Visual comparison helps us intuitively see model accuracy
# b) HOW: Plot true values (y_test) and predictions from both models
# c) TERMS: plt.plot() is a line plot, helps in comparing trends
# d) GOAL: See how closely model forecasts follow the actual data
# ---------------------------------------------------------------
import matplotlib.pyplot as plt

# Limit to first 100 samples for clear visual comparison
num_samples = 100

plt.figure(figsize=(14, 6))

# Plot actual values
plt.plot(y_test[:num_samples], label='Actual', color='black', linewidth=2)

# Plot prebuilt model predictions
plt.plot(pred_prebuilt[:num_samples], label='Prebuilt LSTM Prediction', color='blue', linestyle='--')

# Plot custom model predictions
plt.plot(pred_custom[:num_samples], label='Custom LSTM Prediction', color='green', linestyle=':')

plt.title("Actual vs Predicted Time Series Values")
plt.xlabel("Time Step")
plt.ylabel("Scaled Value")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
