## Import Libraries

In [1]:
import pandas as pd   #for data frames, reading data, data processing, and analysis
import numpy as np    #for numerical computations
from sklearn.preprocessing import StandardScaler   #for scaling features
from sklearn.model_selection import train_test_split   # For splitting data into train and test
import keras    # Import keras Library for neural network implementation
from keras.models import Sequential    # class from keras and used for implementing the model in neural networks
from keras.layers import Dense    # class from keras and used to create fully connected layers in neural networks

## Read data and get insights about it

In [2]:
# Read data
data = pd.read_csv('forestfires.csv')

In [3]:
# Display data
data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [4]:
# Display int columns and their counts, mean, standard deviation, minimum, maximum,and three quantiles
data.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,4.669246,4.299807,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292
std,2.313778,1.2299,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818
min,1.0,2.0,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52
75%,7.0,5.0,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84


In [5]:
# Display each variable with the number of not null values and their data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


        There are no nulls in the data, all columns numeric except month and day which are object, total columns = 13, total rows = 517

In [6]:
# Count number of nulls in each column
data.isna().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

        There are no nulls in data

In [7]:
# Display shape of data (number of rows and columns)
data.shape

(517, 13)

        Data has 517 rows, 13 columns

In [8]:
# Display types of data
data.dtypes

X          int64
Y          int64
month     object
day       object
FFMC     float64
DMC      float64
DC       float64
ISI      float64
temp     float64
RH         int64
wind     float64
rain     float64
area     float64
dtype: object

        All variables are numeric except to columns [month and day] are object (String)

## Preprocessing

In [9]:
# Found that only two columns are found as a data type (object) 
# so i can transform it to numeric to easily deal with the data

# Convert the 'day' column to categorical
data['day'] = data['day'].astype('category')

# Convert categorical to int starting from 1
data['day'] = data['day'].cat.codes+1

# Convert the 'month' column to categorical
data['month'] = data['month'].astype('category')

# Convert categorical to int starting from 1
data['month'] = data['month'].cat.codes+1


In [10]:
# Display data after the encoding
data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,8,1,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,11,6,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,11,3,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,8,1,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,8,4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,2,4,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,2,4,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,2,4,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,2,3,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [11]:
# Display number of unique values in each column
data.nunique()

X          9
Y          7
month     12
day        7
FFMC     106
DMC      215
DC       219
ISI      119
temp     192
RH        75
wind      21
rain       7
area     251
dtype: int64

        column [area] has the most number of unique values

     Aslo from the description, i found that area and rain are skewed to 0
     so i checked using the value counts for each column

In [12]:
# Display the number of each unique value occured in the [area] column
data['area'].value_counts()

0.00      247
1.94        3
0.52        2
3.71        2
0.68        2
         ... 
105.66      1
154.88      1
196.48      1
200.94      1
11.16       1
Name: area, Length: 251, dtype: int64

        area is skewed towards the 0 as it's the most occured value

In [13]:
# Apply log transformation, it's used to handle skewed data that heavily concentrated around zero
# This function is used to calculate the logarithm of 1 plus the values column
data['area'] = np.log1p(data['area'])  

        Now column [area] is modified in the data and the values are suitable for modeling 

In [14]:
# Display the number of each unique value occured in the [area] column
data['rain'].value_counts()

0.0    509
0.2      2
0.8      2
1.0      1
6.4      1
0.4      1
1.4      1
Name: rain, dtype: int64

        rain is skewed towards the 0 as it's the most occured value

In [15]:
# Apply log transformation, it's used to handle skewed data that heavily concentrated around zero
# This function is used to calculate the logarithm of 1 plus the values column
data['rain'] = np.log1p(data['rain'])  

        Now column [rain] is modified in the data and the values are suitable for modeling 

In [16]:
# Put data in variable X excluding column"area"
X = data.drop('area',axis=1)

# Put column "area" in Y for prediction
Y = data['area']

## Normalize Data

In [17]:
# Scale all data features using StandardScaler
scaler = StandardScaler()
features_scaled = scaler.fit_transform(X)

# Print the scaled features
print("Scaled Features: ",features_scaled)

Scaled Features:  [[ 1.00831277  0.56986043  0.28422225 ...  0.41172435  1.49861442
  -0.09726116]
 [ 1.00831277 -0.24400101  0.97087134 ... -0.69245628 -1.74175564
  -0.09726116]
 [ 1.00831277 -0.24400101  0.97087134 ... -0.69245628 -1.51828184
  -0.09726116]
 ...
 [ 1.00831277 -0.24400101 -1.08907592 ...  1.57724834  1.49861442
  -0.09726116]
 [-1.58736044 -0.24400101 -1.08907592 ... -0.14036597 -0.00983371
  -0.09726116]
 [ 0.57570057 -1.05786246  0.74198831 ... -0.81514302  0.26950853
  -0.09726116]]


## Train-Test Split

In [18]:
# Splitting data into train and test with 80% train and 20% test
xTrain, xTest, yTrain, yTest = train_test_split(features_scaled, Y, test_size = 0.2, random_state = 50) 
print(xTrain,'\n______________________________\n', xTest,'\n______________________________\n', yTrain,'\n______________________________\n', yTest)

[[ 1.44092498  1.38372188 -1.08907592 ... -0.81514302 -0.23330751
  -0.09726116]
 [ 1.44092498  1.38372188 -0.86019289 ...  1.02515803  2.22490426
  -0.09726116]
 [-1.15474824 -1.87172391 -1.08907592 ... -1.36723333  0.26950853
  -0.09726116]
 ...
 [ 1.00831277 -0.24400101 -0.17354381 ... -0.87648638 -1.23893959
  -0.09726116]
 [-0.28952383  0.56986043  1.19975437 ...  0.71844119  0.77232458
  -0.09726116]
 [ 1.87353718  3.82530623 -0.17354381 ... -1.18320323  0.26950853
  -0.09726116]] 
______________________________
 [[ 0.57570057  0.56986043 -0.63130986 ...  2.31336876  1.27514062
  -0.09726116]
 [-1.15474824 -0.24400101 -1.08907592 ... -0.14036597  0.77232458
  -0.09726116]
 [ 1.44092498  1.38372188  1.19975437 ...  0.10500751 -0.73612355
  -0.09726116]
 ...
 [-1.15474824 -1.87172391 -1.08907592 ...  0.71844119 -0.51264975
  -0.09726116]
 [-1.58736044 -1.87172391 -0.17354381 ...  2.06799529  2.22490426
  -0.09726116]
 [-1.15474824 -0.24400101 -1.08907592 ...  0.90247129 -0.73612355

In [19]:
# Put number of features_scaled in a variable (n_cols)
n_cols = features_scaled.shape[1] 

## Implementing  Neural Network Model

In [20]:
# define neural network model
def neural_network_model():
    model = Sequential()
    # First layer with 64 nodes and number of features column as the shape of input data and relu activation function
    model.add(Dense(64,input_shape=(n_cols,) , activation = 'relu'))  
    
    # Second layer with 32 nodes and relu activation function
    model.add(Dense(32 , activation = 'relu'))  
    
    # Last layer with only one node that represents the output of the model
    model.add(Dense(1))   
     # compile model using 'adam optimizer' and mean squared error
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [21]:
# Call the function neural_network_model
neural = neural_network_model()

# Fit model on the train data with 50 epochs
Fit =neural.fit(xTrain,yTrain,epochs = 50)
Fit

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2b1007cdf40>

In [22]:
# Take the final training loss after completing the training on the training data for all epochs

# .history is object by the fit method that has list of all the loss values for each epoch
training_loss = Fit.history['loss'][-1]
print("Training Loss:", training_loss)

Training Loss: 1.1907284259796143


In [23]:
# Evaluate the loss in the test data
loss = neural.evaluate(xTest,yTest)
print('Test loss = ', loss)

Test loss =  2.0688490867614746


##### When i increase the hidden layers, number of nodes in each layer[ (128,64,32,8,1)  or (128,64,32,1) or (128,64,1) ], and number of epochs , the training loss decreased but the test loss increased and the difference between them increase leading to an overfitting data
##### When i decrease the hidden layers, number of nodes in each layer [ (32,16,8,1) or (16,8,1) ], and number of epochs, training loss increased but the difference between them decrease a little leading to an underfitting data


##### Now, the difference between the training loss and testing loss is small with 50 epochs and 3 hidden layers [64,32,1] And also the training loss is small
##### I kept changing the number of hidden layers, number of nodes, and number of epochs and that is the best numbers