## TP1 part 2
Data taken from https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/  
Can we determine whether there was a fire based on day, DMC, temp, wind and rain?  
DMC (Duff Moisture Code) is a numerical rating of the average moisture content.

#### Loading libraries, packages & the data

In [1]:
# libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Handling missing data 
from sklearn.impute import SimpleImputer

# Encoding categorical Data
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Linear Regression
from sklearn.linear_model import LinearRegression

In [2]:
# Load data
# Reads the forest-fires CSV from the notebook's working directory.
fires_df_original = pd.read_csv('forestfires.csv')
# Make a copy of data
# Later cells reassign and modify `fires_df`; `fires_df_original` keeps the frame exactly as loaded.
fires_df = fires_df_original.copy()
# Plain-text dump of the frame (pandas abbreviates the middle rows).
print(fires_df)

X  Y month  day  FFMC    DMC     DC   ISI  temp  RH  wind  rain   area
0    7  5   mar  fri  86.2   26.2   94.3   5.1   8.2  51   6.7   0.0   0.00
1    7  4   oct  tue  90.6   35.4  669.1   6.7  18.0  33   0.9   0.0   0.00
2    7  4   oct  sat  90.6   43.7  686.9   6.7  14.6  33   1.3   0.0   0.00
3    8  6   mar  fri  91.7   33.3   77.5   9.0   8.3  97   4.0   0.2   0.00
4    8  6   mar  sun  89.3   51.3  102.2   9.6  11.4  99   1.8   0.0   0.00
..  .. ..   ...  ...   ...    ...    ...   ...   ...  ..   ...   ...    ...
512  4  3   aug  sun  81.6   56.7  665.6   1.9  27.8  32   2.7   0.0   6.44
513  2  4   aug  sun  81.6   56.7  665.6   1.9  21.9  71   5.8   0.0  54.29
514  7  4   aug  sun  81.6   56.7  665.6   1.9  21.2  70   6.7   0.0  11.16
515  1  4   aug  sat  94.4  146.0  614.7  11.3  25.6  42   4.0   0.0   0.00
516  6  3   nov  tue  79.5    3.0  106.7   1.1  11.8  31   4.5   0.0   0.00

[517 rows x 13 columns]


#### Checking the structure of the data & extract features  

In [3]:
# Summarise the frame: index range, per-column dtype and non-null count, memory usage.
fires_df.info()
# Report the (rows, columns) tuple on one line.
print(f'Dataset shape: {fires_df.shape}')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB
Dataset shape: (517, 13)


In [4]:
# checking description for 1 column
# Summary statistics (count, mean, std, min, quartiles, max) of the `temp` column.
fires_df.temp.describe()

count    517.000000
mean      18.889168
std        5.806625
min        2.200000
25%       15.500000
50%       19.300000
75%       22.800000
max       33.300000
Name: temp, dtype: float64

In [5]:
# List the top 5 rows
# Rich-display preview of the frame before any cleaning is applied.
fires_df.head(5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


#### Examining Missing Values

In [6]:
# Count the missing entries per column once; both summary columns are derived from this count.
null_counts = fires_df.isnull().sum()
total = null_counts.sort_values(ascending=False)
# Share of missing entries per column, as a percentage rounded to two decimals.
percent = (null_counts / len(fires_df) * 100).round(2).sort_values(ascending=False)
# Side-by-side table: absolute count and percentage, one row per column of the frame.
missing_data = pd.concat([total, percent], axis=1, keys=['Missing values', '% of Total values'])
missing_data

Unnamed: 0,Missing values,% of Total values
area,0,0.0
rain,0,0.0
wind,0,0.0
RH,0,0.0
temp,0,0.0
ISI,0,0.0
DC,0,0.0
DMC,0,0.0
FFMC,0,0.0
day,0,0.0


In [7]:
# Column labels of the working frame, in their on-disk order.
fires_df.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area'],
      dtype='object')

In [8]:
# (rows, columns) of the working frame before any cleaning step.
fires_df.shape

(517, 13)

#### Remove duplicate rows

In [9]:
# Shape before de-duplication, for comparison with the shape printed afterwards.
print(fires_df.shape)
# Rows that are identical in every column are collapsed to their first occurrence
# (the pandas default for `keep`).
fires_df = fires_df.drop_duplicates()
# A smaller row count here means duplicates were present and removed.
print(fires_df.shape)

(517, 13)
(513, 13)


#### Drop all rows with NaN values in their columns

In [10]:
# Columns that must be free of missing values (here: every column of the frame).
nan_cols = ['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']
# Work on an explicit copy so the column assignment below never writes into a
# frame derived from an earlier operation.
fires_df = fires_df.copy()
# Step 1: normalise the placeholder string "Nil" to a real NaN (nothing is dropped yet).
fires_df[nan_cols] = fires_df[nan_cols].replace("Nil", np.nan)
# Step 2: drop every row with a NaN in any of `nan_cols`. Reassigning the result
# (instead of inplace=True) keeps the cell a plain input -> output transformation.
fires_df = fires_df.dropna(subset=nan_cols)
fires_df.shape

(513, 13)

#### Dropping irrelevant columns

In [11]:
# Keep only the columns used from here on: the candidate predictors plus `area`.
# Selecting by name instead of dropping the other eight columns makes the cell safe to
# re-run (a second `drop` of already-removed columns raises KeyError) and pins `area`
# as the last column, which the positional feature/target split relies on.
# NOTE(review): the introduction lists `rain` among the predictors, but it is not kept
# here — confirm which is intended.
fires_df = fires_df.loc[:, ['day', 'DMC', 'temp', 'wind', 'area']].copy()

In [12]:
# Confirm the column count after removing the unused columns.
fires_df.shape

(513, 5)

In [13]:
# Preview the reduced frame: remaining predictors followed by `area`.
fires_df.head(5)

Unnamed: 0,day,DMC,temp,wind,area
0,fri,26.2,8.2,6.7,0.0
1,tue,35.4,18.0,0.9,0.0
2,sat,43.7,14.6,1.3,0.0
3,fri,33.3,8.3,4.0,0.0
4,sun,51.3,11.4,1.8,0.0


In [14]:
# Summary statistics of the `area` column, which is used as `y` in the feature-extraction step.
fires_df.area.describe()

count     513.000000
mean       12.891598
std        63.892586
min         0.000000
25%         0.000000
50%         0.540000
75%         6.570000
max      1090.840000
Name: area, dtype: float64

#### Extract features

In [15]:
# Positional split: every column except the last one forms the feature matrix,
# and the last column is the target vector.
x = fires_df.iloc[:, :-1].to_numpy()
y = fires_df.iloc[:, -1].to_numpy()

In [16]:
# Feature matrix before encoding: object-dtype rows holding the day label and the numeric columns.
x

array([['fri', 26.2, 8.2, 6.7],
       ['tue', 35.4, 18.0, 0.9],
       ['sat', 43.7, 14.6, 1.3],
       ...,
       ['sun', 56.7, 21.2, 6.7],
       ['sat', 146.0, 25.6, 4.0],
       ['tue', 3.0, 11.8, 4.5]], dtype=object)

In [17]:
# Target vector taken from the last column of the frame.
y

array([0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.000

#### Applying one hot encoder

In [18]:
# One-hot encode column 0 (the day label) and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense result: with the default (0.3) ColumnTransformer may
# return a SciPy sparse matrix when the stacked output is sparse enough, and np.array()
# on a sparse matrix yields a 0-d object array instead of the 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# NOTE: this overwrites `x`, so re-running this cell without first re-running the
# feature-extraction cell would try to encode the already-encoded matrix.
x = np.array(ct.fit_transform(x))

In [19]:
# Encoded feature matrix: the one-hot columns come first, followed by the passthrough columns.
x

array([[1.0, 0.0, 0.0, ..., 26.2, 8.2, 6.7],
       [0.0, 0.0, 0.0, ..., 35.4, 18.0, 0.9],
       [0.0, 0.0, 1.0, ..., 43.7, 14.6, 1.3],
       ...,
       [0.0, 0.0, 0.0, ..., 56.7, 21.2, 6.7],
       [0.0, 0.0, 1.0, ..., 146.0, 25.6, 4.0],
       [0.0, 0.0, 0.0, ..., 3.0, 11.8, 4.5]], dtype=object)

#### Split data into train & test

In [20]:
# Hold out 20% of the rows for testing; the fixed random_state makes the shuffle,
# and therefore the split, reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

#### Feature Scaling

In [23]:
# Training features before scaling (still the object-dtype array with raw values).
x_train

array([[0.0, 0.0, 0.0, ..., 108.3, 16.6, 5.4],
       [1.0, 0.0, 0.0, ..., 13.2, 12.3, 0.9],
       [0.0, 0.0, 0.0, ..., 276.3, 21.9, 4.0],
       ...,
       [0.0, 0.0, 0.0, ..., 49.5, 28.0, 4.5],
       [0.0, 0.0, 1.0, ..., 121.1, 25.1, 4.0],
       [1.0, 0.0, 0.0, ..., 41.5, 11.3, 5.4]], dtype=object)

In [24]:
# Learn each column's mean and standard deviation from the training rows only, so that
# no statistics from the test rows leak into the scaling.
# NOTE(review): the one-hot columns are standardised together with the numeric ones —
# confirm this is intended.
sc_x = StandardScaler().fit(x_train)
# Apply the same training-derived scaling to both splits.
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [25]:
# Training features after standardisation (now a float array).
x_train

array([[-0.4380188 , -0.39361095, -0.4380188 , ..., -0.01895648,
        -0.36976457,  0.75898409],
       [ 2.28300705, -0.39361095, -0.4380188 , ..., -1.48200522,
        -1.09088728, -1.71377883],
       [-0.4380188 , -0.39361095, -0.4380188 , ...,  2.56560912,
         0.5190611 , -0.01031993],
       ...,
       [-0.4380188 , -0.39361095, -0.4380188 , ..., -0.92355444,
         1.54204913,  0.2644315 ],
       [-0.4380188 , -0.39361095,  2.28300705, ...,  0.1779628 ,
         1.05571056, -0.01031993],
       [ 2.28300705, -0.39361095, -0.4380188 , ..., -1.046629  ,
        -1.25859023,  0.75898409]])