# Problem Statement: -

2. In California, annual forest fires can cause huge loss of wildlife, human life, and can cost billions of dollars in property damage. Local officials would like to predict the size of the burnt area in forest fires annually so that they can be better prepared in future calamities. 
Build a Support Vector Machines algorithm on the dataset and share your insights on it in the documentation. 
Note: - Size_ category is the output variable.

# 🔷 Business Objective:- 

To help California local officials predict the size of the burnt area in annual forest fires using the Support Vector Machines (SVM) algorithm.
Accurate predictions will enable better resource planning, early warning systems, and effective disaster management to minimize loss of life, wildlife, and property.



# 🔶 Business Constraints:-

1. Time-Sensitive – Predictions must be timely for disaster preparedness.
2. Data Availability – Accuracy depends on quality and completeness of historical fire data.
3. Cost Constraint – Must work within allocated budget for analytics and system deployment.
4. Model Simplicity – Outputs should be easy for officials to interpret and act upon.
5. Regulatory Compliance – Must ensure proper handling of sensitive geographical and environmental data.



In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv("forestfires.csv")
df.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


# Data Exploration:-

In [3]:
df.shape

(517, 31)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   month          517 non-null    object 
 1   day            517 non-null    object 
 2   FFMC           517 non-null    float64
 3   DMC            517 non-null    float64
 4   DC             517 non-null    float64
 5   ISI            517 non-null    float64
 6   temp           517 non-null    float64
 7   RH             517 non-null    int64  
 8   wind           517 non-null    float64
 9   rain           517 non-null    float64
 10  area           517 non-null    float64
 11  dayfri         517 non-null    int64  
 12  daymon         517 non-null    int64  
 13  daysat         517 non-null    int64  
 14  daysun         517 non-null    int64  
 15  daythu         517 non-null    int64  
 16  daytue         517 non-null    int64  
 17  daywed         517 non-null    int64  
 18  monthapr  

In [5]:
df.describe()

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthdec,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep
count,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,...,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0,517.0
mean,90.644681,110.87234,547.940039,9.021663,18.889168,44.288201,4.017602,0.021663,12.847292,0.16441,...,0.017408,0.038685,0.003868,0.061896,0.032882,0.104449,0.003868,0.001934,0.029014,0.332689
std,5.520111,64.046482,248.066192,4.559477,5.806625,16.317469,1.791653,0.295959,63.655818,0.371006,...,0.130913,0.193029,0.062137,0.241199,0.1785,0.306138,0.062137,0.04398,0.168007,0.471632
min,18.7,1.1,7.9,0.0,2.2,15.0,0.4,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,90.2,68.6,437.7,6.5,15.5,33.0,2.7,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,91.6,108.3,664.2,8.4,19.3,42.0,4.0,0.0,0.52,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,92.9,142.4,713.9,10.8,22.8,53.0,4.9,0.0,6.57,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,96.2,291.3,860.6,56.1,33.3,100.0,9.4,6.4,1090.84,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
df.columns

Index(['month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind',
       'rain', 'area', 'dayfri', 'daymon', 'daysat', 'daysun', 'daythu',
       'daytue', 'daywed', 'monthapr', 'monthaug', 'monthdec', 'monthfeb',
       'monthjan', 'monthjul', 'monthjun', 'monthmar', 'monthmay', 'monthnov',
       'monthoct', 'monthsep', 'size_category'],
      dtype='object')

# Data Preprocessing:-

In [7]:
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [8]:
categorical_columns=df.select_dtypes(include=['object']).columns
categorical_columns

Index(['month', 'day', 'size_category'], dtype='object')

In [9]:
#Initialize label Encoder
le=LabelEncoder()
le

In [10]:
#Apply Label Encoding to all categorical columns
df['month'] = le.fit_transform(df['month'])
df['day'] = le.fit_transform(df['day'])
df['size_category'] = le.fit_transform(df['size_category'])  # 'small'->0, 'large'->1

In [11]:
# Feature and target separation
X = df.drop(['size_category'], axis=1)
y = df['size_category']

In [12]:
# Normalize features
scaler = StandardScaler()

In [13]:
X_scaled = scaler.fit_transform(X)
X_scaled

array([[ 2.84222252e-01, -1.42312073e+00, -8.05959472e-01, ...,
        -4.40225453e-02, -1.72859706e-01, -7.06081245e-01],
       [ 9.70871338e-01,  1.17671466e+00, -8.10203395e-03, ...,
        -4.40225453e-02,  5.78503817e+00, -7.06081245e-01],
       [ 9.70871338e-01, -3.83186570e-01, -8.10203395e-03, ...,
        -4.40225453e-02,  5.78503817e+00, -7.06081245e-01],
       ...,
       [-1.08907592e+00,  1.36780508e-01, -1.64008316e+00, ...,
        -4.40225453e-02, -1.72859706e-01, -7.06081245e-01],
       [-1.08907592e+00, -3.83186570e-01,  6.80956663e-01, ...,
        -4.40225453e-02, -1.72859706e-01, -7.06081245e-01],
       [ 7.41988309e-01,  1.17671466e+00, -2.02087875e+00, ...,
         2.27156334e+01, -1.72859706e-01, -7.06081245e-01]])

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [15]:
#Split Data
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.2,random_state=42)


In [16]:
# Train SVM
svm_model = SVC(kernel='rbf')  # you can try 'linear' or 'poly' too
svm_model.fit(X_train, y_train)

In [17]:
# Evaluate
y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7596153846153846
[[ 4 24]
 [ 1 75]]
              precision    recall  f1-score   support

           0       0.80      0.14      0.24        28
           1       0.76      0.99      0.86        76

    accuracy                           0.76       104
   macro avg       0.78      0.56      0.55       104
weighted avg       0.77      0.76      0.69       104



# Linear Kernel

In [18]:
# Train SVM with linear kernel
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train, y_train)
y_pred_linear = svm_linear.predict(X_test)

In [19]:
# Evaluate
results = {
    "Linear Kernel": {
        "Accuracy": accuracy_score(y_test, y_pred_linear),
        "Confusion Matrix": confusion_matrix(y_test, y_pred_linear),
        "Classification Report": classification_report(y_test, y_pred_linear)
    }
}

In [20]:
results

{'Linear Kernel': {'Accuracy': 0.9038461538461539,
  'Confusion Matrix': array([[19,  9],
         [ 1, 75]], dtype=int64),
  'Classification Report': '              precision    recall  f1-score   support\n\n           0       0.95      0.68      0.79        28\n           1       0.89      0.99      0.94        76\n\n    accuracy                           0.90       104\n   macro avg       0.92      0.83      0.86       104\nweighted avg       0.91      0.90      0.90       104\n'}}