In [1]:
import warnings
warnings.filterwarnings("ignore")

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

### 1.importing data

In [3]:
data = pd.read_csv("forestfires.csv")
data.head()

Unnamed: 0,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,...,0,0,0,0,1,0,0,0,0,small
1,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,...,0,0,0,0,0,0,0,1,0,small
2,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,...,0,0,0,0,0,0,0,1,0,small
3,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,...,0,0,0,0,1,0,0,0,0,small
4,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,...,0,0,0,0,1,0,0,0,0,small


### 2.1 getting data ready

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   month          517 non-null    object 
 1   day            517 non-null    object 
 2   FFMC           517 non-null    float64
 3   DMC            517 non-null    float64
 4   DC             517 non-null    float64
 5   ISI            517 non-null    float64
 6   temp           517 non-null    float64
 7   RH             517 non-null    int64  
 8   wind           517 non-null    float64
 9   rain           517 non-null    float64
 10  area           517 non-null    float64
 11  dayfri         517 non-null    int64  
 12  daymon         517 non-null    int64  
 13  daysat         517 non-null    int64  
 14  daysun         517 non-null    int64  
 15  daythu         517 non-null    int64  
 16  daytue         517 non-null    int64  
 17  daywed         517 non-null    int64  
 18  monthapr  

In [5]:
#since month and day columns are already encoded. so we have to drop these columns.
data.drop(['month'], axis=1, inplace=True)
data.drop(['day'], axis=1, inplace=True)
data.head() 

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,...,0,0,0,0,1,0,0,0,0,small
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,small
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,small
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,...,0,0,0,0,1,0,0,0,0,small
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,small


In [6]:
data['size_category'].value_counts()

small    378
large    139
Name: size_category, dtype: int64

In [7]:
#size_category is object data type. let encode this.
label_encoder = preprocessing.LabelEncoder()
data['size_category']= label_encoder.fit_transform(data['size_category'])
data.head()                                                             #small = 1 , large = 0 ; we assign these alphabetically

Unnamed: 0,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,dayfri,...,monthfeb,monthjan,monthjul,monthjun,monthmar,monthmay,monthnov,monthoct,monthsep,size_category
0,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0,1,...,0,0,0,0,1,0,0,0,0,1
1,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,1
2,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0,0,...,0,0,0,0,0,0,0,1,0,1
3,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0,1,...,0,0,0,0,1,0,0,0,0,1
4,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0,0,...,0,0,0,0,1,0,0,0,0,1


### 3. slpitting the data into x and y

In [18]:
x=data.iloc[:,0:28]
y=data.iloc[:,28]

In [19]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3)

### 4. GridSearchCV (finding the best possible value for hyperparameters)

#### a) initial try

In [20]:
clf = SVC()
param_grid = [{'kernel':['rbf'],'gamma':[50,5,10,0.5],'C':[15,14,13,12,11,10,0.1,0.001] }]  #these are randomly given values
gsv = GridSearchCV(clf,param_grid,cv=10)                                                    #CV stands for cross validation
gsv.fit(x_train,y_train)

#we can try with some other kernels also.
#just move cursor to 'rbf' -- shift + tab + tab -- from there select the kernel you want
#we have to do multiple test to find best possible value
#GridSearchCV automatically build a model with the help of cross validation technique (kfold) to predict best possible value.

GridSearchCV(cv=10, estimator=SVC(),
             param_grid=[{'C': [15, 14, 13, 12, 11, 10, 0.1, 0.001],
                          'gamma': [50, 5, 10, 0.5], 'kernel': ['rbf']}])

In [21]:
gsv.best_params_ , gsv.best_score_ 

#so best possible C = 15 , gamma = 0.5 , kernel = rbf with accuracy = 73.96%

({'C': 15, 'gamma': 0.5, 'kernel': 'rbf'}, 0.734009009009009)

#### b) iterative step

In [28]:
clf = SVC()
param_grid = [{'kernel':['rbf'],'gamma':[0,0.01,0.001,0.1,0.2,0.5],'C':[0,0.01,0.001,0.1,0.5,1,1.5,2] }]  #these are randomly given values
gsv = GridSearchCV(clf,param_grid,cv=10)                                                    #CV stands for cross validation
gsv.fit(x_train,y_train)

#we can try with some other kernels also.
#just move cursor to 'rbf' -- shift + tab + tab -- from there select the kernel you want
#we have to do multiple test to find best possible value
#GridSearchCV automatically build a model with the help of cross validation technique (kfold) to predict best possible value.

GridSearchCV(cv=10, estimator=SVC(),
             param_grid=[{'C': [0, 0.01, 0.001, 0.1, 0.5, 1, 1.5, 2],
                          'gamma': [0, 0.01, 0.001, 0.1, 0.2, 0.5],
                          'kernel': ['rbf']}])

In [29]:
gsv.best_params_ , gsv.best_score_ 

({'C': 1.5, 'gamma': 0.001, 'kernel': 'rbf'}, 0.819894894894895)

### 5.Final model

In [30]:
clf = SVC(C= 1.5, gamma = 0.001)
clf.fit(x_train , y_train)

y_pred = clf.predict(x_test)

acc = accuracy_score(y_test, y_pred) * 100
print("Accuracy =", acc)

confusion_matrix(y_test, y_pred)

Accuracy = 82.05128205128204


array([[ 13,  26],
       [  2, 115]], dtype=int64)