In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
## LOAD THE CLEANED ALGERIAN FOREST FIRES CSV (PATH IS RELATIVE TO THE CURRENT WORKING DIRECTORY)
df = pd.read_csv('Algerian_forest_fires_dataset_CLEANED.csv')

In [None]:
## REMOVE THE DATE COLUMNS; THEY ARE NOT USED FURTHER IN THIS NOTEBOOK
df.drop(columns = ['day', 'month', 'year'], inplace = True)
## ENCODE THE LABEL COLUMN AS INTEGERS:
##   any value containing the substring "not fire" -> 0
##   every other value                              -> 1
## A substring match is used, so extra characters around the label do not matter.
df['Classes'] = np.where(
    df['Classes'].str.contains("not fire"),
    0,
    1,
)

In [None]:
## INDEPENDENT FEATURES (x) AND DEPENDENT TARGET (y = FWI)
from sklearn.model_selection import train_test_split

y = df['FWI']
x = df.drop(columns = 'FWI')

## TRAIN TEST SPLIT: 25% OF THE ROWS ARE HELD OUT, WITH A FIXED SEED SO THE SPLIT IS REPRODUCIBLE
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.25,
    random_state = 42,
)

## Feature selection based on correlation
## The pairwise correlation matrix of the training features is computed once and
## reused for the heatmap below (only the training split is inspected here).
corr = x_train.corr()

## Check for multicollinearity: annotated heatmap of the matrix above
plt.figure(figsize = (12, 10))
sns.heatmap(corr, annot = True)
## The trailing semicolon keeps the cell output to the figure only (no object repr).
plt.title('Correlation between training features');

In [None]:
def correlation(dataset, threshold):
    """Return the names of columns that are strongly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``dataset.corr()``. Walking the columns in order, a column is flagged when
    the absolute correlation between it and *any* column positioned before it
    is strictly greater than ``threshold``. The first column can therefore
    never be flagged.

    Parameters
    ----------
    dataset : object exposing ``.corr()`` (e.g. a pandas DataFrame)
    threshold : number compared against the absolute correlation

    Returns
    -------
    set
        Names of the flagged columns.
    """
    corr_matrix = dataset.corr()
    flagged = set()
    for position, name in enumerate(corr_matrix.columns):
        # Only the part of the row left of the diagonal: correlations with earlier columns.
        with_earlier = corr_matrix.iloc[position, :position]
        # NaN correlations compare as False, so they never flag a column.
        if (with_earlier.abs() > threshold).any():
            flagged.add(name)
    return flagged

## THRESHOLD -- set by domain expert
corr_feat = correlation(x_train, 0.85)
## DROP FEATURES WHEN CORRELATION IS MORE THAN 0.85
## The columns are selected from the training split only, and the same columns
## are then removed from the test split so both keep an identical layout.
## The frames are rebound rather than modified with `inplace = True`: x_train and
## x_test were produced by train_test_split, and mutating such derived frames in
## place can trigger pandas' SettingWithCopyWarning
## (NOTE(review): whether the warning appears depends on the pandas / scikit-learn versions).
x_train = x_train.drop(columns = list(corr_feat))
x_test = x_test.drop(columns = list(corr_feat))

<B>FEATURE SCALING OR STANDARDIZATION</B>

In [None]:
## STANDARDIZE THE FEATURES: THE SCALER IS FITTED ON THE TRAINING SPLIT ONLY,
## THEN THE SAME FITTED SCALER TRANSFORMS BOTH SPLITS
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
## BOX PLOT TO UNDERSTAND EFFECT OF STANDARD SCALER
## A single plt.subplots(1, 2) call creates exactly the two panels that are drawn on.
## Combining a bare plt.subplots() with later plt.subplot(1, 2, k) calls would add
## the panels on top of an extra, empty full-figure Axes, which stays visible on
## Matplotlib versions that do not auto-remove overlapping Axes.
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize = (15, 5))

sns.boxplot(data = x_train, ax = ax_before)
ax_before.set_title('x_train Before Scaling')

## x_train_scaled is what scaler.fit_transform(x_train) returned, so it has one
## column per x_train column; re-attach the names so both panels share tick labels.
sns.boxplot(data = pd.DataFrame(x_train_scaled, columns = x_train.columns), ax = ax_after)
ax_after.set_title('x_train After Scaling');

<B>MODEL TRAINING USING DIFFERENT MODELS</B>

LINEAR REGRESSION MODEL

In [None]:
## BASELINE: PLAIN LINEAR REGRESSION FITTED ON THE SCALED TRAINING FEATURES
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

linreg = LinearRegression().fit(x_train_scaled, y_train)
y_pred = linreg.predict(x_test_scaled)

## SCORE THE PREDICTIONS AGAINST THE HELD-OUT TARGET VALUES
score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error : ", mae)
print("R2 Score : ", score)

## ACTUAL (x-axis) VS PREDICTED (y-axis) TARGET VALUES
plt.scatter(y_test, y_pred)

LASSO REGRESSION

In [None]:
## LASSO WITH SCIKIT-LEARN'S DEFAULT HYPERPARAMETERS
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error, r2_score

lasso = Lasso()
lasso.fit(X = x_train_scaled, y = y_train)
y_pred = lasso.predict(X = x_test_scaled)

## HELD-OUT METRICS
mae = mean_absolute_error(y_true = y_test, y_pred = y_pred)
score = r2_score(y_true = y_test, y_pred = y_pred)
print("Mean absolute error : ", mae)
print("R2 Score : ", score)

## ACTUAL VS PREDICTED
plt.scatter(x = y_test, y = y_pred)

LASSO CROSS VALIDATION

In [None]:
## LASSO WITH ITS ALPHA CHOSEN BY CROSS-VALIDATION (cv = 5)
from sklearn.linear_model import LassoCV
## Imported here as well so this cell carries its own metric imports,
## matching the other model cells in this notebook.
from sklearn.metrics import mean_absolute_error, r2_score

lassocv = LassoCV(cv = 5)
lassocv.fit(x_train_scaled, y_train)

## Print the selected alpha: a bare `lassocv.alpha_` in the middle of a cell is
## evaluated but never displayed, because only a cell's last expression is echoed.
print("Selected alpha", lassocv.alpha_)

y_pred = lassocv.predict(x_test_scaled)
## Actual (x-axis) vs predicted (y-axis) target values on the held-out split
plt.scatter(y_test, y_pred)
mae = mean_absolute_error(y_test,y_pred)
score = r2_score(y_test,y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

RIDGE REGRESSION

In [None]:
## RIDGE WITH SCIKIT-LEARN'S DEFAULT HYPERPARAMETERS
## (`ridge` is the estimator that the final cell of this notebook pickles)
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge

ridge = Ridge().fit(x_train_scaled, y_train)
y_pred = ridge.predict(x_test_scaled)

## HELD-OUT METRICS
score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error : ", mae)
print("R2 Score : ", score)

## ACTUAL VS PREDICTED
plt.scatter(y_test, y_pred)

RIDGE CROSS VALIDATION

In [None]:
## RIDGE WITH ITS ALPHA CHOSEN BY CROSS-VALIDATION (cv = 5)
## mean_absolute_error and r2_score come from the imports of the earlier model cells.
from sklearn.linear_model import RidgeCV

ridgecv = RidgeCV(cv = 5).fit(x_train_scaled, y_train)
y_pred = ridgecv.predict(x_test_scaled)

## ACTUAL VS PREDICTED
plt.scatter(y_test, y_pred)

## HELD-OUT METRICS
score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

ELASTIC NET REGRESSION

In [None]:
## ELASTIC NET WITH SCIKIT-LEARN'S DEFAULT HYPERPARAMETERS
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, r2_score

elastic = ElasticNet()
elastic.fit(X = x_train_scaled, y = y_train)
y_pred = elastic.predict(X = x_test_scaled)

## HELD-OUT METRICS
mae = mean_absolute_error(y_true = y_test, y_pred = y_pred)
score = r2_score(y_true = y_test, y_pred = y_pred)
print("Mean absolute error : ", mae)
print("R2 Score : ", score)

## ACTUAL VS PREDICTED
plt.scatter(x = y_test, y = y_pred)

ELASTIC NET CROSS VALIDATION

In [None]:
## ELASTIC NET WITH ITS HYPERPARAMETERS CHOSEN BY CROSS-VALIDATION (cv = 5)
## mean_absolute_error and r2_score come from the imports of the earlier model cells.
from sklearn.linear_model import ElasticNetCV

elasticcv = ElasticNetCV(cv = 5).fit(x_train_scaled, y_train)
y_pred = elasticcv.predict(x_test_scaled)

## ACTUAL VS PREDICTED
plt.scatter(y_test, y_pred)

## HELD-OUT METRICS
score = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print("Mean absolute error", mae)
print("R2 Score", score)

In [19]:
## Pickle the 2 models: the fitted scaler and the `ridge` estimator
import pickle

## Each file is opened in a `with` block so it is flushed and closed as soon as
## the dump finishes, even if pickle.dump raises.
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)
with open("ridge.pkl", "wb") as ridge_file:
    pickle.dump(ridge, ridge_file)