In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.preprocessing as pre
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.inspection import DecisionBoundaryDisplay

## Prepare the Data

In [None]:
# Load the forest-fires dataset; the path is relative to the notebook's working directory
csv_path = "Data/forestfires.csv"
df = pd.read_csv(csv_path)
print(df.shape)
df

In [None]:
# Remove every row that contains at least one missing value.
# The frame is modified in place, so `df` keeps referring to the same object.
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Mean of the 'area' column over all rows of df
mean_area = df['area'].mean()
mean_area

In [None]:
# Median of the 'area' column over all rows of df
median_area = df['area'].median()
median_area

## Charts 

In [None]:
# Build monthDF: a copy of df whose 'month' and 'day' abbreviations are replaced
# by integers (jan=1 ... dec=12, sun=1 ... sat=7) so they can be used as numeric plot axes
month_names = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
               'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
day_names = ['sun', 'mon', 'tue', 'wed', 'thu', 'fri', 'sat']
month_dict = {name: number for number, name in enumerate(month_names, start=1)}
day_dict = {name: number for number, name in enumerate(day_names, start=1)}

# assign() returns a new frame, so df itself is left untouched
monthDF = df.assign(
    month=df['month'].map(month_dict),
    day=df['day'].map(day_dict),
)
monthDF

In [None]:
# Scatter of the FFMC values against the numeric month
fig, ax = plt.subplots()
ax.scatter(monthDF["month"], monthDF["FFMC"], s=3)
ax.set_xlabel('Month')
ax.set_ylabel('FFMC')
ax.set_title('FFMC per Month')
plt.show()

In [None]:
# Total burned area (ha) per month.
# Passing the raw rows to plt.bar draws one bar per row at the same x position,
# so the bars overlap and only the largest single fire of each month is visible.
# Summing first gives one bar per month whose height is the monthly total.
area_by_month = monthDF.groupby('month')['area'].sum()
plt.bar(area_by_month.index, area_by_month.values)
plt.xticks(range(1, 13))  # months are encoded 1-12; label every one of them
plt.xlabel('Month')
plt.ylabel('Burned Area in ha')
plt.title('Total Burned Area per Month')
plt.show()

In [None]:
# Total burned area (ha) per day of the week.
# Passing the raw rows to plt.bar draws one bar per row at the same x position,
# so the bars overlap and only the largest single fire of each day is visible.
# Summing first gives one bar per weekday whose height is that day's total.
area_by_day = monthDF.groupby('day')['area'].sum()
plt.bar(area_by_day.index, area_by_day.values)
plt.xticks(range(1, 8))  # days are encoded 1 (sun) to 7 (sat); label every one of them
plt.xlabel('Day of Week')
plt.ylabel('Burned Area in ha')
plt.title('Total Burned Area per Day of Week')
plt.show()

In [None]:
# Count the rows (non-null 'X' values) in each month and draw the counts as a line
by_month = monthDF.groupby('month')['X'].count()
fig, ax = plt.subplots()
ax.plot(by_month.index, by_month.values)
ax.set_xlabel('Month')
ax.set_ylabel('Number of Occurrences')
ax.set_title('Fire Occurrences Over Time')
plt.show()

In [None]:
# Count the rows (non-null 'X' values) for each day of the week and draw the counts as a line
by_day = monthDF.groupby('day')['X'].count()
fig, ax = plt.subplots()
ax.plot(by_day.index, by_day.values)
ax.set_xlabel('Day')
ax.set_ylabel('Number of Occurrences')
ax.set_title('Fire Occurrences Over Time')
plt.show()

In [None]:
# Label every row as a small or large fire relative to a 12.8 cut-off on 'area'
# (values up to and including 12.8 are 'Small Fire', anything above is 'Large Fire')
size_bins = [-np.inf, 12.8, np.inf]
size_labels = ['Small Fire', 'Large Fire']
df['area_cat'] = pd.cut(df['area'], bins=size_bins, labels=size_labels)

# Bar chart of how many rows fall into each category
category_counts = df['area_cat'].value_counts()
category_counts.plot.bar()

# Title and axis labels
plt.title('Burned Area Above and Below Mean (12.8)')
plt.xlabel('Area category')
plt.ylabel('Count')
plt.show()

## Apply Logistic Regression

In [None]:
# One-hot encode the categorical 'month' and 'day' columns.
# get_dummies removes the source columns, so running this cell a second time would
# raise a KeyError; only encode the columns that are still present so the cell
# stays safe to re-run.
categorical_columns = [col for col in ('month', 'day') if col in df.columns]
if categorical_columns:
    df = pd.get_dummies(df, columns=categorical_columns)

In [None]:
# Binary target: 1 when the burned area exceeds the threshold, otherwise 0
threshold = 12.8  # mean of area column (the median, 0.52, is an alternative cut-off)
df['fire'] = df['area'].gt(threshold).astype(int)

# Features (X) are the wind, temp, RH and rain columns; the target (Y) is the 'fire' flag
# (a single-feature alternative is X = df[['FFMC']])
feature_columns = ['wind', 'temp', 'RH', 'rain']
X = df[feature_columns]
Y = df['fire']

In [None]:
# Standardize the features with a StandardScaler.
# StandardScaler is not imported by name in this notebook; it is reached through the
# `sklearn.preprocessing as pre` alias from the imports cell (the bare name raised NameError).
# NOTE(review): the scaler is fitted on all rows, including the ones that later land in
# the test split, so test-set statistics leak into the features — consider fitting it on
# the training rows only.
scaler = pre.StandardScaler()
X = scaler.fit_transform(X)

# Fit a logistic regression model to the standardized data
model = LogisticRegression()
model.fit(X, Y)

In [None]:
# Hold out 20% of the rows for testing (fixed seed for a repeatable split),
# fit the logistic regression on the training rows and show its score on the test rows
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train).score(X_test, y_test)

## Apply SVM

In [None]:
# Support-vector classifier with a degree-1 polynomial kernel, trained on the training split
svc = SVC(degree=1, kernel='poly')
svc.fit(X=X_train, y=y_train)

In [None]:
# Score of the fitted SVC on the held-out test split
svm_test_score = svc.score(X_test, y_test)
svm_test_score