In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

In [132]:
url = 'https://raw.githubusercontent.com/anyawild/DS3000_Final_Project_Team2/refs/heads/main/euro_c_df_clean.csv'
euro_c_df_clean = pd.read_csv(url)
euro_c_df_clean.head()

Unnamed: 0.1,Unnamed: 0,country_id,latitude,longitude,bright_ti4,bright_ti5,scan,track,acq_date,acq_time,frp,daynight
0,0,AUT,47.34311,9.62378,328.5,290.2,0.53,0.42,2020-03-01,1230,4.4,D
1,1,AUT,47.54527,9.78854,331.0,287.0,0.54,0.42,2020-03-01,1230,5.0,D
2,2,AUT,47.54559,9.78841,329.3,286.5,0.54,0.42,2020-03-01,1230,3.7,D
3,3,AUT,48.27758,14.34202,331.8,276.3,0.59,0.53,2020-03-01,1230,3.8,D
4,4,AUT,48.27502,14.33618,300.4,279.2,0.58,0.7,2020-06-01,218,1.9,N


### Checking for bright_ti4 Foldover

In [133]:
euro_c_df_clean['bright_ti4'].describe()

count    2407.000000
mean      320.309094
std        17.649098
min       207.930000
25%       303.185000
50%       322.110000
75%       335.500000
max       367.000000
Name: bright_ti4, dtype: float64

In [134]:
euro_c_df_clean['bright_ti5'].describe()

count    2407.000000
mean      287.371699
std        11.259759
min       256.900000
25%       280.100000
50%       284.870000
75%       294.225000
max       320.600000
Name: bright_ti5, dtype: float64

When foldover occurs, bright_ti4 will have a significantly lower value than bright_ti5 near the low end of its detection threshold (~208K).

In [135]:
# foldover occurs when bright_ti4 < bright_ti5 and bright_ti4 < 220
foldover_df = euro_c_df_clean[euro_c_df_clean['bright_ti4'] - euro_c_df_clean['bright_ti5'] < 0]
foldover_df = foldover_df[foldover_df['bright_ti4'] < 220]
foldover_df

Unnamed: 0.1,Unnamed: 0,country_id,latitude,longitude,bright_ti4,bright_ti5,scan,track,acq_date,acq_time,frp,daynight
1259,1259,FRA,44.05268,-0.01614,207.93,308.93,0.39,0.36,2022-03-01,1246,61.87,D


# Logistic Regression for Classification

We perform logistic regression to classify fires as standard or high-intensity based on latitude, longitude, scan, track, and days since earliest data acquisition date.

In [136]:
# create binary intensity column (high = 1, standard = 0)
euro_c_df_clean['intensity_binary'] = np.where(
    (euro_c_df_clean['bright_ti4'] > 350) | 
    ((euro_c_df_clean['bright_ti4'] - euro_c_df_clean['bright_ti5'] < 0) & 
     (euro_c_df_clean['bright_ti4'] < 220)),
    1,  # high-intensity: bright_ti4 > 350, OR
        #                 bright_ti4 - bright_ti5 < 0 & bright_ti4 < 220 (foldover)
    0   # standard-intensity: all other observations
)

# create days column
# date -> days since earliest date
date = euro_c_df_clean['acq_date']
x_datetime = pd.to_datetime(date)
reference_date = pd.Timestamp(date.min())       # earliest date, '2020-03-01'
x_timedelta = x_datetime - reference_date
euro_c_df_clean['days'] = x_timedelta.dt.days

In [137]:
intensity_counts = euro_c_df_clean['intensity_binary'].value_counts()

print(f'# of Standard-Intensity Fires: {intensity_counts.get(0, 0)}')
print(f'# of High-Intensity Fires: {intensity_counts.get(1, 0)}')

# of Standard-Intensity Fires: 2342
# of High-Intensity Fires: 65


In [138]:
X = euro_c_df_clean[['latitude', 'longitude', 'scan', 'track', 'days']]     # feature variables
y = euro_c_df_clean['intensity_binary']                                     # classes: standard (0), high (1)

# split the dataset into training and test sets 70/30
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# standardize feature variables
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test) 

# train the logistic regression model
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

# test the model
y_pred = log_reg.predict(X_test_scaled)

# assess model accuracy using confusion/classification matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# TODO: visualize confusion matrix
print('Confusion Matrix:\n', conf_matrix)

Confusion Matrix:
 [[706   0]
 [ 17   0]]


The model consistently predicts only standard-intensity for all test samples. This behavior is likely caused by a class imbalance in the dataset, where the majority class (standard-intensity fires, n=2342) dominates, causing the model to ignore the minority class (high-intensity fires, n=65). This imbalance biases the model toward standard-intensity fires, resulting in a lack of predictive power for high-intensity fires.