In [1]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
#!pip install pandas
import joblib
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
!pip install category_encoders
import category_encoders as ce

%matplotlib inline

sns.set(style = "darkgrid")



In [13]:
df = pd.read_csv("/content/drive/MyDrive/train.csv")

In [None]:
df.head()

Unnamed: 0,fire_id,fire_year,fire_number,fire_name,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,fuel_type,initial_action_by,ia_arrival_at_fire_date,ia_access,fire_fighting_start_date,fire_fighting_start_size,bucketing_on_fire,distance_from_water_source,first_bucket_drop_date,ex_fs_date
0,0,2015,PWF002,,1,56.108767,-116.840717,Provincial Land,Power Line Industry,,...,O1b,Industry,,,,,,,,2015-04-12 20:12:00
1,1,2011,RWF034,,1,52.397401,-116.072083,Provincial Land,Recreation,,...,,Public,,,,,,,,2011-05-21 10:52:00
2,2,2018,GWF018,,1,56.041,-119.12165,Private Land,Resident,,...,O1b,Fire Department,,,,,,,,2018-06-09 21:45:00
3,3,2015,HWF081,,2,58.489199,-115.151036,Indian Reservation,Incendiary,,...,O1a,Fire Department,,,,,,,,2015-05-13 19:07:00
4,4,2010,SWF314,,1,55.900885,-115.664957,Indian Reservation,Resident,,...,O1b,Fire Department,,,,,,,,2010-08-10 12:00:00


In [4]:
df.dtypes

fire_id                           int64
fire_year                         int64
fire_number                      object
fire_name                        object
size_class                        int64
fire_location_latitude          float64
fire_location_longitude         float64
fire_origin                      object
general_cause_desc               object
industry_identifier_desc         object
responsible_group_desc           object
activity_class                   object
true_cause                       object
fire_start_date                  object
det_agent                        object
det_agent_type                   object
discovered_date                  object
discovered_size                 float64
reported_date                    object
dispatched_resource              object
dispatch_date                    object
start_for_fire_date              object
assessment_resource              object
assessment_datetime              object
assessment_hectares             float64


In [14]:
columns_to_drop = ['fire_origin','fire_name', 'discovered_size', 'industry_identifier_desc','fire_number','fire_year','distance_from_water_source','ex_fs_date','first_bucket_drop_date','fire_fighting_start_date','ia_arrival_at_fire_date','assessment_datetime','start_for_fire_date','dispatch_date','reported_date','discovered_date','fire_start_date','initial_action_by','wind_direction','fire_position_on_slope','assessment_resource','dispatched_resource','det_agent_type','det_agent','true_cause','activity_class','responsible_group_desc','bucketing_on_fire','fire_fighting_start_size','ia_access']
df.drop(columns=columns_to_drop, inplace=True)
columns_to_one_hot_encode = ["fire_type", "weather_conditions_over_fire", "general_cause_desc","fuel_type"]

# One-hot encode the specified columns
df_encoded = pd.get_dummies(df, columns=columns_to_one_hot_encode)

# Concatenate the one-hot encoded columns to the original DataFrame
df = pd.concat([df, df_encoded], axis=1)

# Drop the original columns that were one-hot encoded
df.drop(columns=columns_to_one_hot_encode, inplace=True)
duplicate_columns = df.columns[df.columns.duplicated()]

# Keep only unique columns
df = df.loc[:, ~df.columns.duplicated()]

print(df.columns)





Index(['fire_id', 'size_class', 'fire_location_latitude',
       'fire_location_longitude', 'assessment_hectares', 'fire_spread_rate',
       'temperature', 'relative_humidity', 'wind_speed', 'fire_type_  ',
       'fire_type_   Surface', 'fire_type_Crown', 'fire_type_Ground',
       'fire_type_Surface', 'weather_conditions_over_fire_CB Dry',
       'weather_conditions_over_fire_CB Wet',
       'weather_conditions_over_fire_Clear',
       'weather_conditions_over_fire_Cloudy',
       'weather_conditions_over_fire_Rainshowers',
       'general_cause_desc_Agriculture Industry',
       'general_cause_desc_Forest Industry', 'general_cause_desc_Government',
       'general_cause_desc_Incendiary', 'general_cause_desc_Lightning',
       'general_cause_desc_Oil & Gas Industry',
       'general_cause_desc_Other Industry',
       'general_cause_desc_Power Line Industry',
       'general_cause_desc_Prescribed Fire', 'general_cause_desc_Railroad',
       'general_cause_desc_Recreation', 'general_c

In [None]:
column_names = df.columns.tolist()
print(column_names)

['fire_id', 'size_class', 'fire_location_latitude', 'fire_location_longitude', 'assessment_hectares', 'fire_spread_rate', 'temperature', 'relative_humidity', 'wind_speed', 'fire_type_  ', 'fire_type_   Surface', 'fire_type_Crown', 'fire_type_Ground', 'fire_type_Surface', 'weather_conditions_over_fire_CB Dry', 'weather_conditions_over_fire_CB Wet', 'weather_conditions_over_fire_Clear', 'weather_conditions_over_fire_Cloudy', 'weather_conditions_over_fire_Rainshowers', 'general_cause_desc_Agriculture Industry', 'general_cause_desc_Forest Industry', 'general_cause_desc_Government', 'general_cause_desc_Incendiary', 'general_cause_desc_Lightning', 'general_cause_desc_Oil & Gas Industry', 'general_cause_desc_Other Industry', 'general_cause_desc_Power Line Industry', 'general_cause_desc_Prescribed Fire', 'general_cause_desc_Railroad', 'general_cause_desc_Recreation', 'general_cause_desc_Resident', 'general_cause_desc_Restart', 'general_cause_desc_Under Investigation', 'general_cause_desc_Undet

In [15]:
imputer = SimpleImputer(strategy='mean')
df_imputed = df.copy()
# selected_features = df_imputed.columns
# df_imputed[selected_features] = imputer.fit_transform(df_imputed[selected_features])


selected_features = df.columns.tolist()
selected_features.remove('size_class')

# Impute missing values for features only
imputer.fit(df_imputed[selected_features])
df_imputed[selected_features] = imputer.transform(df_imputed[selected_features])

# Separate features (X) and target variable (y)
X = df_imputed[selected_features]
y = df_imputed['size_class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))

Accuracy: 0.8774824320195539
Classification Report:
              precision    recall  f1-score   support

           1       0.94      0.97      0.95      2066
           2       0.79      0.86      0.82       930
           3       0.52      0.30      0.38       162
           4       0.33      0.02      0.03        65
           5       0.65      0.34      0.45        50

    accuracy                           0.88      3273
   macro avg       0.65      0.50      0.53      3273
weighted avg       0.86      0.88      0.86      3273



In [17]:
model_filename = '/content/drive/MyDrive/your_model_filename.joblib'
loaded_model = joblib.load(model_filename)

# Load test data
test_data = pd.read_csv("/content/drive/MyDrive/test.csv")

# Select relevant features from test data and one-hot encode
columns_to_drop = ['fire_origin', 'fire_name', 'discovered_size', 'industry_identifier_desc', 'fire_number', 'fire_year', 'distance_from_water_source', 'ex_fs_date', 'first_bucket_drop_date', 'fire_fighting_start_date', 'ia_arrival_at_fire_date', 'assessment_datetime', 'start_for_fire_date', 'dispatch_date', 'reported_date', 'discovered_date', 'fire_start_date', 'initial_action_by', 'wind_direction', 'fire_position_on_slope', 'assessment_resource', 'dispatched_resource', 'det_agent_type', 'det_agent', 'true_cause', 'activity_class', 'responsible_group_desc', 'bucketing_on_fire', 'fire_fighting_start_size', 'ia_access']
columns_to_one_hot_encode = ["fire_type", "weather_conditions_over_fire", "general_cause_desc", "fuel_type"]

# One-hot encode the specified columns
test_data = pd.get_dummies(test_data.drop(columns=columns_to_drop), columns=columns_to_one_hot_encode)

# Ensure test data has all the expected features (fill missing columns with zeros)
expected_features = loaded_model.feature_names_in_
missing_columns = set(expected_features) - set(test_data.columns)
for col in missing_columns:
    test_data[col] = 0

# Reorder columns to match the expected order
test_data = test_data[expected_features]

# Impute missing values using mean imputation
imputer = SimpleImputer(strategy='mean')
test_data_imputed = pd.DataFrame(imputer.fit_transform(test_data), columns=test_data.columns)

# Predict size categories
predicted_size_categories = loaded_model.predict(test_data_imputed)

# Create a submission DataFrame
submission_df = pd.DataFrame({'fire_id': test_data['fire_id'], 'size_class': predicted_size_categories})

# Save the results to a CSV file
submission_filename = '/content/drive/MyDrive/submission_file_version2.csv'
submission_df.to_csv(submission_filename, index=False)

# Print unique predicted values and their counts
print("Unique predicted values:", set(predicted_size_categories))
print("Number of unique predicted values:", len(set(predicted_size_categories)))

Unique predicted values: {1, 2, 3, 4, 5}
Number of unique predicted values: 5
