# import

In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline
plt.style.use('ggplot')
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import pickle
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime, date
from numpy.core._exceptions import UFuncTypeError
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from scipy import sparse
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from collections import Counter
from scipy.spatial import distance_matrix,distance

# Notebook-wide matplotlib defaults: font size 14 and an (18, 8) default figure size.
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (18,8)

# Functions

In [None]:
def clean_data(df,features_to_delete):
  """
  Returns a new dataframe without the columns named in features_to_delete.

  Names that are not columns of df are skipped silently; df itself is left untouched.
  """
  remaining = df.drop(columns=features_to_delete, errors='ignore')
  return remaining

A function to reduce the cardinality of a given column.

Returns the column with its most frequent categories kept as-is (enough of them to cumulatively cover the `threshold` fraction of the data) and every remaining category replaced by 'Other'.

In [None]:

def cumulatively_categorise(column,threshold=0.75,return_categories_list=True,other_name='Other'):
  """
  Reduces the cardinality of a categorical column.

  The most frequent categories are kept, in descending order of frequency, until
  together they cover at least `threshold` of the rows; every other value is
  replaced by `other_name`.

  column                 - pandas Series holding the categorical values
  threshold              - fraction of the rows the kept categories must cover
  return_categories_list - when True, also return the list of kept categories
  other_name             - replacement label for the values that are not kept

  Returns (new_column, categories_list) when return_categories_list is True,
  otherwise only new_column. categories_list always ends with other_name.
  """
  # Number of rows the kept categories have to cover
  threshold_value = int(threshold*len(column))
  categories_list = []
  covered = 0

  # most_common() already yields (category, frequency) pairs sorted by descending
  # frequency, so the frequency is used directly instead of being looked up again
  for category, frequency in Counter(column).most_common():
    covered += frequency
    categories_list.append(category)
    if covered >= threshold_value:
      break
  categories_list.append(other_name)

  # A set gives constant-time membership tests while the column is mapped
  kept = set(categories_list)
  new_column = column.apply(lambda x: x if x in kept else other_name)

  if return_categories_list:
    return new_column, categories_list
  return new_column

## convert to date_time

In [None]:
def fix_julian_date(df, col):
    """
    Replaces, in place, a column of Julian day numbers with datetimes.

    df  - dataframe that owns the column
    col - name of the column to convert
    """
    # Julian day number of the Unix epoch; subtracting it leaves days since the epoch
    unix_epoch_as_julian = pd.to_datetime(0, unit='s').to_julian_date()
    days_since_epoch = df[col] - unix_epoch_as_julian
    df[col] = pd.to_datetime(days_since_epoch, unit='D')

In [None]:
def adding_dates_data(df, features_to_delete):
  """
  Converts the date/time columns of df and adds week-of-year and month features.

  DISCOVERY_DATE and CONT_DATE are converted from Julian day numbers to datetimes
  unless they already hold datetimes (each column is checked on its own).
  Adds DISCOVERY_WOY, CONT_WOY, DISCOVERY_MONTH and CONT_MONTH, and tries to turn
  the 'HHMM' values of CONT_TIME / DISCOVERY_TIME into datetime.time objects.

  df                 - dataframe to update (modified in place)
  features_to_delete - not used by this function

  Returns the same dataframe object.
  """
  already_converted = False
  for date_col in ('DISCOVERY_DATE', 'CONT_DATE'):
    # An explicit dtype check replaces catching a private numpy exception
    if pd.api.types.is_datetime64_any_dtype(df[date_col]):
      already_converted = True
    else:
      fix_julian_date(df, date_col)
  if already_converted:
    print('dates allready converted')

  df['DISCOVERY_WOY'] = _week_of_year(df['DISCOVERY_DATE'])
  df['CONT_WOY'] = _week_of_year(df['CONT_DATE'])
  df['DISCOVERY_MONTH'] = df['DISCOVERY_DATE'].dt.month
  df['CONT_MONTH'] = df['CONT_DATE'].dt.month
  try:
    # Falsy values (None, '') are mapped to None instead of being parsed
    df['CONT_TIME'] = df['CONT_TIME'].apply(lambda x: pd.to_datetime(x, format='%H%M').time() if x else None)
    df['DISCOVERY_TIME'] = df['DISCOVERY_TIME'].apply(lambda x: pd.to_datetime(x, format='%H%M').time() if x else None)
  except ValueError as e:
    # NOTE(review): a failure leaves the affected column(s) unconverted - confirm this best-effort is intended
    print('conversion of CONT_TIME and/or DISCOVERY_TIME - failed')

  return df


def _week_of_year(dates):
  """ISO week number of a datetime Series: int64, or float64 with NaN when dates are missing."""
  # Series.dt.week was deprecated and later removed; isocalendar().week is its replacement
  weeks = dates.dt.isocalendar().week
  if weeks.isna().any():
    return weeks.astype('float64')
  return weeks.astype('int64')

## Engineering Functions

In [None]:
def features_to_one_hot(df, one_hot_features, ohe, is_test):
  """
  Switches the given features to a one hot representation and drops the originals.

  df               - dataframe; all columns outside one_hot_features must be convertible to float
  one_hot_features - names of the columns to encode
  ohe              - encoder exposing fit_transform / transform / categories_
  is_test          - False: fit the encoder on df; True: reuse the fitted encoder

  Returns (matrix, column_names): a scipy sparse matrix made of the remaining
  columns followed by the one hot columns, and the matching list of names.
  """
  encode = ohe.transform if is_test else ohe.fit_transform
  ohe_arr = encode(df[one_hot_features])

  # One label per encoder category. Falsy categories (e.g. an empty string) are kept
  # so the labels stay aligned with the encoded columns, and str formatting accepts
  # non-string categories.
  labels = [
      f"{feature}_{category}"
      for feature, categories in zip(one_hot_features, ohe.categories_)
      for category in np.ravel(categories)
  ]

  df_no_onehot = df.drop(columns=one_hot_features)
  all_cols = list(df_no_onehot.columns) + labels

  df_as_arr = np.array(df_no_onehot,dtype=float)
  sparse_df = sparse.csr_matrix(df_as_arr)
  final_df = sparse.hstack([sparse_df,ohe_arr])
  return final_df, all_cols


In [None]:
def partial_one_hot(df, column_name, encode_values, drop=False):
  """
  Adds a 0/1 indicator column to df (in place) for each value in encode_values.

  Every indicator is named '<column_name>_<value>'. With drop=True the source
  column is removed from df afterwards. Returns the list of new column names.
  """
  column_names = []

  for value in encode_values:
    indicator_name = f"{column_name}_{value}"
    df[indicator_name] = np.where(df[column_name]==value,1,0)
    column_names.append(indicator_name)

  if drop:
    df.drop(columns=[column_name], inplace=True)

  return column_names

### Adding features
Feature engineering - add features to the data frame.

In [None]:
def calc_duration(row):
  """
  Time between discovery and containment for one row.

  Returns None when any of CONT_DATE, CONT_TIME, DISCOVERY_TIME or DISCOVERY_DATE
  is None; otherwise the containment datetime minus the discovery datetime.
  """
  needed = ('CONT_DATE', 'CONT_TIME', 'DISCOVERY_TIME', 'DISCOVERY_DATE')
  if any(row[key] is None for key in needed):
    return None

  contained_at = datetime.combine(row['CONT_DATE'], row['CONT_TIME'])
  discovered_at = datetime.combine(row['DISCOVERY_DATE'], row['DISCOVERY_TIME'])
  return contained_at - discovered_at

In [None]:
def is_holiday(df, date_column):
  """
  Adds a boolean column 'isHoliday_<date_column>' to df (in place).

  A row is True when its date is one of the US federal holidays found between
  the earliest and the latest date of the column.
  """
  dates = df[date_column]
  federal_holidays = calendar().holidays(start=dates.min(), end=dates.max())
  df[f'isHoliday_{date_column}'] = dates.isin(federal_holidays)

In [None]:
def cyclical_transform_datetime_feature(df, datetime_feature):
    """
    Adds cyclical (sin/cos) encodings of a datetime column to df, in place.

    New columns, all prefixed with the feature name:
      _time_in_seconds              - seconds since the Unix epoch
      _day_sin_time, _day_cos_time  - position inside a 24 hour cycle
      _sin_time_week, _cos_time_week - position inside a 7 day cycle

    Missing dates (NaT) give NaN in every new column.
    """
    seconds_in_day = 24*60*60
    seconds_in_week = 7 * seconds_in_day

    timestamps = pd.to_datetime(df[datetime_feature])
    missing = timestamps.isna().values
    # Normalise to nanosecond resolution first so the integer view has known units
    epoch_ns = timestamps.values.astype('datetime64[ns]').astype(np.int64)
    # Nanoseconds -> seconds. The periods below are expressed in seconds, so the
    # earlier division by 10**6 (milliseconds) made them off by a factor of 1000.
    epoch_seconds = epoch_ns // 10**9
    if missing.any():
        # NaT is stored as the minimum int64 value; report it as missing instead
        epoch_seconds = np.where(missing, np.nan, epoch_seconds.astype(np.float64))

    seconds_column = datetime_feature + '_time_in_seconds'
    df[seconds_column] = epoch_seconds

    day_angle = 2*np.pi*df[seconds_column]/seconds_in_day
    week_angle = 2*np.pi*df[seconds_column]/seconds_in_week
    df[datetime_feature + '_day_sin_time'] = np.sin(day_angle)
    df[datetime_feature + '_day_cos_time'] = np.cos(day_angle)
    df[datetime_feature + '_sin_time_week'] = np.sin(week_angle)
    df[datetime_feature + '_cos_time_week'] = np.cos(week_angle)

In [None]:
def cyclical_transform_time_feature(df, hour_feature_name, features_to_delete):
  """
  Adds a cyclical (sin/cos over 24 hours) encoding of a time-of-day column.

  df                 - dataframe to update in place; df[hour_feature_name] holds
                       datetime.time objects or missing values
  hour_feature_name  - name of the time-of-day column
  features_to_delete - list that gets hour_feature_name appended (mutated in place)

  Adds '<hour_feature_name>_hr_sin' and '<hour_feature_name>_hr_cos'; rows with a
  missing time get NaN in both.
  """
  # Read from the dataframe that was passed in (not a notebook-level global) and
  # keep the intermediate in a local variable rather than as an attribute on df
  hours = df[hour_feature_name].apply(
      lambda t: np.nan if pd.isna(t) else t.hour + t.minute/60
  ).astype(float)

  angle = hours*(2.*np.pi/24)
  df[f'{hour_feature_name}_hr_sin'] = np.sin(angle)
  df[f'{hour_feature_name}_hr_cos'] = np.cos(angle)

  features_to_delete.append(hour_feature_name)

In [None]:
def increased_poweline(row):
  """Returns 1 for rows discovered after 2009, or in 2009 from week 4 onwards; otherwise 0."""
  year = row['FIRE_YEAR']
  after_2009 = year > 2009
  # DISCOVERY_WOY is only read for rows whose year is exactly 2009
  in_2009_from_week_4 = year == 2009 and row['DISCOVERY_WOY'] >= 4
  return 1 if (after_2009 or in_2009_from_week_4) else 0

def missing_values_decrease(row):
  """
  Returns 1 when the row falls inside one of the flagged periods, otherwise 0.

  Flagged periods:
    - years 1997-2002, plus 1996 from week 41 onwards
    - years 2013-2014
    - 2012 up to week 9, and 2015 up to week 33
  """
  year = row['FIRE_YEAR']

  if 1996 < year < 2003 or (year == 1996 and row['DISCOVERY_WOY'] >= 41):
    return 1

  if 2012 < year < 2015:
    return 1

  # The two boundary years are alternatives: a row cannot be in 2012 and in 2015 at
  # once, so joining them with `and` made this branch unreachable.
  # NOTE(review): 2012 keeps the original `<= 9` test; if it marks the start of the
  # period that begins in 2013 it may have been meant as `>= 9` - confirm.
  if (year == 2012 and row['DISCOVERY_WOY'] <= 9) or (year == 2015 and row['DISCOVERY_WOY'] <= 33):
    return 1

  return 0

def decrease_rail(row):
  """Returns 1 for rows from 1992 up to (but not including) week 41 of 2002; otherwise 0."""
  year = row['FIRE_YEAR']
  if 1992 <= year < 2002:
    return 1
  # 2002 only counts before week 41
  return int(year == 2002 and row['DISCOVERY_WOY'] < 41)

In [None]:
def add_features(df, good_has_features, features_to_delete):
  """
  Adds the engineered features to the dataframe and returns the result.

  Expects DISCOVERY_DATE / CONT_DATE to already be datetime columns and
  DISCOVERY_WOY to exist (both are read below). The new columns are written onto
  the passed dataframe; the final drop of good_has_features returns a new frame.

  df                 - dataframe to extend
  good_has_features  - columns that get a HAS_<feature> column and are then dropped
  features_to_delete - only referenced by the commented-out time transforms below
  """
  # Time
  df['WEEKDAY'] =  df['DISCOVERY_DATE'].dt.weekday  # add day of week column

  # Cyclical (sin/cos) encodings of both date columns
  cyclical_transform_datetime_feature(df, 'DISCOVERY_DATE')
  cyclical_transform_datetime_feature(df, 'CONT_DATE')
  #cyclical_transform_time_feature(df, 'CONT_TIME', features_to_delete)
  #cyclical_transform_time_feature(df, 'DISCOVERY_TIME', features_to_delete)

  # 0/1 flag for discovery weeks 24-26 (strictly between 23 and 27)
  df['Area_of_independent'] = ((df['DISCOVERY_WOY'] > 23) & (df['DISCOVERY_WOY'] < 27)).astype(int)
  # Row-wise 0/1 period flags computed from FIRE_YEAR / DISCOVERY_WOY
  df['increased_poweline_cases'] = df.apply(increased_poweline, axis=1)
  df['decreased_missing_values'] = df.apply(missing_values_decrease, axis=1)
  df['decreased_railroad'] = df.apply(decrease_rail, axis=1)
  # 0/1 flag for discovery weeks 19-41 (inclusive)
  df['increased_lighting_cases'] = df['DISCOVERY_WOY'].apply(lambda x:1 if x >= 19 and x<= 41 else 0)


  # Discovery-to-containment duration per row, cast to second resolution
  df['DURATION'] = df.apply(calc_duration,axis=1).astype('timedelta64[s]')

  # scale the DURATION
  # min-max scaling that uses the min/max of the dataframe passed in
  df['DURATION'] = (df['DURATION']-df['DURATION'].min())/(df['DURATION'].max()-df['DURATION'].min())

  # Adding holidays
  is_holiday(df, "DISCOVERY_DATE")
  is_holiday(df, "CONT_DATE")

  # "has" features
  # one HAS_<feature> column per entry, produced by add_has_feature
  for feature in good_has_features:
    df = add_has_feature(df, feature)
  # # remove original features
  df = df.drop(good_has_features, axis=1, errors='ignore')


  return df

A function that adds a `HAS_<feature>` indicator column for a given feature.

In [None]:
def add_has_feature(df, feature):
  """
  Adds a boolean column 'HAS_<feature>' to df (in place) and returns df.

  The new column is True where df[feature] holds a value and False where it is
  missing. The original column is left in the dataframe.
  """
  # notna(), not isna(): "HAS" has to be True when a value is present
  df[f"HAS_{feature}"] = df[feature].notna()
  return df

In [None]:
def reducing_cardinality(df,feature_to_reduce_cardinality,thershold):
  """
  Reduces the cardinality of every listed feature of df (in place).

  Each feature is passed through cumulatively_categorise with `thershold` and the
  replacement label 'Other_<feature>'.

  Returns (df, categories_list_dict), where categories_list_dict maps each
  feature name to its list of kept categories.
  """
  categories_list_dict = {}
  for feature in feature_to_reduce_cardinality:
    df[feature], categories_list_dict[feature] = cumulatively_categorise(
        df[feature], thershold, other_name=f'Other_{feature}')

  return df, categories_list_dict

In [None]:
def reduce_cardinality_test(df,feature_to_reduce_cardinality,categories_list_dict):
  """
  Applies previously computed category lists to df (in place) and returns df.

  For every listed feature, values that are not in categories_list_dict[feature]
  are replaced by 'Other_<feature>'.
  """
  for feature in feature_to_reduce_cardinality:
    known_categories = categories_list_dict[feature]
    fallback = f'Other_{feature}'

    def keep_or_other(value, known=known_categories, other=fallback):
      return value if value in known else other

    df[feature] = df[feature].apply(keep_or_other)
  return df

# Final preprocess function

In [None]:
def preprocess(df, features_to_delete, good_has_features, one_hot_features,feature_to_reduce_cardinality,ohe,is_test, categories_list_dict=None):
  """
  Returns the df after preprocessing.

  Steps: date conversion, feature engineering, column clean-up, cardinality
  reduction and one hot encoding. The caller's dataframe is not modified.

  df                            - raw dataframe; training data must contain 'STAT_CAUSE_DESCR'
  features_to_delete            - columns dropped after feature engineering
  good_has_features             - columns replaced by HAS_<feature> columns
  one_hot_features              - columns to one hot encode
  feature_to_reduce_cardinality - columns whose rare categories are merged
  ohe                           - one hot encoder (fitted here when is_test is False)
  is_test                       - False for training data, True for test data
  categories_list_dict          - kept categories per feature; required in test mode
                                  when feature_to_reduce_cardinality is not empty

  Returns (matrix, Y) in test mode (Y is None when the label column is absent) and
  (matrix, Y, feature_labels, categories_list_dict) in training mode.

  Raises ValueError in test mode when categories_list_dict is needed but missing.
  """
  label_column = 'STAT_CAUSE_DESCR'
  if is_test and feature_to_reduce_cardinality and categories_list_dict is None:
    raise ValueError("categories_list_dict is required when is_test=True; preprocess the training data first")

  print("Starting data processing\n\t-----")
  # Work on a private copy: the steps below rewrite and drop columns, and doing that
  # on the caller's frame removed its label column after the first call.
  df = df.copy()

  Y = None
  if is_test and label_column in df.columns:
    Y = df[label_column]
    df = df.drop(columns=[label_column])
  print("Starting date processing")

  df = adding_dates_data(df, features_to_delete)

  print("Finished date processing")


  print("Starting adding features")
  df = add_features(df, good_has_features, features_to_delete)
  print("Finished adding feature")


  print("Starting cleaning data")
  df = clean_data(df, features_to_delete)
  print("Finished cleaning data")

  # Merge rare categories: learned from the training data, replayed on the test data
  print("Starting reducing cardinality")
  if not is_test:
    df, categories_list_dict = reducing_cardinality(df,feature_to_reduce_cardinality,thershold=0.8)
  else:
    df = reduce_cardinality_test(df,feature_to_reduce_cardinality,categories_list_dict)
  print("Finished reducing cardinality")
  if not is_test:
    Y = df[label_column]
    df = df.drop(columns=[label_column])
  # Adding one hot
  print("Starting adding one hot")
  df_as_sparse , feature_labels = features_to_one_hot(df, one_hot_features, ohe, is_test)
  print("Finished adding one hot")
  print("\t-----\nFinished data processing")
  if is_test:
    return df_as_sparse,Y
  return df_as_sparse, Y, feature_labels, categories_list_dict

# Model Class 

In [None]:
class FiresPreprocessor:
  """
  Holds the preprocessing configuration and the state fitted on the training data
  (the one hot encoder and the kept categories per feature).
  """

  def __init__(self):
    # The encoder's default output is already sparse, so the `sparse` keyword
    # (renamed in newer scikit-learn releases) is not passed.
    self.ohe = OneHotEncoder(categories='auto', handle_unknown='ignore')

    # Filled by fit_transform; required by transform
    self.categories_list_dict = None

    self.good_has_features = ['FIRE_NAME','FIRE_CODE']

    self.feature_to_reduce_cardinality = ['NWCG_REPORTING_UNIT_NAME','SOURCE_SYSTEM', "FIPS_NAME", 'SOURCE_REPORTING_UNIT']

    self.one_hot_features = self.feature_to_reduce_cardinality + ['OWNER_DESCR', 'STATE', 'NWCG_REPORTING_AGENCY']

    self.features_to_delete = ["FOD_ID", "FPA_ID", "OBJECTID", "LOCAL_FIRE_REPORT_ID",
                            "LOCAL_INCIDENT_ID", "ICS_209_INCIDENT_NUMBER", "ICS_209_NAME", "MTBS_ID", 'NWCG_REPORTING_UNIT_ID',
                            'FIRE_SIZE_CLASS', "OWNER_CODE", 'COMPLEX_NAME', 'FIPS_CODE',
                            'COUNTY', 'Shape', 'SOURCE_SYSTEM_TYPE', 'CONT_DOY', 'DISCOVERY_DOY', "MTBS_FIRE_NAME",'SOURCE_REPORTING_UNIT_NAME',
                            'DISCOVERY_DATE', 'CONT_DATE','DISCOVERY_TIME', 'CONT_TIME','STAT_CAUSE_CODE']


  def fit_transform(self, data):
    """
    Preprocesses the training data, fitting the encoder and the category lists.

    Returns (X_train, y_train).
    """
    X_train, y_train, feature_labels, categories_list_dict = preprocess(data,
                                                                        self.features_to_delete,
                                                                        self.good_has_features,
                                                                        self.one_hot_features,
                                                                        self.feature_to_reduce_cardinality,
                                                                        self.ohe,is_test=False)
    self.categories_list_dict = categories_list_dict
    return X_train, y_train

  def transform(self,test_data):
    """
    Preprocesses test data with the state learned by fit_transform.

    Returns (X_test, y_test).
    Raises RuntimeError when fit_transform has not been called yet.
    """
    if self.categories_list_dict is None:
      raise RuntimeError("FiresPreprocessor.transform called before fit_transform")
    X_test, y_test = preprocess(test_data,
                                self.features_to_delete,
                                self.good_has_features,
                                self.one_hot_features,
                                self.feature_to_reduce_cardinality,
                                self.ohe,True, self.categories_list_dict)
    return X_test, y_test


In [None]:
class FiresModel:
  """
  Preprocessing plus classifier in one object.

  model_type selects the classifier: 'xgb' (XGBClassifier built from
  self.parameters) or 'clf' (RandomForestClassifier with default settings).
  """

  SUPPORTED_MODEL_TYPES = ('xgb', 'clf')

  def __init__(self, model_type):
    # Fail early: an unknown type used to leave self.model undefined until fit()
    if model_type not in self.SUPPORTED_MODEL_TYPES:
      raise ValueError(f"unknown model_type {model_type!r}; expected one of {self.SUPPORTED_MODEL_TYPES}")

    self.parameters = { 'colsample_bytree': 0.9911191130678206,
                       'eta': 0.49685865594047707,
                       'gamma': 1.4308614729557847,
                       'max_depth': 6,
                       'min_child_weight': 4,
                       'n_estimators': 196,
                       'reg_alpha': 40,
                       'reg_lambda': 0.91095884549275
                       }
    self.Preprocessor = FiresPreprocessor()
    self.model_type = model_type
    # Set by fit(); None until the model has been trained
    self.train_score = None
    if self.model_type == 'xgb':
      self.model = XGBClassifier(**self.parameters)
    else:
      self.model = RandomForestClassifier()

  def fit(self,data):
    """Preprocesses data, trains the model and stores the training AUC in self.train_score."""
    X_train, y_train = self.Preprocessor.fit_transform(data)

    print("begin trainning")
    self.model.fit(X_train, y_train)
    print("finish trainning")

    y_hat = self.model.predict_proba(X_train)
    self.train_score = roc_auc_score(y_train, y_hat, average='macro', multi_class='ovr')

  def get_train_score(self):
    """Returns the training AUC computed by fit() (None before fit)."""
    return self.train_score

  def predict(self,test_data):
    """Returns the predicted class probabilities for test_data."""
    X_test, y_test = self.Preprocessor.transform(test_data)
    y_hat = self.model.predict_proba(X_test)
    return y_hat

  def score(self,data):
    """
    Returns the macro one-vs-rest ROC AUC of the model on data.

    Raises ValueError when data has no 'STAT_CAUSE_DESCR' column.
    """
    X_test, y_test = self.Preprocessor.transform(data)
    if y_test is None:
      raise ValueError("score() needs the 'STAT_CAUSE_DESCR' label column in the data")
    y_hat = self.model.predict_proba(X_test)
    score = roc_auc_score(y_test, y_hat,  average='macro', multi_class='ovr')
    return score



  





# Your code goes Here:

## Our way to load the data

In [None]:
# Mount drive in google colab
# The google.colab module is imported here, so this cell needs the Colab runtime.
# The data-loading cell reads its pickles from a folder under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds the pickled raw dataframes
drive_project_dir = '/content/drive/MyDrive/APLDS/'

df_train_pickle_fn, df_test_pickle_fn = 'df_train_raw.pickle', 'df_test_raw.pickle'
df_train_pickle_path_raw = f'{drive_project_dir}{df_train_pickle_fn}'
df_test_pickle_path = f'{drive_project_dir}{df_test_pickle_fn}'

# NOTE(review): pickle.load can execute arbitrary code - only load files you trust
with open(df_train_pickle_path_raw, 'rb') as f:
    df_train = pickle.load(f)

with open(df_test_pickle_path, 'rb') as f:
    df_test = pickle.load(f)

## Use example 

In [None]:
# Use example
# We assume we have df_train and df_test.
# Our label is the column 'STAT_CAUSE_DESCR' and we assume it's inside the DataFrames.
model = FiresModel(model_type='xgb')
# Train on a sub-sample of at most 150000 rows. The fixed random_state makes the
# sample the same on every run; capping n keeps smaller frames from raising.
train_sample = df_train.sample(n=min(150000, len(df_train)), random_state=42)
model.fit(train_sample)
print(f'train AUC score: {model.train_score}\n')
test_score = model.score(df_test)
print(f'test AUC score: {test_score}')



Starting data processing
	-----
Starting date processing
Finished date processing
Starting adding features
Finished adding feature
Starting cleaning data
Finished cleaning data
Starting reducing cardinality
Finished reducing cardinality
Starting adding one hot
Finished adding one hot
	-----
Finished data processing
begin trainning
finish trainning
train AUC score: 0.8829255325197909
Starting data processing
	-----
Starting date processing
Finished date processing
Starting adding features
Finished adding feature
Starting cleaning data
Finished cleaning data
Starting reducing cardinality
Finished reducing cardinality
Starting adding one hot
Finished adding one hot
	-----
Finished data processing
test AUC score: 0.8677210401365572
