# Get text derived features

***Objective:*** Create features from the raw text of the Bigfoot sighting reports.

## Required libraries

In [1]:
from datetime import datetime
import os
import pandas as pd
import requests

## Required function

In [2]:
def get_word_status(input_dataframe, search_column_name, keyword_list,
                    new_column_name):
    """Flag each observation by whether its text contains any keyword.

    PURPOSE
      Search every value of one text column for a list of keywords and
      record, per row, whether at least one keyword is present. The result is
      returned as a copy of the supplied dataframe with one extra column; the
      supplied dataframe itself is left untouched.

    INPUTS
      input_dataframe        := The dataframe in which the data resides. Type
                                should be pandas.core.frame.DataFrame.
      search_column_name     := The name of the column in the dataframe to
                                search. Type should be string.
      keyword_list           := The keywords to search for, all in lowercase.
                                Type should be a list containing only
                                strings.
      new_column_name        := The desired name of the new column that
                                contains an indicator string for the presence
                                of any of the keywords in keyword_list. Type
                                should be string.

    OUTPUT
      N/A

    RETURNS
      A new pandas dataframe (same rows, same index) with an additional
      column holding "Yes" where any keyword was found in the observation
      text and "No" otherwise.

    NOTES
      - Matching is a case-insensitive substring test, so a keyword also
        matches inside longer words (e.g. "hunt" matches "hunter").
      - Values that are not strings (e.g. NaN for a missing report) are
        treated as empty text and therefore flagged "No".
    """
    flag_list = []
    for obs in input_dataframe[search_column_name]:
        # Lowercase once per observation rather than once per keyword; a
        # non-string value has no text to search.
        text = obs.lower() if isinstance(obs, str) else ""
        found = any(keyword in text for keyword in keyword_list)
        flag_list.append("Yes" if found else "No")

    # Work on a copy so the caller's frame is not mutated; this also avoids
    # pandas' SettingWithCopyWarning when the input is a filtered slice.
    output_dataframe = input_dataframe.copy()
    output_dataframe[new_column_name] = flag_list
    return output_dataframe

## Bring the data in

In [3]:
# Move the kernel's working directory to the interim data folder so the file
# below can be read by bare name.
# NOTE(review): the path is relative and os.chdir changes process-wide state,
# so re-running this cell without restarting the kernel resolves it against
# the new location rather than the original starting directory.
os.chdir("../data/interim/")

In [4]:
# Load the interim dataset; the bare file name is resolved against the
# current working directory.
dat = pd.read_csv("add_init_raw_feats_20230416_0636.csv")

In [5]:
# Summarize the loaded frame: columns, non-null counts and dtypes.
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4086 entries, 0 to 4085
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   observed                      4053 non-null   object 
 1   county                        4086 non-null   object 
 2   state                         4086 non-null   object 
 3   season                        4086 non-null   object 
 4   latitude                      4086 non-null   float64
 5   longitude                     4086 non-null   float64
 6   date                          4086 non-null   object 
 7   classification                4086 non-null   object 
 8   temperature_high              4086 non-null   float64
 9   temperature_mid               3949 non-null   float64
 10  temperature_low               4086 non-null   float64
 11  dew_point                     3936 non-null   float64
 12  humidity                      3936 non-null   float64
 13  clo

In [6]:
# Remove rows whose `observed` text is missing. The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not raise
# pandas' SettingWithCopyWarning.
dat = dat[dat["observed"].notna()].copy()

## Create some features by looking for the presence of certain words

Normally, one might want to use some NLP here (e.g., topic modelling, $n$-grams, etc.), but we have elected not to, as that would be overkill for the goal of this project: building a basic interactive dashboard on Tableau Public.

Thus, we merely check for the presence of certain keywords that we believe may be interesting.

In [7]:
# Flag reports whose text contains "hunt" (case-insensitive substring match,
# so "hunter" and "hunting" count as well).
dat = get_word_status(
    input_dataframe=dat,
    search_column_name="observed",
    keyword_list=["hunt"],
    new_column_name="hunting",
)

In [8]:
# Flag reports whose text contains "fish" (case-insensitive substring match,
# so "fishing" and "fisherman" count as well).
dat = get_word_status(
    input_dataframe=dat,
    search_column_name="observed",
    keyword_list=["fish"],
    new_column_name="fishing",
)

In [9]:
# Flag reports whose text contains "camp" or "tent".
# NOTE(review): matching is by substring, so "tent" also hits words such as
# "attention" or "content" — confirm this is acceptable for the dashboard.
dat = get_word_status(
    input_dataframe=dat,
    search_column_name="observed",
    keyword_list=["camp", "tent"],
    new_column_name="camping",
)

In [10]:
# Flag reports that mention hiking. "hikeing" is already covered by the
# substring "hike"; it is kept so the keyword list stays as originally chosen.
dat = get_word_status(
    input_dataframe=dat,
    search_column_name="observed",
    keyword_list=["hike", "hiking", "hikeing"],
    new_column_name="hiking",
)

In [11]:
# Flag reports that mention driving. "driveing" is already covered by the
# substring "drive"; it is kept so the keyword list stays as originally chosen.
dat = get_word_status(
    input_dataframe=dat,
    search_column_name="observed",
    keyword_list=["drive", "driving", "driveing"],
    new_column_name="driving",
)

## Write to file to be used for Tableau dashboard

In [12]:
# Switch the working directory to the folder for processed output.
# NOTE(review): this relative path only resolves as intended while the
# working directory is still the interim data folder set earlier in the
# notebook — confirm before re-running cells out of order.
os.chdir("../processed")

In [13]:
# Build a timestamped output name of the form
# processed_bigfoot_data_YYYYMMDD_HHMM.csv. The timestamp is formatted
# directly in its final shape, so no intermediate split/replace steps (or
# their temporaries) are needed.
timestamp = datetime.today().strftime("%Y%m%d_%H%M")
file_name = "processed_bigfoot_data_" + timestamp + ".csv"
# Only file_name is needed by later cells; keep the namespace tidy.
del timestamp

In [14]:
# Show the generated file name as a quick sanity check.
print(file_name)

processed_bigfoot_data_20230416_1539.csv


In [15]:
# Write the data as comma-separated values without the index column (the file
# name is resolved against the current working directory), then drop the
# file-name variable, which is not referenced again in the cells shown.
dat.to_csv(file_name, sep=",", index=False)
del file_name

In [16]:
# Summarize the final frame: columns, non-null counts and dtypes.
dat.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4053 entries, 0 to 4085
Data columns (total 32 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   observed                      4053 non-null   object 
 1   county                        4053 non-null   object 
 2   state                         4053 non-null   object 
 3   season                        4053 non-null   object 
 4   latitude                      4053 non-null   float64
 5   longitude                     4053 non-null   float64
 6   date                          4053 non-null   object 
 7   classification                4053 non-null   object 
 8   temperature_high              4053 non-null   float64
 9   temperature_mid               3918 non-null   float64
 10  temperature_low               4053 non-null   float64
 11  dew_point                     3905 non-null   float64
 12  humidity                      3905 non-null   float64
 13  clo