# Data Preparation

In [2]:
# importing the libraries
from utils.DataIngestion import get_config, print_config, get_path, ingest_data
from utils.HelperFunctions import negative_check, not_in_list, out_of_bounding_box, out_of_range

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import os
import yaml
import logging

### Assessing Values:

In [3]:
def basic_assessment(df, columns, valid_values, non_negatives):
    '''Print a basic data-quality report for a dataframe.

    For every column, prints the number of missing and distinct values.
    For each column named in ``valid_values``, prints how many entries are
    flagged by ``not_in_list``. For each column named in ``non_negatives``,
    prints how many entries are present but cannot be parsed as numbers and,
    when there are none, how many entries are flagged by ``negative_check``.

    Args:
        df: dataframe to assess
        columns: dictionary of column names by category (not used by this
            function; kept so existing callers keep working)
        valid_values: dictionary of valid values for categorical columns
        non_negatives: list of continuous columns with only non-negative values valid

    Returns:
        None. The report is written to standard output.
    '''
    for col in df.columns:
        print(f"Missing Values in {col}: {df[col].isna().sum()}")
        print(f"Distinct Values in {col}: {df[col].nunique()}\n")

    # for categorical variables, count number of invalid values by column
    for col in valid_values:
        invalid_count = df[col].apply(lambda x: not_in_list(x, valid_values[col])).sum()
        print(f"Non-Valid values in {col}: {invalid_count}")
    print('\n')

    # for continuous variables, count non-numeric and negative values by column
    for col in non_negatives:
        present = df[col].notna()
        # Coercion turns unparseable entries into NaN, but entries that were
        # already missing are NaN too. Missing values are reported in the first
        # loop, so exclude them here; otherwise a column with gaps is reported
        # as "non numeric" and its negative check is silently skipped.
        non_numeric_count = (pd.to_numeric(df[col], errors='coerce').isna() & present).sum()
        print(f"Non Numeric Values in {col}: {non_numeric_count}")
        # Only run the negative check when every present value is numeric, and
        # only on the present values.
        if non_numeric_count == 0:
            negative_count = df.loc[present, col].apply(lambda x: negative_check(x)).sum()
            print(f"Negative values in {col}: {negative_count}")

In [4]:
def geo_assessment(df, bounding_box):
    '''Report how many coordinates fall outside the bounding box.

    Prints three counts: latitudes flagged by ``out_of_range``, longitudes
    flagged by ``out_of_range``, and rows flagged by ``out_of_bounding_box``.

    Args:
        df: dataframe to check; its ``latitude`` and ``longitude`` columns are read
        bounding_box: dictionary with the ``max_lat``, ``min_lat``, ``max_long``
            and ``min_long`` limits to compare the location to
    '''
    def _latitude_flag(value):
        # single latitude value against the latitude limits
        return out_of_range(value, bounding_box['max_lat'], bounding_box['min_lat'])

    def _longitude_flag(value):
        # single longitude value against the longitude limits
        return out_of_range(value, bounding_box['max_long'], bounding_box['min_long'])

    def _location_flag(row):
        # both coordinates of one row against the whole bounding box
        return out_of_bounding_box(row.latitude, row.longitude, bounding_box)

    report = (
        ("Latitude out of bound count:", df['latitude'].apply(_latitude_flag).sum()),
        ("Longitude out of bound count:", df['longitude'].apply(_longitude_flag).sum()),
        ("Location out of bound count:", df.apply(_location_flag, axis=1).sum()),
    )
    for label, count in report:
        print(label, count)

### Master Cell

The cells below call the functions defined above to complete data preparation: they load the configuration, ingest the data, and run the assessments.

In [5]:
# Get the path for the data files and echo it so the reader can see which
# directory is used. `path` is reused by the data-loading cell.
path = get_path()
print(f"Path is: {path}")

# Load the configuration from data_preparation_config.yml and print it.
# `config` is indexed by the later cells (file names, general flags,
# valid values, bounding box).
config = get_config("data_preparation_config.yml")
print_config(config)

Path is: D:\Machine Learning\MACHINE LEARNING PROJECTS\airbnb_price_prediction\data
Path, path_to_yaml: D:\Machine Learning\MACHINE LEARNING PROJECTS\airbnb_price_prediction\notebooks\data_preparation_config.yml


Config Value general --> {'load_from_scratch': False, 'save_raw_dataframe': False, 'save_transformed_dataframe': True, 'remove_bad_values': True}
Config Value columns --> {'categorical': ['neighbourhood_group', 'neighbourhood', 'room_type'], 'continuous': ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'latitude', 'longitude'], 'date': ['last_review'], 'text': ['name', 'host_name'], 'excluded': ['price', 'id']}
Config Value category_defaults --> [{'categorical': 'missing'}, {'continuous': 0.0}, {'text': 'missing'}, {'date': datetime.date(2019, 1, 1)}, {'excluded': 'missing'}]
Config Value category_invalid_replacements --> [{'categorical': 'bad_categorical'}, {'continuous': 'bad_continuous'}, {'text': 'bad_text'}, {'date': 'bad_da

In [6]:
# Load the dataframe through ingest_data, passing the file names and the
# general flags taken from the configuration.
# NOTE(review): presumably ingest_data reads either the CSV or the pickled
# dataframe depending on load_from_scratch — confirm in utils.DataIngestion.
df = ingest_data(path=path,
                input_csv=config['file_names']['input_csv'],
                pickled_input_dataframe=config['file_names']['pickled_input_dataframe'],
                save_raw_dataframe=config['general']['save_raw_dataframe'],
                load_from_scratch=config['general']['load_from_scratch'])
# Show the first rows as the cell output
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
# Basic assessment: prints missing/distinct counts per column, invalid
# categorical values, and non-numeric/negative counts for the configured columns
basic_assessment(df, config['columns'], config['valid_values'], config['non_negative_continuous'])
# Geospatial assessment: prints how many latitude/longitude values and
# locations fall outside the configured bounding box
geo_assessment(df, config['bounding_box'])

Missing Values in id: 0
Distinct Values in id: 48895

Missing Values in name: 16
Distinct Values in name: 47905

Missing Values in host_id: 0
Distinct Values in host_id: 37457

Missing Values in host_name: 21
Distinct Values in host_name: 11452

Missing Values in neighbourhood_group: 0
Distinct Values in neighbourhood_group: 5

Missing Values in neighbourhood: 0
Distinct Values in neighbourhood: 221

Missing Values in latitude: 0
Distinct Values in latitude: 19048

Missing Values in longitude: 0
Distinct Values in longitude: 14718

Missing Values in room_type: 0
Distinct Values in room_type: 3

Missing Values in price: 0
Distinct Values in price: 674

Missing Values in minimum_nights: 0
Distinct Values in minimum_nights: 109

Missing Values in number_of_reviews: 0
Distinct Values in number_of_reviews: 394

Missing Values in last_review: 10052
Distinct Values in last_review: 1764

Missing Values in reviews_per_month: 10052
Distinct Values in reviews_per_month: 937

Missing Values in cal

In [8]:
# Summary of the dataframe: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   name                            48879 non-null  object 
 2   host_id                         48895 non-null  int64  
 3   host_name                       48874 non-null  object 
 4   neighbourhood_group             48895 non-null  object 
 5   neighbourhood                   48895 non-null  object 
 6   latitude                        48895 non-null  float64
 7   longitude                       48895 non-null  float64
 8   room_type                       48895 non-null  object 
 9   price                           48895 non-null  int64  
 10  minimum_nights                  48895 non-null  int64  
 11  number_of_reviews               48895 non-null  int64  
 12  last_review                     