## Import Libraries

In [112]:
import os

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.session import s3_input, Session
from sagemaker.xgboost.estimator import XGBoost


### Creating an s3 bucket:

In [113]:
# CHANGE THIS VARIABLE TO A UNIQUE NAME FOR YOUR BUCKET
bucket_name = 'appplication'

# Region that the default boto3 session of this instance resolves to.
boto_session = boto3.session.Session()
my_region = boto_session.region_name
print(my_region)

us-east-1


In [114]:
# Create the working bucket in the notebook's region.
s3 = boto3.resource('s3')
try:
    if my_region == 'us-east-1':
        # us-east-1 is the default location and must NOT be passed as a LocationConstraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        # Every other region has to be named explicitly, otherwise no bucket is created there.
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': my_region},
        )
    print('S3 bucket created successfully')
except Exception as e:
    # Bind the exception so the message can actually be printed (the bare
    # `except:` previously referenced an undefined `e` and raised NameError).
    print('S3 error: ', e)

S3 bucket created successfully


### Mapping the path of the models in s3:

In [115]:
# S3 location where the training job writes the trained model artifacts.
prefix = 'xgboost-as-a-built-in-algo'
output_path = f's3://{bucket_name}/{prefix}/output'
print(output_path)

s3://appplication/xgboost-as-a-built-in-algo/output


### Reading the CSV file

In [116]:
# Load the raw listings file from the notebook's working directory and preview it.
df = pd.read_csv('airbnb_hw2.csv')
df.head(n=5)

Unnamed: 0,name,accommodates,bed_type,bedrooms,beds,cancellation_policy,cleaning_fee,host_total_listings_count,price,property_type,room_type,high_booking_rate,bathrooms,extra_people,host_acceptance_rate,host_is_superhost,host_response_rate,minimum_nights,market
0,Cathy's Cozy Cottage,2,Real Bed,1.0,1.0,moderate,,1.0,$75.00,House,Private room,1,1.0,$0.00,,False,100%,2,Los Angeles
1,Private Entrance! Bay View! Quiet!,2,Real Bed,1.0,2.0,moderate,$50.00,2.0,$99.00,House,Private room,0,1.0,$0.00,92%,False,100%,1,San Diego
2,Beautiful Bohemian Getaway,5,Real Bed,1.0,3.0,moderate,$15.00,1.0,$75.00,Condominium,Private room,1,1.0,$12.00,97%,False,69%,1,San Diego
3,Lux Wash Park Home Gr8 4 families,10,Real Bed,5.0,8.0,strict,$100.00,1.0,$411.00,House,Entire home/apt,0,4.0,$50.00,,False,100%,3,Denver
4,Huge private bedroom with park views,2,Real Bed,1.0,1.0,moderate,,1.0,$75.00,Apartment,Private room,0,1.0,$0.00,,False,100%,1,New York


### Class for handling missing values

In [117]:
class MissingValueHandler:
    """Report and fill missing values on a wrapped DataFrame.

    The handler keeps a reference to the frame it is given (no copy is made),
    so every ``fill_*`` method updates that same object. Filled columns are
    assigned back as whole columns instead of calling ``fillna(inplace=True)``
    on a column selection, which is chained assignment and does not reliably
    write through to the parent frame.
    """

    def __init__(self, df):
        """Store ``df`` as the frame to inspect and fill."""
        self.df = df

    def report_missing_values(self):
        """Summarise the columns that contain missing values.

        Prints a two-line summary and returns a DataFrame indexed by column
        name with 'Missing Values' (count) and '% of Total Values' (rounded to
        one decimal), sorted by percentage in descending order. Columns with
        no missing values are omitted.
        """
        missing_counts = self.df.isnull().sum()
        missing_pct = 100 * missing_counts / len(self.df)
        summary = pd.concat([missing_counts, missing_pct], axis=1)
        summary = summary.rename(columns={0: 'Missing Values', 1: '% of Total Values'})
        summary = summary[summary.iloc[:, 1] != 0].sort_values('% of Total Values', ascending=False).round(1)
        print(f"Your selected dataframe has {self.df.shape[1]} columns.\nThere are {summary.shape[0]} columns that have missing values.")
        return summary

    def _fill_columns(self, columns, fill):
        """Apply ``fill`` (Series -> Series) to one column label or a list of labels."""
        targets = columns if isinstance(columns, list) else [columns]
        for col in targets:
            # Whole-column assignment: guaranteed to modify self.df itself.
            self.df[col] = fill(self.df[col])

    def fill_with_value(self, columns, value):
        """Replace missing entries in ``columns`` with the constant ``value``."""
        self._fill_columns(columns, lambda series: series.fillna(value))

    def fill_forward(self, columns):
        """Propagate the last valid observation forward in ``columns``."""
        # Series.ffill() replaces the deprecated fillna(method='ffill').
        self._fill_columns(columns, lambda series: series.ffill())

    def fill_backward(self, columns):
        """Propagate the next valid observation backward in ``columns``."""
        # Series.bfill() replaces the deprecated fillna(method='bfill').
        self._fill_columns(columns, lambda series: series.bfill())

    def fill_with_mean(self, columns):
        """Replace missing entries in each column with that column's mean."""
        self._fill_columns(columns, lambda series: series.fillna(series.mean()))

    def fill_with_median(self, columns):
        """Replace missing entries in each column with that column's median."""
        self._fill_columns(columns, lambda series: series.fillna(series.median()))
            

### Data Cleaning and Processing

In [118]:
# Wrap the raw frame so the fill helpers operate on it, then show which
# columns contain missing values before anything is filled.
handler = MissingValueHandler(df)

missing_report = handler.report_missing_values()
print(missing_report)


# Impute simple columns: numeric counts with their mean, the listing name with a placeholder.
handler.fill_with_mean(['bedrooms', 'beds', 'host_total_listings_count'])
handler.fill_with_value(['name'], 'Missing')

# Replace NAs in bathrooms with the median value
handler.fill_with_median('bathrooms')

# Replace NAs in host_is_superhost with False.
# Assign the result back: fillna(inplace=True) on a column selection is
# chained assignment and may not modify df.
df['host_is_superhost'] = df['host_is_superhost'].fillna(False)

# Group "strict" and "super_strict_30" into "strict" in cancellation_policy
df['cancellation_policy'] = df['cancellation_policy'].replace('super_strict_30', 'strict')

# Convert cleaning_fee and price into numbers and replace NAs with 0.
# Missing fees are NOT pre-filled with the integer 0 here: that would mix an
# int into a string column, and the trailing fillna(0) already maps every
# missing/unparseable value to 0. The pattern is a raw string so `\$` is not
# treated as an (invalid) Python escape sequence.
for money_col in ('cleaning_fee', 'price'):
    stripped = df[money_col].str.replace(r'[\$,]', '', regex=True)
    df[money_col] = pd.to_numeric(stripped, errors='coerce').fillna(0)

# Convert extra_people into numbers
df['extra_people'] = df['extra_people'].replace(r'[\$,]', '', regex=True).astype(float)

# Create new variables
def _rate_bucket(rate):
    """Bucket a percentage-string column into 'MISSING', 'ALL' ('100%') or 'SOME'."""
    return np.where(rate.isna(), 'MISSING', np.where(rate == '100%', 'ALL', 'SOME'))


df['charges_for_extra'] = np.where(df['extra_people'] > 0, 'YES', 'NO')
df['host_acceptance'] = _rate_bucket(df['host_acceptance_rate'])
df['host_response'] = _rate_bucket(df['host_response_rate'])
df['has_min_nights'] = np.where(df['minimum_nights'] > 1, 'YES', 'NO')

# Count the number of instances in each market
market_counts = df['market'].value_counts()

# Identify markets with under 300 instances
under_300_markets = market_counts[market_counts < 300].index.tolist()

# Replace under 300 markets with "OTHER"
df.loc[df['market'].isin(under_300_markets), 'market'] = 'OTHER'

# Replace NAs in market.
# Assign back instead of fillna(inplace=True) on a column selection, which is
# chained assignment and may leave df unchanged.
df['market'] = df['market'].fillna('OTHER')


# Derived features built from the cleaned columns.

# Nightly price divided by the number of guests the listing accommodates.
df['price_per_person'] = df['price'].div(df['accommodates'])

# 'YES' when a positive cleaning fee is charged, otherwise 'NO'.
df['has_cleaning_fee'] = np.where(df['cleaning_fee'] > 0, 'YES', 'NO')

# 'bed' for a real bed, 'other' for every other bed type.
df['bed_category'] = np.where(df['bed_type'] == 'Real Bed', 'bed', 'other')

# Collapse detailed property types into a few coarse categories; any type not
# listed here falls through to 'other'.
_types_by_category = {
    'apartment': ['Apartment', 'Serviced apartment', 'Loft'],
    'hotel': ['Bed & Breakfast', 'Boutique hotel', 'Hostel'],
    'condo': ['Townhouse', 'Condominium'],
    'house': ['Bungalow', 'House'],
}
property_categories = {
    property_type: category
    for category, property_types in _types_by_category.items()
    for property_type in property_types
}
df['property_category'] = df['property_type'].map(property_categories).fillna('other')

# ppp_ind: 1 when a listing's price per person is above the median of its
# property category, else 0.
median_ppp = df.groupby('property_category')['price_per_person'].transform('median')
df['ppp_ind'] = df['price_per_person'].gt(median_ppp).astype(int)

# Store market and the target as pandas categoricals.
for categorical_col in ('market', 'high_booking_rate'):
    df[categorical_col] = df[categorical_col].astype('category')


# Keep only the columns used for modelling: three numeric features plus the target.
feature_columns = ["accommodates", "bedrooms", "beds"]
target_column = "high_booking_rate"
df = df[feature_columns + [target_column]]


Your selected dataframe has 19 columns.
There are 10 columns that have missing values.
                           Missing Values  % of Total Values
host_acceptance_rate                 9216               92.2
cleaning_fee                         1839               18.4
host_response_rate                   1662               16.6
market                                 33                0.3
bathrooms                              31                0.3
host_total_listings_count              19                0.2
host_is_superhost                      19                0.2
beds                                   12                0.1
bedrooms                                9                0.1
name                                    1                0.0


Amazon SageMaker XGBoost can train on data in either CSV or LibSVM format. For CSV input, the file must:

- Have the target variable in the first column
- Not have a header row

In [121]:
# Reorder the columns so the target comes first, followed by the features in their existing order.
ordered_columns = ['high_booking_rate'] + [col for col in df.columns if col != 'high_booking_rate']
df = df[ordered_columns]
df

Unnamed: 0,high_booking_rate,accommodates,bedrooms,beds
0,1,2,1.0,1.0
1,0,2,1.0,2.0
2,1,5,1.0,3.0
3,0,10,5.0,8.0
4,0,2,1.0,1.0
...,...,...,...,...
9995,0,2,0.0,1.0
9996,0,4,1.0,2.0
9997,0,8,4.0,4.0
9998,0,4,1.0,2.0


In [122]:
# Shuffle reproducibly, then cut 70% / 30% by position.
# Positional slicing replaces np.split(DataFrame, ...), which routes through
# the deprecated DataFrame.swapaxes and emits a FutureWarning; the resulting
# rows and their order are the same.
shuffled = df.sample(frac=1, random_state=1729)
split_at = int(0.7 * len(df))
train_data = shuffled.iloc[:split_at]
test_data = shuffled.iloc[split_at:]

# No header and no index column are written to either file.
train_data.to_csv('train.csv', header=False, index=False)
test_data.to_csv('test.csv', header=False, index=False)

  return bound(*args, **kwds)


In [124]:
### Saving Train and Test Into Buckets
## We start with Train Data
# S3 object keys always use '/' as the separator, so the key is built as a
# plain string rather than with os.path.join (whose separator is OS-dependent).
train_key = f'{prefix}/train/train.csv'
boto3.Session().resource('s3').Bucket(bucket_name).Object(train_key).upload_file('train.csv')

# Training channel pointing at the S3 prefix that now holds train.csv.
s3_input_train = TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/train', content_type='csv')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


In [126]:
# Test Data Into Buckets
# S3 object keys always use '/' as the separator, so the key is built as a
# plain string rather than with os.path.join (whose separator is OS-dependent).
test_key = f'{prefix}/test/test.csv'
boto3.Session().resource('s3').Bucket(bucket_name).Object(test_key).upload_file('test.csv')

# Channel pointing at the S3 prefix that now holds test.csv.
s3_input_test = TrainingInput(s3_data=f's3://{bucket_name}/{prefix}/test', content_type='csv')

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


### Building the Model: XGBoost Built-in Algorithm

In [127]:
# Hyperparameters handed to the XGBoost training container; all values are passed as strings.
hyperparameters = dict(
    max_depth="5",
    eta="0.2",
    gamma="4",
    min_child_weight="6",
    subsample="0.7",
    verbosity="1",
    objective="binary:logistic",
    num_round="50",
)

In [128]:
from sagemaker.xgboost.estimator import XGBoost

# Look up the URI of the built-in XGBoost container image for the current region.
# The last argument pins the container version; change it to use another release.
current_region = boto3.Session().region_name
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=current_region,
    version="1.7-1",
)



INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [129]:
# Generic SageMaker estimator that runs the XGBoost container image retrieved earlier.
execution_role = sagemaker.get_execution_role()
estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,
    hyperparameters=hyperparameters,
    role=execution_role,
    instance_count=1,
    instance_type='ml.m5.2xlarge',
    volume_size=5,  # 5 GB
    output_path=output_path,
    use_spot_instances=True,
    max_run=300,
    max_wait=600,
)


In [130]:
# Run the XGBoost training job: the 'train' channel is used for fitting and
# the held-out test split is supplied as the 'validation' channel.
data_channels = {'train': s3_input_train, 'validation': s3_input_test}
estimator.fit(inputs=data_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-31-04-23-11-828


2024-01-31 04:23:11 Starting - Starting the training job...
2024-01-31 04:23:27 Starting - Preparing the instances for training......
2024-01-31 04:24:34 Downloading - Downloading input data...
2024-01-31 04:24:59 Downloading - Downloading the training image......
2024-01-31 04:26:10 Training - Training image download completed. Training in progress....
2024-01-31 04:26:35 Uploading - Uploading generated training model[34m[2024-01-31 04:26:29.756 ip-10-2-82-224.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-01-31 04:26:29.779 ip-10-2-82-224.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-01-31:04:26:30:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-01-31:04:26:30:INFO] Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34m[2024-01-31:04:26:30:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-01

### Deploy ML model as Endpoints

In [132]:
# Host the trained model behind a single-instance endpoint; request payloads
# are serialized as CSV.
csv_serializer = CSVSerializer()
xgb_predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=csv_serializer,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-01-31-04-28-04-742
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-01-31-04-28-04-742
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-01-31-04-28-04-742


------!

### Prediction of Test Data

In [140]:
def predict(data, rows=500, predictor=None):
    """Score ``data`` against a deployed endpoint in batches.

    Parameters
    ----------
    data : numpy.ndarray
        Feature matrix (one row per observation, no target column).
    rows : int, default 500
        Approximate upper bound on the number of rows sent per request.
    predictor : object, optional
        Any object whose ``predict(array)`` returns UTF-8 encoded bytes of
        comma- and/or newline-separated scores. Defaults to the notebook's
        global ``xgb_predictor``.

    Returns
    -------
    numpy.ndarray
        1-D float array with one score per input row, in input order.

    Raises
    ------
    ValueError
        If ``rows`` is not positive.
    """
    if rows <= 0:
        raise ValueError("rows must be a positive integer")
    if predictor is None:
        # Fall back to the endpoint deployed earlier in the notebook.
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid sending an empty payload to the endpoint.
        return np.array([], dtype=float)

    n_chunks = int(n_rows / float(rows) + 1)
    predictions = []
    for chunk in np.array_split(data, n_chunks):
        if chunk.shape[0] == 0:
            continue
        response = predictor.predict(chunk).decode('utf-8')
        # Scores may be separated by commas and/or newlines; blank tokens
        # (e.g. from a trailing separator) are skipped so float() cannot fail on ''.
        tokens = response.replace('\n', ',').split(',')
        predictions.extend(token for token in tokens if token.strip())

    return np.array(predictions, dtype=float)

# Drop the first column (the target) and score the remaining feature columns.
test_features = test_data.to_numpy()[:, 1:]
predictions = predict(test_features)


### Results and Confusion Matrix

In [144]:
def _pct(part, whole):
    """Return ``part`` as a percentage of ``whole``, or NaN when ``whole`` is 0."""
    return part / whole * 100 if whole else float('nan')


# Cross-tabulate observed vs. predicted classes (scores rounded to 0/1),
# pairing the two arrays by position.
observed = test_data['high_booking_rate'].astype(int).to_numpy()
predicted = np.round(predictions).astype(int)
cm = pd.crosstab(index=observed, columns=predicted,
                 rownames=['Observed'], colnames=['Predicted'])
# Force a full 2x2 table: if the model never predicts one of the classes,
# crosstab omits that column and the positional lookups below would raise.
cm = cm.reindex(index=[0, 1], columns=[0, 1], fill_value=0)

tn = cm.iloc[0, 0]
fn = cm.iloc[1, 0]
tp = cm.iloc[1, 1]
fp = cm.iloc[0, 1]

# Overall share of correctly classified rows.
p = _pct(tp + tn, tp + tn + fp + fn)
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "Not High", "High"))
print("Observed")
# Percentages are column-wise: each cell is a share of all rows that received
# that predicted class.
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("Not High", _pct(tn, tn + fn), tn, _pct(fp, tp + fp), fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("High", _pct(fn, tn + fn), fn, _pct(tp, tp + fp), tp))


Overall Classification Rate: 76.8%

Predicted      Not High           High
Observed
Not High       77% (2296)    36% (4)
High            23% (693)     64% (7) 



### Deleting the Endpoints

In [149]:
# Remove the endpoint created by estimator.deploy(...); per the log output of
# this cell, the endpoint configuration is deleted along with it.
xgb_predictor.delete_endpoint()


INFO:sagemaker:Deleting endpoint configuration with name: sagemaker-xgboost-2024-01-31-04-28-04-742
INFO:sagemaker:Deleting endpoint with name: sagemaker-xgboost-2024-01-31-04-28-04-742


In [152]:
# Delete every object stored in the working bucket (the bucket itself is kept here).
s3_resource = boto3.resource('s3')
bucket_to_delete = s3_resource.Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

[{'ResponseMetadata': {'RequestId': 'ZKG0K8CD2G92YQAE',
   'HostId': 'z1lPYQuIx00GiCqzJoRti5iSxX0n1Xje1zbKTczyWfIEklg5SG40/kNM7re/omz0iLvxodVmCLo=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'z1lPYQuIx00GiCqzJoRti5iSxX0n1Xje1zbKTczyWfIEklg5SG40/kNM7re/omz0iLvxodVmCLo=',
    'x-amz-request-id': 'ZKG0K8CD2G92YQAE',
    'date': 'Wed, 31 Jan 2024 04:54:48 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-01-31-03-33-01-120/profiler-output/system/incremental/2024013103/1706672040.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-01-31-04-23-11-828/output/model.tar.gz'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2024-01-31-04-23-11-828/debug-output/events/000000000010/000000000010_worker_0.tfevents'},
   {'Key': 'xgboost-as-a-built-in