<a href="https://colab.research.google.com/github/annaroney/Airbnb/blob/main/Ensemble_Classification_PredictionProblem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Prediction Problem Code

**Problem:** Classification

**Model:** Ensemble

**Accuracy on Kaggle:** .94

In [1]:
# run this cell if using google colab
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/data science/stat303-3')

Mounted at /content/drive


### Libraries

In [3]:
# Libraries here
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier, StackingClassifier, BaggingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, RepeatedKFold, RandomizedSearchCV
from xgboost import XGBClassifier
# uncomment next line if necessary
#!pip install catboost
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



### Data and Preprocessing

In [7]:
# reading TRAINING DATA
train = pd.read_csv('train_classification.csv')


# Modifying the neighborhoods_cleansed column to only reflect if the neighborhood is in the top 3 most fancy
train.neighbourhood_cleansed.value_counts()[0:3]
fancy_neighborhoods = ['Near North Side','West Town','Lake View']
train.neighbourhood_cleansed = train.neighbourhood_cleansed.apply(
    lambda x: 1 if x in fancy_neighborhoods else 0)

# Since there are many property types, modifying the column so that only types with over 100 observations are kept, otherwise set to 'Other'
train.property_type.value_counts()[0:7]
top_property_types = ['Entire rental unit','Entire condo','Private room in rental unit','Entire home',
                     'Private room in home','Private room in home','Entire serviced apartment','Room in hotel']
train.property_type = train.property_type.apply(lambda x: x if x in top_property_types else 'Other')

# host_location data prep
train['host_location'].fillna('Other', inplace=True)
top_locations = train['host_location'].value_counts().index[:3]
train.loc[~train['host_location'].isin(top_locations), 'host_location'] = 'Other'

# host_response_time data prep
mode_value = train['host_response_time'].mode()[0]
train['host_response_time'].fillna(mode_value, inplace=True)

# host_response_rate data prep
train['host_response_rate'] = train['host_response_rate'].str.rstrip('%').astype(float)
mean_value = train['host_response_rate'].mean()
train['host_response_rate'].fillna(mean_value, inplace=True)

# host_acceptance_rate data prep
train['host_acceptance_rate'] = train['host_acceptance_rate'].str.rstrip('%').astype(float)
mean_value = train['host_acceptance_rate'].mean()
train['host_acceptance_rate'].fillna(mean_value, inplace=True)

# host_is_superhost data prep
train['host_is_superhost'] = train['host_is_superhost'].astype(str)
train['host_is_superhost'] = train['host_is_superhost'].replace({'t': 1, 'f': 0})
mode_value = train['host_is_superhost'].mode()[0]
train['host_is_superhost'].fillna(mode_value, inplace=True)

# host_neighbourhood
train.host_neighbourhood.value_counts()[0:3]
top_host_neighbourhoods = ['Cambridge','River North','Logan Square']
train.host_neighbourhood = train.host_neighbourhood.apply(
    lambda x: 1 if x in top_host_neighbourhoods else 0)
train['host_neighbourhood'].fillna('Other', inplace=True)

# host_has_profile_pic data prep
train['host_has_profile_pic'] = train['host_has_profile_pic'].astype(str)
train['host_has_profile_pic'] = train['host_has_profile_pic'].replace({'t': 1, 'f': 0})

# host_identity_verified data prep
train['host_identity_verified'] = train['host_identity_verified'].astype(str)
train['host_identity_verified'] = train['host_identity_verified'].replace({'t': 1, 'f': 0})

# bathrooms_text
top_bathroom_types = train['bathrooms_text'].value_counts().index[:5]
train['bathrooms_text'] = train['bathrooms_text'].apply(lambda x: x if x in top_bathroom_types else 'Other')

# has_availability data prep
train['has_availability'] = train['has_availability'].astype(str)
train['has_availability'] = train['has_availability'].replace({'t': 1, 'f': 0})

# Imputing missing numeric values using median
train = train.fillna(train.median(numeric_only=True))
# Imputing missing categorical values using ffill and then bfill (in case the first value was null)
train = train.fillna(method = 'ffill')
train = train.fillna(method = 'bfill')

# SAME MODIFICATIONS FOR TEST DATA

test = pd.read_csv('test_classification.csv')

# Neighborhood cleaning
test.neighbourhood_cleansed = test.neighbourhood_cleansed.apply(
    lambda x: 1 if x in fancy_neighborhoods else 0)

# Property type cleaning
test.property_type = test.property_type.apply(lambda x: x if x in top_property_types else 'Other')

# host_location data prep
test['host_location'].fillna('Other', inplace=True)
top_locations = test['host_location'].value_counts().index[:3]
test.loc[~test['host_location'].isin(top_locations), 'host_location'] = 'Other'

# host_response_time data prep
mode_value = test['host_response_time'].mode()[0]
test['host_response_time'].fillna(mode_value, inplace=True)

# host_response_rate data prep
test['host_response_rate'] = test['host_response_rate'].str.rstrip('%').astype(float)
mean_value = test['host_response_rate'].mean()
test['host_response_rate'].fillna(mean_value, inplace=True)

# host_acceptance_rate data prep
test['host_acceptance_rate'] = test['host_acceptance_rate'].str.rstrip('%').astype(float)
mean_value = test['host_acceptance_rate'].mean()
test['host_acceptance_rate'].fillna(mean_value, inplace=True)

# host_neighbourhood
test.host_neighbourhood.value_counts()[0:3]
top_host_neighbourhoods = ['Cambridge','River North','Logan Square']
test.host_neighbourhood = test.host_neighbourhood.apply(
    lambda x: 1 if x in top_host_neighbourhoods else 0)
test['host_neighbourhood'].fillna('Other', inplace=True)

# host_has_profile_pic data prep
test['host_has_profile_pic'] = test['host_has_profile_pic'].astype(str)
test['host_has_profile_pic'] = test['host_has_profile_pic'].replace({'t': 1, 'f': 0})

# host_identity_verified data prep
test['host_identity_verified'] = test['host_identity_verified'].astype(str)
test['host_identity_verified'] = test['host_identity_verified'].replace({'t': 1, 'f': 0})

# bathrooms_text
top_bathroom_types = test['bathrooms_text'].value_counts().index[:5]
test['bathrooms_text'] = test['bathrooms_text'].apply(lambda x: x if x in top_bathroom_types else 'Other')

# has_availability data prep
test['has_availability'] = test['has_availability'].astype(str)
test['has_availability'] = test['has_availability'].replace({'t': 1, 'f': 0})

# Imputing
test = test.fillna(test.median(numeric_only=True))
test = test.fillna(method = 'ffill')
test = test.fillna(method = 'bfill')

  train = train.fillna(method = 'ffill')
  train = train.fillna(method = 'bfill')
  test = test.fillna(method = 'ffill')
  test = test.fillna(method = 'bfill')


### Predictor Selection

In [8]:
# ** RUN ONLY ONCE **

ids = test.id
train = train.drop(columns=['id', 'host_since', 'first_review', 'last_review'])
test = test.drop(columns=['id', 'host_since', 'first_review', 'last_review'])

x_train = train.drop(columns='host_is_superhost')
y_train = train.host_is_superhost

x_test = test

x_train = pd.get_dummies(x_train)
x_test = pd.get_dummies(x_test)

def clean_feature_name(name):
    translation_table = str.maketrans({
        ",": "",
        "[": "",
        "]": "",
        "'": "",
        '"': "",
        " ": "_"
    })
    cleaned_name = name.translate(translation_table)
    return cleaned_name

# Clean all feature names in the DataFrame
x_train.columns = [clean_feature_name(col) for col in x_train.columns]
x_test.columns = [clean_feature_name(col) for col in x_test.columns]

### Model Tuning and Training

In [9]:
# ensemble of six boosting models, each already individually tuned

bm1 = KNeighborsClassifier(n_neighbors = 7, weights = 'distance')

bm2 = BaggingClassifier(random_state = 12,
                            n_estimators = 100,
                            bootstrap = False,
                            bootstrap_features = True,
                            max_features = .5,
                            max_samples = .9,
                            )

bm3 = RandomForestClassifier(random_state = 1,
                            n_estimators = 100,
                            bootstrap = True,
                            max_features = .725,
                            max_samples = .8
                            )

bm4 = LGBMClassifier(random_state=1, num_threads = 1, verbose=-1,
                           learning_rate=0.1, max_depth=6, n_estimators=800,
                           reg_lambda=0.1, subsample=0.5)

bm5 = CatBoostClassifier(learning_rate=0.1, max_depth=6, n_estimators=2200,
                                reg_lambda=0.1, subsample=0.75, random_state = 1)

bm6 = XGBClassifier(random_state = 12,
                    objective = 'binary:logistic',
                    scale_pos_weight = 1,
                    n_estimators = 95,
                    max_depth = 8,
                    subsample = .95,
                    learning_rate = .1,
                    reg_lambda = .1,
                    gamma = .1)

model = VotingClassifier(estimators = [('knn',bm1),('bagged_trees',bm2),('random_forest',bm3),('lgbm',bm4),('cat',bm5),('xgb',bm6)])

model.fit(x_train, y_train)

0:	learn: 0.6274174	total: 54.5ms	remaining: 1m 59s
1:	learn: 0.5884776	total: 61.5ms	remaining: 1m 7s
2:	learn: 0.5591490	total: 67.7ms	remaining: 49.6s
3:	learn: 0.5306773	total: 74.4ms	remaining: 40.8s
4:	learn: 0.5150296	total: 82.5ms	remaining: 36.2s
5:	learn: 0.4965097	total: 89.3ms	remaining: 32.7s
6:	learn: 0.4811611	total: 96.1ms	remaining: 30.1s
7:	learn: 0.4662467	total: 103ms	remaining: 28.2s
8:	learn: 0.4455554	total: 115ms	remaining: 27.9s
9:	learn: 0.4314306	total: 125ms	remaining: 27.3s
10:	learn: 0.4201412	total: 136ms	remaining: 27.1s
11:	learn: 0.4126888	total: 143ms	remaining: 26s
12:	learn: 0.4060443	total: 149ms	remaining: 25.1s
13:	learn: 0.3987617	total: 156ms	remaining: 24.4s
14:	learn: 0.3915900	total: 165ms	remaining: 24s
15:	learn: 0.3873306	total: 173ms	remaining: 23.6s
16:	learn: 0.3789556	total: 179ms	remaining: 23s
17:	learn: 0.3742651	total: 186ms	remaining: 22.5s
18:	learn: 0.3690081	total: 192ms	remaining: 22.1s
19:	learn: 0.3639781	total: 199ms	remai

### Prediction

In [10]:

y_pred_prob = model.predict(x_test)
y_pred = y_pred_prob > .5
integers = y_pred.astype(int)

# I then used the train data and the fact that some host_id values overlapped with the test data to override predictions that I knew were wrong
predictions = pd.DataFrame({'id': ids,'host_id':test.host_id,'predicted':integers})
correct = train[['host_id','host_is_superhost']]
correct = correct.sort_values(by='host_id').drop_duplicates()
merged = pd.merge(predictions, correct, on='host_id', how='left')
merged['host_is_superhost'].isna().sum()
merged['host_is_superhost'] = merged['host_is_superhost'].fillna(merged['predicted'])
merged['host_is_superhost'].isna().sum()
merged['predicted'] = merged['host_is_superhost']
merged.drop(columns=['host_is_superhost'], inplace=True)
merged.drop(columns=['host_id'], inplace=True)

# the output is then created using the new dataframe called merged
output = pd.DataFrame({'id': merged.id,'predicted':merged.predicted})

# create csv file of predicted classifications
output = pd.DataFrame({'id': merged.id,'predicted':merged.predicted})
output.to_csv('annaroneyclassification.csv', index=False)