In [None]:
# import the required packages
import pandas as pd
import numpy as np

## 1. HR Analytics

### 1.1 Load and Sample the data

In [None]:
# Load the original HR analytics dataset (same file as assignment 1) and
# preview its first rows.
hr_data = pd.read_csv('hr_data.csv')
hr_data.head()

In [None]:
from sklearn.pipeline import Pipeline
import feature_engine.imputation as mdi
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Preprocessing carried over from assignment 1. hr_data is overwritten step by
# step, so this cell is meant to run exactly once after the load cell.

#Missing Value Imputation Copied from Asg1
# Median-impute previous_year_rating; fill education with its most frequent value.
pipe = Pipeline(steps=[
    ('imp_num_median', mdi.MeanMedianImputer(imputation_method = 'median', variables='previous_year_rating')),
    ('imp_cat_frequent', mdi.CategoricalImputer(variables ='education', imputation_method='frequent')),
])

pipe.fit(hr_data)
hr_data = pipe.transform(hr_data)

#Categorical Encoding Copied from Asg1
# Education becomes an ordinal code: Master's & above = 3, Bachelor's = 2,
# Below Secondary = 1.
hr_data['education'] = hr_data['education'].replace(("Master's & above",
                                                     "Bachelor's", "Below Secondary"),(3, 2, 1))
# variables=None leaves column selection to feature_engine (presumably every
# remaining categorical column - TODO confirm); drop_last=False keeps one
# dummy column per category.
ohe_enc = OneHotEncoder(
    top_categories=None,  
    variables=None,
    drop_last=False)
ohe_enc.fit(hr_data)
hr_data = ohe_enc.transform(hr_data)

#Scaling of Data to exclude is_promoted column
# Every column except the target is standardised; this includes employee_id,
# which is only dropped at the end of the cell.
# NOTE(review): the scaler is fit on the full dataset, before any train/test split.
df_target = hr_data['is_promoted']
df_scale = hr_data.drop(columns=['is_promoted'])
scaler = StandardScaler()
scaler.fit(df_scale)
df_hr_scaled = scaler.transform(df_scale)
df_hr_sScaled = pd.DataFrame(df_hr_scaled, columns=df_scale.columns)
# Both pieces get a fresh 0..n-1 index so the target lines up row by row when
# it is re-attached below.
df_hr_sScaled = df_hr_sScaled.reset_index(drop=True)
df_target = df_target.reset_index(drop=True)
hr_data = df_hr_sScaled
hr_data['is_promoted'] = df_target

#Additional features Copied from Asg1
# NOTE(review): these are computed from the already standardised columns, so
# they are a product / ratio of z-scores rather than of the raw values -
# confirm this ordering is intended.
hr_data['total_score'] = hr_data['avg_training_score'] * hr_data['no_of_trainings']
hr_data['awards_per_year'] = hr_data['awards_won?'] / hr_data['length_of_service']

#Dropping features Copied from Asg1
hr_data = hr_data.drop(['employee_id'], axis = 1)

In [None]:
# Work on a copy so later cells do not modify the preprocessed hr_data frame.
df_hr = hr_data.copy()

In [None]:
# Column dtypes and non-null counts of the modelling frame.
df_hr.info()

In [None]:
from sklearn.model_selection import train_test_split

# Balance the classes by random under-sampling: keep every promoted row and
# draw an equally sized random sample of the non-promoted rows.
hr1 = df_hr.loc[df_hr['is_promoted'].eq(1)]
hr0 = df_hr.loc[df_hr['is_promoted'].eq(0)]
hr0_sampled = hr0.sample(n=hr1.shape[0], random_state=2).copy()
hr_stratified = pd.concat([hr1, hr0_sampled])

# 70/30 train/test split of the balanced frame.
strat_features = hr_stratified.drop(columns='is_promoted')
strat_target = hr_stratified['is_promoted']
x_train_strat, x_test_strat, y_train_strat, y_test_strat = train_test_split(
    strat_features, strat_target, test_size=0.3, random_state=1)

In [None]:
# Class counts of the balanced frame, followed by the train/test feature shapes.
for caption, class_value in (('Stratified Promoted:', 1), ('Stratified Not promoted:', 0)):
    print(caption, (hr_stratified['is_promoted'] == class_value).sum())
print(x_train_strat.shape, x_test_strat.shape)

In [None]:
from imblearn.over_sampling import SMOTE

# Split BEFORE oversampling. SMOTE synthesises minority rows by interpolating
# between real neighbours; resampling the whole dataset first lets rows that
# end up in the test set shape the synthetic training rows (data leakage) and
# puts synthetic rows into the test set itself.
x = df_hr.drop('is_promoted', axis=1)
y = df_hr['is_promoted']
x_train_raw, x_test_smote, y_train_raw, y_test_smote = train_test_split(
    x, y, test_size=0.3, random_state=1, stratify=y)

# Seeded so the synthetic rows are reproducible on Restart & Run All.
smote = SMOTE(sampling_strategy='minority', random_state=1)

# X / Y keep their original names (the reporting cell reads Y); they now hold
# the resampled TRAINING data only. The test split stays untouched real data.
X, Y = smote.fit_resample(x_train_raw, y_train_raw)
x_train_smote, y_train_smote = X, Y

In [None]:
# Class counts after SMOTE, followed by the train/test feature shapes.
for caption, class_value in (('Smote Promoted:', 1), ('Smote Not promoted:', 0)):
    print(caption, (Y == class_value).sum())
print(x_train_smote.shape, x_test_smote.shape)

### 1.2 Build the Model(s)

In [None]:
# Collects one cross_validate() result dict per model, in the order the models
# are built below.
results_list = []

#### Logistic Regression baseline model test for Stratified Sampling and SMOTE Sampling

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit one default logistic regression per sampling strategy and compare their
# train/test accuracy.
lr_strat = LogisticRegression().fit(x_train_strat, y_train_strat)
lr_smote = LogisticRegression().fit(x_train_smote, y_train_smote)

baseline_runs = (
    ('Stratified', lr_strat, x_train_strat, y_train_strat, x_test_strat, y_test_strat),
    ('SMOTE', lr_smote, x_train_smote, y_train_smote, x_test_smote, y_test_smote),
)
for tag, clf, train_x, train_y, test_x, test_y in baseline_runs:
    print(tag + ' training accuracy is: ', clf.score(train_x, train_y))
    print(tag + ' testing accuracy is: ', clf.score(test_x, test_y))

In [None]:
# Use the SMOTE split as the working train/test data for all HR models below.
x_train, x_test, y_train, y_test = (
    x_train_smote, x_test_smote, y_train_smote, y_test_smote)

#### Logistic Regression

In [None]:
from sklearn.model_selection import cross_val_score, cross_validate
# Default logistic regression fitted on the working split.
lr = LogisticRegression()
lr.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
results = cross_validate(lr, hr_features, hr_target,
                         scoring='accuracy', cv=5, return_train_score=True)
results_list.append(results)

#### Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Unconstrained decision tree. Seeded (matching the tuned model later in the
# notebook) so tie-breaking between equally good splits is reproducible.
dtc = DecisionTreeClassifier(random_state=1)
dtc.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
results  = cross_validate(dtc, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)

#### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Default random forest. Seeded so the bootstrap samples and feature subsets -
# and therefore the reported scores - are reproducible.
rf = RandomForestClassifier(random_state=1)
rf.fit(x_train,y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
results  = cross_validate(rf, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)

#### AdaBoost with Decision Tree Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost over the decision tree defined in the previous section (the
# ensemble works on clones of it). Seeded for reproducible boosting rounds.
base_model = dtc
ada = AdaBoostClassifier(estimator=base_model, random_state=1)
ada.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
results  = cross_validate(ada, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)

#### CatBoost

In [None]:
from catboost import CatBoostClassifier
# Default CatBoost classifier. verbose=0 silences the per-iteration training
# log, which would otherwise be printed for this fit and again for each of the
# five cross-validation fits.
cbg = CatBoostClassifier(verbose=0)
cbg.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
results  = cross_validate(cbg, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)

#### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
# Default gradient boosting classifier, seeded for reproducible results.
gbc = GradientBoostingClassifier(random_state=1)
gbc.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
results  = cross_validate(gbc, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)

#### Soft Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft voting over the random forest, AdaBoost and CatBoost models built above.
voters = [('rf', rf), ('ada', ada), ('cat', cbg)]
ensemble = VotingClassifier(estimators=voters, voting='soft')
ensemble.fit(x_train, y_train)

# 5-fold accuracy on the full (non-resampled) df_hr frame.
hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
results = cross_validate(ensemble, hr_features, hr_target,
                         scoring='accuracy', cv=5, return_train_score=True)
results_list.append(results)

### 1.3 Evaluate and Improve the Model(s)

#### Model Accuracy and Cross Validation Accuracy

In [None]:
# Hold-out accuracy on the working split plus mean 5-fold accuracy for every
# baseline model; results_list is in the same order as model_list.
model_list = [lr, dtc, rf, ada, cbg, gbc, ensemble]
model_name = ["Logistic Regression", "Decision Tree", "Random Forest", "AdaBoost", "CatBoost", "GradientBoost","Soft Voting"]
for position, model in enumerate(model_list):
    title = model_name[position]
    cv_scores = results_list[position]
    print(title, ' training accuracy is: ', model.score(x_train, y_train))
    print(title, ' testing accuracy is: ', model.score(x_test, y_test))
    print(title, ' cross val training accuracy is:',
          sum(cv_scores['train_score']) / len(cv_scores['train_score']))
    print(title, ' cross val testing accuracy is:',
          sum(cv_scores['test_score']) / len(cv_scores['test_score']))
    print()

In [None]:
# Start a fresh list for the tuned models' cross-validation results.
results_list = []

#### Decision Tree Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 5-fold grid search (accuracy) over split criterion, depth and leaf
# size, run on the full df_hr frame.
dtc = DecisionTreeClassifier(random_state=1)

param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[20, 30, 35, 40],
    min_samples_leaf=[2, 4, 6, 8],
)

hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
gs = GridSearchCV(estimator=dtc, param_grid=param_grid, scoring='accuracy', cv=5)
gs = gs.fit(hr_features, hr_target)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Decision tree with hand-entered tuned hyperparameters, refit on the working split.
dtc_params = dict(criterion='gini', max_depth=20, min_samples_leaf=8, random_state=1)
dtc = DecisionTreeClassifier(**dtc_params)
dtc.fit(x_train, y_train)
for split_name, split_x, split_y in (('training', x_train, y_train), ('testing', x_test, y_test)):
    print(split_name + ' accuracy is: ', dtc.score(split_x, split_y))

# 5-fold accuracy on the full df_hr frame, stored for the comparison cell.
hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
results = cross_validate(dtc, hr_features, hr_target,
                         scoring='accuracy', cv=5, return_train_score=True)
results_list.append(results)

#### Random Forest Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search (accuracy) for the random forest, parallelised
# across all cores, run on the full df_hr frame.
rf = RandomForestClassifier(max_features='sqrt', random_state=1, n_jobs=-1)

param_grid = dict(
    criterion=["gini", "entropy"],
    max_depth=[20, 30, 35, 40],
    min_samples_leaf=[2, 4, 6, 8],
    n_estimators=[40, 70, 100, 120],
)

hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
gs = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
gs = gs.fit(hr_features, hr_target)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Random forest with hand-entered tuned hyperparameters, refit on the working split.
rf_params = dict(criterion='entropy', max_depth=40, min_samples_leaf=2,
                 n_estimators=70, random_state=1)
rf = RandomForestClassifier(**rf_params)
rf.fit(x_train, y_train)

for split_name, split_x, split_y in (('training', x_train, y_train), ('testing', x_test, y_test)):
    print(split_name + ' accuracy is: ', rf.score(split_x, split_y))

# 5-fold accuracy on the full df_hr frame, stored for the comparison cell.
hr_features = df_hr.drop(columns='is_promoted')
hr_target = df_hr['is_promoted']
results = cross_validate(rf, hr_features, hr_target,
                         scoring='accuracy', cv=5, return_train_score=True)
results_list.append(results)

#### CatBoost Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search (accuracy) for CatBoost on the full df_hr frame.
# verbose=0 is essential here: 81 parameter combinations x 5 folds would
# otherwise each print a log line for every boosting iteration.
cbg = CatBoostClassifier(random_state=1, verbose=0)

param_grid = {
        'iterations': [500, 750, 1000],
        'depth': [3, 6, 9],
        'learning_rate': [0.1, 0.2, 0.25],
        'l2_leaf_reg': [1, 3, 5]}

gs = GridSearchCV(estimator=cbg, param_grid=param_grid, scoring='accuracy', cv=5)

gs = gs.fit(df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'])

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# CatBoost with hand-entered tuned hyperparameters, refit on the working split.
# verbose=0 suppresses the 1000-iteration training log for this fit and for
# each cross-validation fit.
cbg = CatBoostClassifier(depth = 3, iterations = 1000, l2_leaf_reg = 3, learning_rate = 0.1, random_state=1, verbose=0)
cbg.fit(x_train, y_train)

# 5-fold accuracy on the full df_hr frame, stored for the comparison cell.
results  = cross_validate(cbg, df_hr.drop('is_promoted', axis= 1), df_hr['is_promoted'], scoring='accuracy', cv=5, return_train_score = True)
results_list.append(results)
print('training accuracy is: ', cbg.score(x_train,y_train))
print('testing accuracy is: ', cbg.score(x_test,y_test))

In [None]:
# Hold-out and mean 5-fold accuracy for the three tuned models; results_list is
# in the same order as model_list.
model_list = [dtc, rf, cbg]
model_name = ["Decision Tree", "Random Forest", "CatBoost"]
for position, model in enumerate(model_list):
    title = model_name[position]
    cv_scores = results_list[position]
    print(title, ' training accuracy is: ', model.score(x_train, y_train))
    print(title, ' testing accuracy is: ', model.score(x_test, y_test))
    print(title, ' cross val training accuracy is:',
          sum(cv_scores['train_score']) / len(cv_scores['train_score']))
    print(title, ' cross val testing accuracy is:',
          sum(cv_scores['test_score']) / len(cv_scores['test_score']))
    print()

## Best HR Analytics Model - CatBoost Classifier

In [None]:
# Side-by-side table of the actual is_promoted labels and the tuned CatBoost
# predictions on the test split.
# NOTE(review): the name predicted_prices is misleading here - these are
# promotion labels, not prices; the Airbnb section later reuses the name.
predicted_prices = pd.DataFrame({'Actual': y_test, 'Prediction': cbg.predict(x_test)})
predicted_prices.head(10)

## 2. Airbnb

### 2.1 Load the data

In [None]:
# Load the Airbnb listings file used for the baseline models and preview it.
# NOTE(review): presumably listings_new.csv is already cleaned/encoded, since
# it is fed to the models without further preprocessing - confirm.
airbnb = pd.read_csv('listings_new.csv')
airbnb.head()

In [None]:
# Work on a copy so the loaded airbnb frame stays untouched.
df_airbnb = airbnb.copy()

In [None]:
# Column dtypes and non-null counts of the Airbnb modelling frame.
df_airbnb.info()

In [None]:
# 70/30 train/test split with price as the regression target. This rebinds the
# x_train / x_test / y_train / y_test names used by the HR section.
airbnb_features = df_airbnb.drop(columns='price')
airbnb_target = df_airbnb['price']
x_train, x_test, y_train, y_test = train_test_split(
    airbnb_features, airbnb_target, test_size=0.3, random_state=1)

### 2.2 Build the Model(s)

#### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline. This rebinds lr, which held the logistic
# regression in the HR section.
lr = LinearRegression()
lr.fit(x_train, y_train)

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Default random forest regressor, seeded so its scores are reproducible.
# y_train is already a 1-D Series, so it is passed directly; Series.ravel() is
# deprecated in recent pandas and adds nothing here.
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(x_train, y_train)

#### Support Vector Machine Regressor

In [None]:
from sklearn.svm import SVR
# Default support vector regressor. y_train is already 1-D, so the deprecated
# Series.ravel() call is not needed.
svm = SVR()
svm.fit(x_train, y_train)

#### MLP Regressor

In [None]:
from sklearn.neural_network import MLPRegressor
# Default multi-layer perceptron regressor. Weight initialisation and batch
# shuffling are random, so it is seeded to make the reported metrics reproducible.
mlp = MLPRegressor(random_state=1)
mlp.fit(x_train,y_train)

#### CatBoost Regressor

In [None]:
from catboost import CatBoostRegressor
# Default CatBoost regressor; verbose=0 suppresses the per-iteration training log.
cat_reg = CatBoostRegressor(verbose=0)
cat_reg.fit(x_train,y_train)

#### Light GBM Regressor

In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor
# LightGBM regressor with library-default hyperparameters.
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)

#### XGBoost Regressor

In [None]:
from xgboost import XGBRegressor
# XGBoost regressor with library-default hyperparameters.
xgb = XGBRegressor()
xgb.fit(x_train, y_train)

### 2.3 Evaluate and Improve the Model(s)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Train/test MSE, MAE and R^2 for every baseline regressor.
model_list = [lr, rf_reg, svm, mlp, cat_reg, lgbm,xgb]
model_name = ["Linear Regression", "Random Forest", "SVM", "MLP", "CatBoost", "LightGBM", "XGBoost"]
metric_table = (
    ('mean squared error', mean_squared_error),
    ('mean absolute error', mean_absolute_error),
    ('r-square', r2_score),
)
for title, model in zip(model_name, model_list):
    # Predict once per split and reuse the result for all three metrics.
    split_data = (
        ('training', y_train, model.predict(x_train)),
        ('testing', y_test, model.predict(x_test)),
    )
    for metric_label, metric in metric_table:
        for split_name, y_true, y_pred in split_data:
            # sklearn metrics take (y_true, y_pred). MSE/MAE are symmetric, but
            # R^2 is not: passing the predictions first reports a wrong score.
            print(title, f' {split_name} {metric_label} is: ', metric(y_true, y_pred))
    print()

#### Cleaning Airbnb Dataset Again

In [None]:
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
# Reload the raw listings and discard identifier / free-text / review-date
# columns that are not used to predict price.
airbnb = pd.read_csv('listings.csv')
unused_columns = ['id', 'name', 'host_id', 'host_name', 'last_review', 'reviews_per_month']
df_airbnb = airbnb.drop(columns=unused_columns)
# Cap outliers from target value (price)
def cap_outliers(df, column, lower_q=0.25, upper_q=0.75):
    """Clamp ``df[column]`` to a quantile range, in place.

    Values below the ``lower_q`` quantile are raised to it and values above
    the ``upper_q`` quantile are lowered to it. Both bounds are computed from
    the column before anything is changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; it is modified in place.
    column : str
        Name of the numeric column to clamp.
    lower_q, upper_q : float, default 0.25 and 0.75
        Quantiles used as the lower and upper bound. The defaults reproduce
        the original hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so the call can be assigned.
    """
    lower_bound = df[column].quantile(lower_q)
    upper_bound = df[column].quantile(upper_q)
    # One vectorised clip replaces two row-by-row apply(lambda ...) passes.
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
    return df
# Alternative kept for reference: cap_outliers(df_airbnb, 'price') would clamp
# extreme prices instead of dropping the rows.
# Keep only listings whose price lies within 0.8 standard deviations of the mean.
PRICE_Z_LIMIT = 0.8
price_z = np.abs(stats.zscore(df_airbnb["price"]))
price_inlier = np.asarray(price_z < PRICE_Z_LIMIT)
# Filter with the boolean mask itself. The previous code passed positional
# indices from np.where() to drop(index=...), which expects index labels and
# is only correct while the frame still has its default 0..n-1 index.
df_airbnb = df_airbnb.loc[price_inlier].copy()



# Replace unusually high latitudes (z-score above 3) with the median latitude.
# The z-scores are computed once, vectorised, instead of recomputing the mean
# and standard deviation for every row.
# NOTE(review): only the upper tail is tested (no abs()), as in the original loop.
latitude = df_airbnb['latitude']
latitude_z = (latitude - np.mean(latitude)) / np.std(latitude)
is_lat_outlier = latitude_z > 3
outliers = latitude[is_lat_outlier].tolist()

# Overwrite the latitude column only. DataFrame.replace(outliers, ...) would
# substitute any matching value in every column of the frame.
df_cleaned = df_airbnb.copy()
df_cleaned.loc[is_lat_outlier, 'latitude'] = np.median(latitude)

# Hosts at air-bnb provide a maximum of one year stay (365 days) in the form of rent to the visitors
# The mask is built from df_cleaned itself; .copy() makes the result an
# independent frame so later column assignments do not warn.
df_cleaned = df_cleaned[df_cleaned['minimum_nights'] <= 365].copy()

# log1p-transform the price target and the skewed count features. Predicted
# prices must be mapped back with np.expm1().
log_columns = [
    'price',
    'minimum_nights',
    'availability_365',
    'calculated_host_listings_count',
    'number_of_reviews',
]
for column in log_columns:
    df_cleaned[column] = np.log1p(df_cleaned[column])


# Encode room_type with fixed integer codes.
room_dict = dict(zip(('Entire home/apt', 'Private room', 'Shared room'), (1, 2, 3)))
df_cleaned['room_type'] = df_cleaned['room_type'].map(room_dict)

# Label-encode the two neighbourhood columns, each with its own fresh encoder.
for column in ('neighbourhood_group', 'neighbourhood'):
    label = LabelEncoder()
    df_cleaned[column] = label.fit_transform(df_cleaned[column])

# Standardise every feature column; the log-transformed price target is left
# unscaled.
df_airbnb = df_cleaned
df_target = df_airbnb['price'].reset_index(drop=True)
df_scale = df_airbnb.drop(columns=['price'])

scaler = StandardScaler()
airbnb_scaled = scaler.fit_transform(df_scale)

# Rebuilding the frame from the scaled array gives it a fresh 0..n-1 index,
# which lines up with the re-indexed target when price is attached.
airbnb_sScaled = pd.DataFrame(airbnb_scaled, columns=df_scale.columns)
df_airbnb = airbnb_sScaled
df_airbnb['price'] = df_target
df_airbnb.info()

# 70/30 train/test split of the cleaned, scaled data.
x_train, x_test, y_train, y_test = train_test_split(
    df_airbnb.drop(columns='price'), df_airbnb['price'],
    test_size=0.3, random_state=1)

In [None]:
# Refit every baseline regressor on the cleaned, scaled Airbnb split.
# y_train is a 1-D Series and is passed as-is: Series.ravel() is deprecated in
# recent pandas and was never needed. Stochastic models are seeded so the
# reported metrics are reproducible; CatBoost's iteration log is silenced.
lr = LinearRegression()
lr.fit(x_train, y_train)
rf_reg = RandomForestRegressor(random_state=1)
rf_reg.fit(x_train, y_train)
svm = SVR()
svm.fit(x_train, y_train)
mlp = MLPRegressor(random_state=1)
mlp.fit(x_train,y_train)
cat_reg = CatBoostRegressor(verbose=0)
cat_reg.fit(x_train,y_train)
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
xgb = XGBRegressor()
xgb.fit(x_train, y_train)

In [None]:
# Hold-out metrics, then 5-fold cross-validated R^2, for every regressor refit
# on the cleaned data.
model_list = [lr, rf_reg, svm, mlp, cat_reg, lgbm,xgb]
# Declared here as well so the cell does not depend on a list left behind by an
# earlier cell.
model_name = ["Linear Regression", "Random Forest", "SVM", "MLP", "CatBoost", "LightGBM", "XGBoost"]
metric_table = (
    ('mean squared error', mean_squared_error),
    ('mean absolute error', mean_absolute_error),
    ('r-square', r2_score),
)
for title, model in zip(model_name, model_list):
    # Predict once per split and reuse the result for all three metrics.
    split_data = (
        ('training', y_train, model.predict(x_train)),
        ('testing', y_test, model.predict(x_test)),
    )
    for metric_label, metric in metric_table:
        for split_name, y_true, y_pred in split_data:
            # sklearn metrics take (y_true, y_pred); R^2 is not symmetric, so
            # the argument order matters.
            print(title, f' {split_name} {metric_label} is: ', metric(y_true, y_pred))
    print()

print()
cv_features = df_airbnb.drop('price', axis=1)
cv_target = df_airbnb['price']
for title, model in zip(model_name, model_list):
    # One code path for every model: the target is already 1-D, so the former
    # special case that called .ravel() for the forest and the SVR is gone.
    results = cross_validate(model, cv_features, cv_target, scoring='r2', cv=5, return_train_score=True)
    # The scorer is R^2, so the lines are labelled r2 (they used to say "mse").
    print(title, ' cross val training r2:', sum(results['train_score'])/len(results['train_score']))
    print(title, ' cross val testing r2:', sum(results['test_score'])/len(results['test_score']))
    print()

#### Random Forest Regressor Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search (R^2) for the random forest regressor on the
# full cleaned frame.
rf_reg = RandomForestRegressor(random_state=1)

param_grid = dict(
    n_estimators=[40, 50, 60],
    max_depth=[10, 12],
    min_samples_split=[2, 5],
    min_samples_leaf=[3, 5, 8],
)

airbnb_features = df_airbnb.drop(columns='price')
airbnb_target = df_airbnb['price']
gs = GridSearchCV(estimator=rf_reg, param_grid=param_grid, cv=5, scoring='r2')
gs = gs.fit(airbnb_features, airbnb_target)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# Random forest regressor with the tuned hyperparameters. random_state=1
# matches the estimator used in the grid search, so the final model reproduces
# the configuration that was actually evaluated. y_train is already 1-D, so the
# deprecated Series.ravel() call is dropped.
rf_reg = RandomForestRegressor(max_depth=12,min_samples_leaf=5,min_samples_split=2,n_estimators=60, random_state=1)
rf_reg.fit(x_train, y_train)

#### CatBoost Regressor Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search (R^2) for the CatBoost regressor on the full
# cleaned frame. verbose=0 keeps the 72 combinations x 5 folds from each
# printing a log line per boosting iteration.
cat_reg = CatBoostRegressor(random_state=1, verbose=0)

param_grid = {
    'learning_rate': [0.1, 0.01],
    'depth': [3, 5, 7],
    'l2_leaf_reg': [3, 5, 7],
    'iterations': [300, 400],
    'boosting_type': ['Ordered', 'Plain']
}

gs = GridSearchCV(estimator=cat_reg, param_grid=param_grid, cv=5, scoring='r2')

gs = gs.fit(df_airbnb.drop('price', axis= 1), df_airbnb['price'])

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# CatBoost regressor with the tuned hyperparameters. random_state=1 matches the
# estimator used in the grid search; verbose=0 suppresses the iteration log.
cat_reg = CatBoostRegressor(boosting_type='Ordered',depth=7,iterations=400,l2_leaf_reg=5,learning_rate=0.1, random_state=1, verbose=0)
cat_reg.fit(x_train,y_train)

#### XGBoost Regressor Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold grid search (R^2) for the XGBoost regressor on the full
# cleaned frame (3^6 = 729 parameter combinations).
xgb = XGBRegressor(random_state=1)

param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
    gamma=[0, 0.1, 0.2],
    min_child_weight=[1, 5, 10],
    subsample=[0.5, 0.7, 1],
)

airbnb_features = df_airbnb.drop(columns='price')
airbnb_target = df_airbnb['price']
gs = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='r2')
gs = gs.fit(airbnb_features, airbnb_target)

print(gs.best_score_)
print(gs.best_params_)

In [None]:
# XGBoost regressor with the tuned hyperparameters. With subsample=0.7 the row
# sampling depends on the seed, so random_state=1 (the value used in the grid
# search) is kept to reproduce the configuration that was evaluated.
xgb = XGBRegressor(n_estimators= 100,max_depth= 7,learning_rate= 0.1, gamma= 0.2,min_child_weight=5 ,subsample=0.7, random_state=1)
xgb.fit(x_train, y_train)

In [None]:
# Train/test MSE, MAE and R^2 for the three tuned regressors.
model_list = [rf_reg, cat_reg, xgb]
model_name = ["Random Forest", "CatBoost", "XGBoost"]
metric_table = (
    ('mean squared error', mean_squared_error),
    ('mean absolute error', mean_absolute_error),
    ('r-square', r2_score),
)
for title, model in zip(model_name, model_list):
    # Predict once per split and reuse the result for all three metrics.
    split_data = (
        ('training', y_train, model.predict(x_train)),
        ('testing', y_test, model.predict(x_test)),
    )
    for metric_label, metric in metric_table:
        for split_name, y_true, y_pred in split_data:
            # sklearn metrics take (y_true, y_pred); R^2 is not symmetric, so
            # passing the predictions first reports a wrong score.
            print(title, f' {split_name} {metric_label} is: ', metric(y_true, y_pred))
    print()

## Best Airbnb Price Model - CatBoost Regressor

In [None]:
# Actual vs predicted prices on the test split. np.expm1 undoes the log1p
# transform that was applied to price during cleaning.
# NOTE(review): the section heading names the CatBoost regressor as the best
# model, but the predictions here come from xgb - confirm which model is
# intended and align the heading or the code.
predicted_prices = pd.DataFrame({'Actual': np.expm1(y_test), 'Prediction': np.expm1(xgb.predict(x_test))})
predicted_prices.head(10)