In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from lazypredict.Supervised import LazyClassifier
# HalvingGridSearchCV is experimental: this enabling import must run before
# HalvingGridSearchCV is imported from sklearn.model_selection.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.feature_selection import SelectPercentile, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier

from projectFunctions import commentSentimentAnalysis, featureSelection, descriptionSentimentAnalysis, amenitiesCategorization

# **<span style="color:#3c1518">Import listings.csv file with translated descriptions</span>**

In [None]:
# Load the listings table from the CSV file in the working directory.
dataset = pd.read_csv(filepath_or_buffer='listings.csv')

# **<span style="color:#3c1518">Drop useless Features</span>**

In [None]:
# Columns removed up-front, before any NaN filtering or feature selection.
uselessColumns = [
    'Unnamed: 0',
    'listing_url',
    'scrape_id',
    'last_scraped',
    'source',
    'name',
    'neighborhood_overview',
    'picture_url',
    'host_id',
    'host_url',
    'host_name',
    'host_thumbnail_url',
    'host_verifications',
    'neighbourhood',
    'neighbourhood_group_cleansed',
    'latitude',
    'longitude',
    'minimum_nights',
    'maximum_nights',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'calendar_updated',
    'calendar_last_scraped',
    'Unnamed: 77',
    'host_picture_url',
    'host_location',
    'host_since',
    'calculated_host_listings_count_shared_rooms',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_entire_homes',
]

# `dataset` itself is left untouched; X is the working copy without those columns.
X = dataset.drop(columns=uselessColumns)

# Dropped calculated_host_listings_count_shared_rooms and calculated_host_listings_count_private_rooms because almost all of their records are equal to zero
# Dropped calculated_host_listings_count_entire_homes as it gives the same information as the feature calculated_host_listings_count

# **<span style="color:#3c1518">Drop Features that have at least k% NaN Values. We have selected k = 25%</span>**

In [None]:
# Drop every column whose share of missing values reaches the threshold.
# The share is measured against ALL rows of X (len(X)), not against the
# number of rows that happen to have a non-null description.
totalNumberOfRows = len(X)
thresholdToDropColumn = 25 # Insert k here

# One NaN percentage per column, computed in a single vectorised pass.
nanPercentagePerColumn = X.isna().sum() / totalNumberOfRows * 100

# Collect the offending columns first and drop them together, so the result
# does not depend on the order in which columns are visited.
sparseColumns = nanPercentagePerColumn[nanPercentagePerColumn >= thresholdToDropColumn].index
X = X.drop(columns=sparseColumns)

# **<span style="color:#3c1518">Execute some Feature Selection Algorithms</span>**

In [None]:
# Run the project's feature-selection helper on the current frame.
# NOTE(review): 10 is presumably the number of features to keep per algorithm — confirm in projectFunctions.
selectedFeatures = featureSelection(X, 10)

# Show the helper's result as a table.
pd.DataFrame(data=selectedFeatures)

# **<span style="color:#3c1518">Review Comments Sentiment Analysis</span>**

In [None]:
# Run the project's review-comment sentiment helper (takes no arguments).
# NOTE(review): the result is later merged on `listing_id`, so it presumably holds
# one sentiment rating per listing — confirm in projectFunctions.
commentsRatingPerId = commentSentimentAnalysis()

## <span style="color:#3c1518">Merge listings with Comments Rate per Id </span>

In [None]:
# Left join: every listing is kept, whether or not it has a matching comment rating.
X = X.merge(
    right=commentsRatingPerId,
    how='left',
    left_on='id',
    right_on='listing_id',
)

# **<span style="color:#3c1518">Descriptions Sentiment Analysis</span>**

## <span style="color:#3c1518">Drop records that do not include descriptions (69 records)</span>

In [None]:
# Keep only the listings that have a description, then renumber the rows 0..n-1
# (the old index is discarded).
hasDescription = X['descriptions'].notna()
X = X[hasDescription].reset_index(drop=True)

## <span style="color:#3c1518">Sentiment Analysis</span>

In [None]:
# Replace the description column with the output of the project's sentiment helper.
descriptionSentiment = descriptionSentimentAnalysis(X['descriptions'])
X = X.assign(descriptions=descriptionSentiment)

# **<span style="color:#3c1518">One-hot Encoding of Amenities</span>**

In [None]:
# Amenity indicator columns produced by the project helper, cast to integers.
amenities = amenitiesCategorization(X).astype(int)

# Remove the raw 'amenities' column and any columns sharing a name with the
# helper's output, then attach the integer-typed amenity columns.
# NOTE(review): the second drop only succeeds if those columns already exist on X,
# so the helper presumably also writes them onto X — confirm in projectFunctions.
X = (
    X.drop(columns='amenities')
     .drop(columns=list(amenities.columns))
     .join(amenities)
)

# **<span style="color:#3c1518">One-hot Encoding of Room Type</span>**

In [None]:
# Remember the distinct room types (in order of first appearance); they are the
# names of the dummy columns selected later on.
roomTypes = list(X['room_type'].unique())

# Swap the categorical column for one indicator column per room type.
roomTypeDummies = pd.get_dummies(X['room_type'])
X = X.join(roomTypeDummies).drop(columns='room_type')

# **<span style="color:#3c1518">Feature Selection</span>**

In [None]:
# Modelling frame: RFE-selected features + amenity indicators + room-type indicators.
# 'price' is carried along only so that rows with any missing value are removed
# from the features and the target together.
featureColumns = selectedFeatures['RFE'] + list(amenities.columns) + roomTypes
Xfinal = X[featureColumns + ['price']].dropna()

# Target vector.
y = Xfinal.price

# DataFrame.drop returns a new frame, so the result has to be assigned back;
# otherwise 'price' stays among the features and leaks the target into the model.
Xfinal = Xfinal.drop(columns='price')
Xfinal

# **<span style="color:#3c1518">Standardization</span>**

In [None]:
# Standardise every feature column; Xfinal becomes the scaler's array output.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below.
scaler = StandardScaler()
scaler.fit(Xfinal)
Xfinal = scaler.transform(Xfinal)

# **<span style="color:#3c1518">Prices Binning</span>**

In [None]:
# Turn the continuous price into 30 ordinal classes with (approximately) equal
# numbers of listings per bin.
est = KBinsDiscretizer(n_bins=30, encode='ordinal', strategy='quantile')

# The discretizer needs a 2-D column as input. Its output is flattened back to
# 1-D because scikit-learn classifiers expect a 1-D target and emit a
# DataConversionWarning for an (n, 1) column vector.
yFinal = est.fit_transform(np.asarray(y).reshape(-1, 1)).ravel()

# Bin boundaries learned from the prices.
edges = est.bin_edges_

In [None]:
# Histogram of the binned price classes.
fig = px.histogram(data_frame=yFinal)
fig.show()

In [None]:
# Display the bin boundaries taken from the fitted discretizer (est.bin_edges_).
edges

# **<span style="color:#3c1518">Lazy Predict</span>**

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    Xfinal,
    yFinal,
    test_size=0.3,
    random_state=123,
)

In [None]:
# Fit LazyPredict's classifiers on the training split and score them on the test split.
clf = LazyClassifier(
    custom_metric=None,
    ignore_warnings=True,
    verbose=0,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

# Show the per-model results returned by the fit.
models

# **<span style="color:#3c1518">Hyperparameter Tuning</span>**

In [None]:
# Scaler + decision tree pipeline tuned with successive halving.
# The tree and the search are both seeded (same seed as the train/test split),
# so the selected hyper-parameters are reproducible across runs.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ('dtc', DecisionTreeClassifier(random_state=123))])

# Grid over the tree's split/leaf size limits and its split strategy.
param_grid = {
    "dtc__min_samples_split": list(np.arange(2, 10, 1)),
    "dtc__min_samples_leaf": list(np.arange(1, 10, 1)),
    "dtc__splitter": ['best', 'random'],
}

# n_jobs=-1 uses all available cores for the candidate fits.
search = HalvingGridSearchCV(pipe, param_grid, n_jobs=-1, random_state=123)
search.fit(X_train, y_train)

# **<span style="color:#3c1518">Evaluation of the Selected Model</span>**

In [None]:
# Rebuild the scaler + tree pipeline around the tree selected by the search.
bestTree = search.best_estimator_['dtc']
pipe = Pipeline(steps=[("scaler", StandardScaler()), ('dtc', bestTree)])

# Time a single fit on the training split (wall-clock seconds).
startTime = time.time()
pipe.fit(X_train, y_train)
stopTime = time.time()
trainingTime = stopTime - startTime

# Score of the fitted pipeline on the held-out split (shown as the cell output).
pipe.score(X_test, y_test)

# **<span style="color:#3c1518">Final Metrics</span>**

## <span style="color:#3c1518">Accuracy</span>

In [None]:
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged; the order is fixed to match the API.
accuracy_score(y_test, pipe.predict(X_test))

## <span style="color:#3c1518">Balanced Accuracy</span>

In [None]:
# Balanced accuracy is the mean per-class recall, so the true labels must come
# first: it is NOT symmetric and swapping the arguments yields a different number.
balanced_accuracy_score(y_test, pipe.predict(X_test))

## <span style="color:#3c1518">F1 Score</span>

In [None]:
# scikit-learn metrics take (y_true, y_pred). Micro-averaged F1 gives the same
# value either way; the order is fixed to match the API.
f1_score(y_test, pipe.predict(X_test), average='micro')

## <span style="color:#3c1518">Confusion Matrix</span>

In [None]:
# Predictions of the tuned pipeline on the held-out split.
predictions = pipe.predict(X_test)

# Class labels come from the fitted pipeline (it exposes the classes_ of its
# final classifier). `clf` is the LazyClassifier, which has no `classes_`.
classLabels = pipe.classes_

cm = confusion_matrix(y_test, predictions, labels=classLabels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classLabels)
disp.plot()

# Enlarge the figure so all class labels stay readable.
plt.gcf().set_size_inches(15,10)
plt.show()