#### data wrangling

In [1]:
# import packages
import re

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Read the Los Angeles Airbnb listings CSV from the hosted copy of the data file
LISTINGS_CSV_URL = "https://media.githubusercontent.com/media/boneeyah/DS7331_Group/main/Data_Files/airbnb_los_angeles.csv"
df = pd.read_csv(LISTINGS_CSV_URL)
# Alternative: read the same file from a relative local path
#df = pd.read_csv("Data_Files/airbnb_los_angeles.csv")

In [2]:
# Columns that won't be useful for the classification model.
# Names that are absent from the frame are ignored rather than raising.
# NOTE(review): 'bathroom_text' does not match the 'bathrooms_text' column that the
# later cells read, so that entry looks like a no-op — confirm it is not a typo.
unused_columns = [
    'listing_url', 'scrape_id', 'last_scraped', 'description', 'neighborhood_overview',
    'picture_url', 'host_url', 'host_about', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_thumbnail_url', 'host_picture_url', 'host_verifications',
    'host_has_profile_pic', 'bathroom_text', 'host_listings_count', 'host_neighbourhood',
    'bathrooms', 'minimum_minimum_nights', 'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm',
    'calendar_updated', 'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'calendar_last_scraped', 'number_of_reviews_ltm',
    'number_of_reviews_l30d', 'review_scores_accuracy', 'review_scores_communication',
    'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_value',
    'review_scores_location', 'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms', 'reviews_per_month', 'neighbourhood',
    'neighbourhood_group_cleansed', 'first_review', 'last_review', 'minimum_nights',
    'maximum_nights', 'license', 'name', 'host_name',
]
df = df.drop(columns=unused_columns, errors='ignore')

In [3]:
# remove nulls: keep only rows where every column needed by the later steps is present
required_columns = ['review_scores_rating', 'bathrooms_text', 'host_since', 'host_location']
# .copy() makes the filtered result an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning
df = df[df[required_columns].notna().all(axis=1)].copy()

## getting property type from string
# Order matters: regex alternation takes the first alternative that matches at a given
# position, so 'Room in hotel' has to be listed before the shorter 'Room'.
types = ['Private room', 'Entire', 'Room in hotel','Room','Shared room']
pat = '|'.join(r"\b{}\b".format(x) for x in types)

# Keep only the matched keyword, label anything unmatched as 'other', then expand the
# two abbreviated keywords into their final category names.
df['property_type'] = (
    df['property_type']
    .str.extract('(' + pat + ')', expand=False)
    .fillna(value='other')
    .replace(['Entire', 'Room in hotel'], ['Entire unit', 'Hotel room'])
)

In [4]:
# impute values based on median
# Step 1: fill missing 'beds' with the median 'beds' of the rows sharing the same 'accommodates' value
df['beds'] = df[['accommodates','beds']].groupby(by = 'accommodates').transform(lambda grp: grp.fillna(grp.median()))
# Step 2: group on property type and the (now filled) bed count ...
df_grouped = df.groupby(by = ['property_type','beds'])
# ... and fill missing values with the per-group median; only the 'bedrooms' column of this result is used below
df_imputed = df_grouped[['beds','bedrooms']].transform(lambda grp: grp.fillna(grp.median()))

# rows whose 'bedrooms' is still null after the group-median fill are removed from df
# (presumably groups in which no row had a known bedroom count — verify)
index = df_imputed[df_imputed.bedrooms.isnull()].index
df = df.drop(index= index)

# assignment aligns on the index, so only the rows that remain in df receive a value
df['imputed']=df_imputed[['bedrooms']]

# replace 'bedrooms' column with imputed column and deleting the duplicated column
df['bedrooms'] = df['imputed']
del df['imputed']

In [5]:
## now fixing dtypes for attributes
df['host_since'] = pd.to_datetime(df.host_since)

# Strip dollar signs and commas before converting price to float.
# The pattern is a raw string so that "\$" reaches the regex engine unchanged instead of
# being parsed as an (invalid) Python string escape, which newer interpreters warn about.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Rewrite the half-bath labels as "<number> <type>" so they split like every other value
df['bathrooms_text'] = df['bathrooms_text'].replace(
    ['Half-bath', 'Shared half-bath', 'Private half-bath'],
    ['0.5 bath', '0.5 shared bath', '0.5 private bath'],
)

# First whitespace-separated token -> bathroom_number, remainder of the text -> bathroom_type
df_bathrooms = (
    df['bathrooms_text']
    .str.split(n=1, expand=True)
    .rename(columns={0: 'bathroom_number', 1: 'bathroom_type'})
)
# values with nothing after the number get the default type 'bath'
df_bathrooms['bathroom_type'] = df_bathrooms.bathroom_type.fillna(value='bath')
# fold the plural spellings into their singular form
df_bathrooms['bathroom_type'] = df_bathrooms['bathroom_type'].replace(['baths', 'shared baths'], ['bath', 'shared bath'])
df_bathrooms['bathroom_number'] = df_bathrooms['bathroom_number'].astype('float')

# 15 and 16 are hard-coded column positions; they depend on the column order produced
# by the earlier cells
df.insert(15, 'bathroom_number', df_bathrooms['bathroom_number'])
df.insert(16, 'bathroom_type', df_bathrooms['bathroom_type'])
del df['bathrooms_text']

In [6]:
### filter out price outliers
# Keep rows with fewer than 10 beds and a price below 750.
# .copy() returns an independent frame so that the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
df = df[(df.beds < 10) & (df.price < 750)].copy()

In [7]:
# print the column names, non-null counts and dtypes of the cleaned frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30580 entries, 0 to 42000
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              30580 non-null  int64         
 1   host_id                         30580 non-null  int64         
 2   host_since                      30580 non-null  datetime64[ns]
 3   host_location                   30580 non-null  object        
 4   host_is_superhost               30580 non-null  object        
 5   host_total_listings_count       30580 non-null  float64       
 6   host_identity_verified          30580 non-null  object        
 7   neighbourhood_cleansed          30580 non-null  object        
 8   latitude                        30580 non-null  float64       
 9   longitude                       30580 non-null  float64       
 10  property_type                   30580 non-null  object        
 11  ro

In [8]:
# value counts of each of the four flag columns, returned as a list of Series
flag_columns = ['host_is_superhost', 'host_identity_verified', 'has_availability', 'instant_bookable']
[df[flag].value_counts() for flag in flag_columns]

[f    20236
 t    10344
 Name: host_is_superhost, dtype: int64,
 t    26080
 f     4500
 Name: host_identity_verified, dtype: int64,
 t    28785
 f     1795
 Name: has_availability, dtype: int64,
 f    19767
 t    10813
 Name: instant_bookable, dtype: int64]

# Create Models


### Rubric Note: Create a logistic regression model and a support vector machine model for the classification task involved with your dataset. Assess how well each model performs (use an 80/20 train/test split for your data). Adjust parameters of the models to make them more accurate. If your dataset size requires the use of stochastic gradient descent, then linear kernel only is fine to use. That is, the SGDClassifier is fine to use for optimizing logistic regression and linear support vector machines. For many problems, SGD will be required in order to train the SVM model in a reasonable timeframe.

In [9]:
# The classification target is superhost status.
# The response has two classes, so sklearn's LabelEncoder is used to turn it into integer codes.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
target_column = 'host_is_superhost'
df[target_column] = label_encoder.fit_transform(df[target_column])
# show the class balance of the encoded target
df[target_column].value_counts()

0    20236
1    10344
Name: host_is_superhost, dtype: int64

In [10]:
# Encode identity verified, has_availability and instant bookable as integer codes.
# The shared encoder is re-fitted on each column in turn, so after this cell it holds
# the classes of the last column (instant_bookable) only.
for flag_column in ['host_identity_verified', 'has_availability', 'instant_bookable']:
    df[flag_column] = label_encoder.fit_transform(df[flag_column])

# A cell renders only its final expression, so a single summary is kept here instead of
# per-column value_counts() calls whose results would be discarded.
df.instant_bookable.value_counts()

0    19767
1    10813
Name: instant_bookable, dtype: int64

In [11]:
## changing host_location to a binary feature of is_local using a list of LA area neighborhoods that will be used to extract from location str
#los_angeles = pd.read_csv('Data_Files/LosAngelesNeighborhoods.csv')
los_angeles = pd.read_csv('https://raw.githubusercontent.com/boneeyah/DS7331_Group/main/Data_Files/LosAngelesNeighborhoods.csv')
# the names are taken from the first column; blank cells are skipped so they cannot
# end up in the search pattern
los_angeles = los_angeles.iloc[:,0].dropna().tolist()

# Escape every name before joining: str.contains treats the pattern as a regular
# expression, so characters such as '.' or '(' inside a name would otherwise be
# interpreted as regex syntax instead of being matched literally.
pattern = '|'.join(re.escape(str(name)) for name in los_angeles)
df['host_is_local'] = df['host_location'].str.contains(pattern)

# convert the boolean match result into integer codes with the shared encoder
df['host_is_local'] = label_encoder.fit_transform(df['host_is_local'])

df = df.drop(columns= ['host_location']) #drop old host_location column

In [12]:
# print the column names, non-null counts and dtypes after the encoding steps
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30580 entries, 0 to 42000
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype         
---  ------                          --------------  -----         
 0   id                              30580 non-null  int64         
 1   host_id                         30580 non-null  int64         
 2   host_since                      30580 non-null  datetime64[ns]
 3   host_is_superhost               30580 non-null  int64         
 4   host_total_listings_count       30580 non-null  float64       
 5   host_identity_verified          30580 non-null  int64         
 6   neighbourhood_cleansed          30580 non-null  object        
 7   latitude                        30580 non-null  float64       
 8   longitude                       30580 non-null  float64       
 9   property_type                   30580 non-null  object        
 10  room_type                       30580 non-null  object        
 11  ac

In [13]:
# One-hot encoding of the other categorical variables
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# subset of df holding only the categorical columns that will be one-hot encoded
categorical_columns = ['neighbourhood_cleansed', 'property_type', 'room_type', 'bathroom_type', 'amenities']
df1 = df.loc[:, categorical_columns]


In [None]:
# One-hot encode every column of df1, dropping the first level of each feature.
# NOTE(review): sparse=False asks the encoder for a dense array; newer scikit-learn
# releases are understood to rename this argument to `sparse_output` — confirm against
# the installed version.
# NOTE(review): 'amenities' is encoded as a whole string, so each distinct amenity list
# becomes its own column — confirm this is intended.
encoded_columns = list(df1.columns)
transformer = make_column_transformer(
    (OneHotEncoder(drop='first', sparse=False), encoded_columns),
    remainder='passthrough',
)
transformed = transformer.fit_transform(df1)

# Reuse df1's index: a default RangeIndex would no longer line up with df's row labels,
# and a later join/concat with df would silently pair the wrong rows.
transformed_df = pd.DataFrame(
    transformed,
    columns=transformer.get_feature_names_out(),
    index=df1.index,
)
# preview the first rows instead of printing the raw array
transformed_df.head()

In [22]:
# list the output column names generated by the fitted column transformer
transformer.get_feature_names_out()

array(['onehotencoder__neighbourhood_cleansed_Adams-Normandie',
       'onehotencoder__neighbourhood_cleansed_Agoura Hills',
       'onehotencoder__neighbourhood_cleansed_Agua Dulce', ...,
       'onehotencoder__amenities_["Wifi", "Smoke alarm", "Long term stays allowed", "Kitchen"]',
       'onehotencoder__amenities_["Wifi", "Washer", "TV", "Kitchen"]',
       'onehotencoder__amenities_[]'], dtype=object)

In [None]:
# summary (non-null count and dtype) of the neighbourhood column
df['neighbourhood_cleansed'].info()

In [None]:
# summary (non-null count and dtype) of the property type column
df['property_type'].info()

# Model Advantages

### Rubric Note: Discuss the advantages of each model for each classification task. Does one type of model offer superior performance over another in terms of prediction accuracy? In terms of training time or efficiency? Explain in detail.

# Interpret Feature Importance

### Rubric Note: Use the weights from logistic regression to interpret the importance of different features for the classification task. Explain your interpretation in detail. Why do you think some variables are more important?

# Interpret Support Vectors

### Rubric Note: Look at the chosen support vectors for the classification task. Do these provide any insight into the data? Explain. If you used stochastic gradient descent (and therefore did not explicitly solve for support vectors), try subsampling your data to train the SVC model - then analyze the support vectors from the subsampled dataset.