In [None]:
#Questions to be answered
# Question 1: Neighbourhood with most Airbnb houses with WIFI and laptop friendly workplaces
# Question 2: Neighbourhood with most Airbnb houses with cleansed neighborhoods
# Question 3: Neighbourhood with the largest number of highest review scores
# Question 4: Most important features that affect the prices

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [2]:
file_path = 'listings.csv'
listings_data = pd.read_csv(file_path)

In [7]:
# Function to check if both WIFI and laptop friendly workplaces are present in the amenities
def has_required_amenities(amenities):
    required_amenities = {"Wireless Internet", "Laptop Friendly Workspace"}
    amenities_set = set(amenity.strip('"') for amenity in amenities.strip('{}').split(','))
    return required_amenities.issubset(amenities_set)

In [8]:
# Apply the function to filter listings with required amenities
filtered_df = listings_data[listings_data['amenities'].apply(has_required_amenities)]
filtered_df

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,...,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,12147973,https://www.airbnb.com/rooms/12147973,20160906204935,2016-09-07,Sunny Bungalow in the City,"Cozy, sunny, family home. Master bedroom high...",The house has an open and cozy feel at the sam...,"Cozy, sunny, family home. Master bedroom high...",none,"Roslindale is quiet, convenient and friendly. ...",...,,f,,,f,moderate,f,f,1,
3,1436513,https://www.airbnb.com/rooms/1436513,20160906204935,2016-09-07,Spacious Sunny Bedroom Suite in Historic Home,Come experience the comforts of home away from...,Most places you find in Boston are small howev...,Come experience the comforts of home away from...,none,Roslindale is a lovely little neighborhood loc...,...,10.0,f,,,f,moderate,f,f,1,1.00
7,2843445,https://www.airbnb.com/rooms/2843445,20160906204935,2016-09-07,"""Tranquility"" on ""Top of the Hill""","We can accommodate guests who are gluten-free,...",We provide a bedroom and full shared bath. Ra...,"We can accommodate guests who are gluten-free,...",none,Our neighborhood is residential with friendly ...,...,10.0,f,,,f,moderate,t,t,2,2.38
9,849408,https://www.airbnb.com/rooms/849408,20160906204935,2016-09-07,Perfect & Practical Boston Rental,This is a cozy and spacious two bedroom unit w...,Perfect apartment rental for those in town vis...,This is a cozy and spacious two bedroom unit w...,none,"This neighborhood truly has it all. Good, not...",...,9.0,f,,,f,strict,f,f,2,1.01
12,2684840,https://www.airbnb.com/rooms/2684840,20160906204935,2016-09-07,"Updated, spacious living in Rozzie","Clean, sunny 2 bedroom in amazing Roslindale V...",,"Clean, sunny 2 bedroom in amazing Roslindale V...",none,,...,10.0,f,,,t,flexible,f,f,2,0.64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3575,5280827,https://www.airbnb.com/rooms/5280827,20160906204935,2016-09-07,Private cozy room,"Cozy room near T station. 5 min walking to T, ...",,"Cozy room near T station. 5 min walking to T, ...",none,,...,7.0,f,,,t,strict,f,f,8,0.33
3578,14536322,https://www.airbnb.com/rooms/14536322,20160906204935,2016-09-07,Victorian Garden Level Room - Omega,Exquisite garden level (semi basement) room in...,"Very safe, bustling with shops, restaurants, H...",Exquisite garden level (semi basement) room in...,none,"Safe, in the midst of all Harvard Square actio...",...,10.0,f,,,t,strict,f,f,2,2.00
3579,14852179,https://www.airbnb.com/rooms/14852179,20160906204935,2016-09-07,Spacious Queen Bed Room Close to Boston Univer...,- Grocery: A full-size Star market is 2 minute...,,- Grocery: A full-size Star market is 2 minute...,none,,...,,f,,,f,flexible,f,f,1,
3580,8373729,https://www.airbnb.com/rooms/8373729,20160906204935,2016-09-07,Big cozy room near T,5 min walking to Orange Line subway with 2 sto...,,5 min walking to Orange Line subway with 2 sto...,none,,...,9.0,f,,,t,strict,f,f,8,0.34


In [9]:
# Question 1: Neighbourhood with most Airbnb houses with WIFI and laptop friendly workplaces
neighbourhood_with_most_wifi_and_laptop_friendly = filtered_df['neighbourhood_cleansed'].value_counts().idxmax()
neighbourhood_with_most_wifi_and_laptop_friendly

'Jamaica Plain'

In [10]:
# Question 2: Neighbourhood with most Airbnb houses with cleansed neighborhoods
neighbourhood_with_most_cleansed = listings_data['neighbourhood_cleansed'].value_counts().idxmax()
neighbourhood_with_most_cleansed

'Jamaica Plain'

In [11]:
# Question 3: Neighbourhood with the largest number of highest review scores
listings_data['review_scores_rating'] = listings_data['review_scores_rating'].fillna(0)
neighbourhood_with_highest_reviews = listings_data.groupby('neighbourhood_cleansed')['review_scores_rating'].max().idxmax()
neighbourhood_with_highest_reviews

'Allston'

In [12]:
# Feature Analysis for Price
# Selectt relevant features for the analysis
features = ['neighbourhood_cleansed', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
            'beds', 'amenities', 'number_of_reviews', 'review_scores_rating', 'instant_bookable']
target = 'price'

In [13]:
# Convert price to a numeric value
listings_data['price'] = listings_data['price'].replace('[\$,]', '', regex=True).astype(float)

# Handling missing values
listings_data[features] = listings_data[features].fillna(listings_data[features].mean(numeric_only=True))

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(listings_data[features], listings_data[target], test_size=0.2, random_state=42)

# Creating a preprocessing pipeline
numeric_features = listings_data[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = listings_data[features].select_dtypes(include=['object']).columns

In [14]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [15]:
# Creating the regression model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])


In [16]:
# Training the model
model.fit(X_train, y_train)

In [17]:

# Predicting and evaluating the model
y_pred = model.predict(X_test)
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Coefficient of Determination:', r2_score(y_test, y_pred))


Mean Squared Error: 27768.071668395503
Coefficient of Determination: 0.20299065201219402


In [19]:
# Analyzing feature importance
if hasattr(model.named_steps['regressor'], 'coef_'):
    coefficients = model.named_steps['regressor'].coef_
    feature_names = numeric_features.tolist() + \
                    list(model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features))
    feature_importance = pd.Series(coefficients, index=feature_names)
    print("\nFeature importances:\n", feature_importance.sort_values(ascending=False))



Feature importances:
 amenities_{TV,"Cable TV","Wireless Internet","Air Conditioning",Kitchen,"Pets Allowed",Doorman,Gym,"Pets live on this property",Dog(s),Cat(s),"Elevator in Building",Heating,"Family/Kid Friendly","Suitable for Events",Washer,Dryer,"Smoke Detector","First Aid Kit","Fire Extinguisher",Essentials,Shampoo,"24-Hour Check-in","Hair Dryer",Iron,"Laptop Friendly Workspace"}    2574.276540
amenities_{Internet,"Air Conditioning",Kitchen,Doorman,Gym,"Elevator in Building",Heating,Dryer,"Smoke Detector"}                                                                                                                                                                                                                                                                  1294.706880
amenities_{TV,Internet,"Wireless Internet","Air Conditioning",Kitchen,"Pets Allowed","Family/Kid Friendly",Washer,Dryer}                                                                                           