In [1]:
#Questions to be answered
# Question 1: Neighbourhood with most Airbnb houses with WIFI and laptop friendly workplaces
# Question 2: Neighbourhood with the most Airbnb listings overall (using the cleansed neighbourhood names)
# Question 3: Neighbourhood with the largest number of highest review scores
# Question 4: Most important features that affect the prices

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np


In [None]:
# Relative path to the listings export; resolved against the working directory.
file_path = 'listings.csv'
# Load every listing into a DataFrame for the analysis below.
listings_data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Function to check if both WIFI and laptop friendly workplaces are present in the amenities
def has_required_amenities(amenities):
    """Return True when a listing offers both Wi-Fi and a laptop-friendly workspace.

    Parameters
    ----------
    amenities : str
        Raw ``amenities`` cell: a brace-wrapped, comma-separated list such as
        ``{TV,"Wireless Internet","Laptop Friendly Workspace"}``. Entries may
        be double-quoted and padded with whitespace.
        Any non-string value (for example the float NaN that pandas uses for
        an empty cell, or None) is treated as "no amenities listed".

    Returns
    -------
    bool
        True only if both "Wireless Internet" and "Laptop Friendly Workspace"
        appear among the parsed entries.
    """
    required_amenities = {"Wireless Internet", "Laptop Friendly Workspace"}
    # Missing cells arrive as float NaN / None and have no .strip(); they
    # cannot contain the required amenities.
    if not isinstance(amenities, str):
        return False
    # Drop the surrounding braces, split on commas, then remove whitespace and
    # the optional double quotes around each entry.
    amenities_set = {
        amenity.strip().strip('"').strip()
        for amenity in amenities.strip().strip('{}').split(',')
    }
    return required_amenities.issubset(amenities_set)

In [None]:
# Flag each listing whose amenities pass the check, then keep only the flagged rows
required_amenity_mask = listings_data['amenities'].apply(has_required_amenities)
filtered_df = listings_data[required_amenity_mask]

In [None]:
# Question 1: Neighbourhood with most Airbnb houses with WIFI and laptop friendly workplaces
# value_counts() orders neighbourhoods by number of matching listings; idxmax()
# returns the label with the highest count.
neighbourhood_with_most_wifi_and_laptop_friendly = filtered_df['neighbourhood_cleansed'].value_counts().idxmax()
# Show the answer: the assignment alone produces no cell output.
neighbourhood_with_most_wifi_and_laptop_friendly


In [None]:
# Question 2: Neighbourhood with the most Airbnb listings overall
# (counted on the cleansed neighbourhood names)
neighbourhood_with_most_cleansed = listings_data['neighbourhood_cleansed'].value_counts().idxmax()
# Show the answer: the assignment alone produces no cell output.
neighbourhood_with_most_cleansed

In [None]:
# Question 3: Neighbourhood with the largest number of highest review scores
# Listings without a rating get 0 so they can never count as top-rated.
# NOTE: this fill is written back to listings_data, so later cells see the 0s too.
listings_data['review_scores_rating'] = listings_data['review_scores_rating'].fillna(0)

# Taking the per-neighbourhood maximum and then idxmax() only returns the first
# neighbourhood that has at least one top-rated listing. To answer the question,
# count the listings that reach the highest score in each neighbourhood instead.
highest_review_score = listings_data['review_scores_rating'].max()
top_rated_listings = listings_data[listings_data['review_scores_rating'] == highest_review_score]
neighbourhood_with_highest_reviews = top_rated_listings['neighbourhood_cleansed'].value_counts().idxmax()
# Show the answer: the assignment alone produces no cell output.
neighbourhood_with_highest_reviews


In [None]:
# Feature analysis for price
# Candidate predictor columns, listed one per line and grouped by theme
features = [
    # location and room kind
    'neighbourhood_cleansed',
    'room_type',
    # capacity
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    # raw amenities string
    'amenities',
    # guest feedback
    'number_of_reviews',
    'review_scores_rating',
    # booking flow
    'instant_bookable',
]
# Column the regression is trained to predict
target = 'price'

In [None]:
# Convert price to a numeric value by stripping "$" signs and thousands separators.
# The pattern is a raw string because "\$" is not a valid escape sequence in a
# normal string literal.
listings_data['price'] = listings_data['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Rows without a price cannot serve as a regression target.
model_data = listings_data.dropna(subset=[target])

# Missing feature values are deliberately left as NaN here. The imputers inside
# the preprocessing pipeline learn their fill values from the training split
# only, so no statistics computed on the test rows leak into training.

# Splitting the dataset into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    model_data[features], model_data[target], test_size=0.2, random_state=42)

# Column groups consumed by the preprocessing pipeline
numeric_features = model_data[features].select_dtypes(include=['int64', 'float64']).columns
categorical_features = model_data[features].select_dtypes(include=['object']).columns

In [None]:
# Numeric columns: fill gaps with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: label gaps as 'missing', then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on unseen categories.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [None]:
# Full regression pipeline: preprocessing first, then a linear regression
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)


In [None]:
# Fit the preprocessing steps and the regressor together on the training split
model.fit(X=X_train, y=y_train)

In [None]:

# Score the fitted pipeline on the held-out split
y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', test_mse)

test_r2 = r2_score(y_test, y_pred)
print('Coefficient of Determination:', test_r2)


In [None]:
# Analyzing feature importance: pair every fitted coefficient with the name of
# the (transformed) column it belongs to.
regressor = model.named_steps['regressor']
if hasattr(regressor, 'coef_'):
    coefficients = regressor.coef_
    onehot = model.named_steps['preprocessor'].named_transformers_['cat'].named_steps['onehot']
    # OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2.
    # Use its replacement and fall back only on old installations.
    if hasattr(onehot, 'get_feature_names_out'):
        encoded_names = onehot.get_feature_names_out(list(categorical_features))
    else:
        encoded_names = onehot.get_feature_names(list(categorical_features))
    # The ColumnTransformer lists 'num' before 'cat', so the numeric names come
    # first, followed by the one-hot encoded names.
    feature_names = numeric_features.tolist() + list(encoded_names)
    feature_importance = pd.Series(coefficients, index=feature_names)
    print("\nFeature importances:\n", feature_importance.sort_values(ascending=False))
