In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [None]:
# Load the listings export and discard exact duplicate rows in one pass.
file_path = 'data/listings.csv'
df = pd.read_csv(file_path).drop_duplicates()
print("Duplicates removed. Remaining rows:", len(df))

# Per-column count of missing cells, reported before any rows are dropped.
missing_values = df.isna().sum()
print("Missing values before handling:\n", missing_values)

In [None]:
# Rows lacking a price or a bedroom count are removed.
required_columns = ['price', 'bedrooms']
df = df.dropna(subset=required_columns)

# Refresh the per-column missing counts now that those rows are gone.
missing_values = df.isna().sum()

In [None]:
def extract_bathrooms(bathrooms_text):
    """Parse a numeric bathroom count out of a free-text description.

    The count is read from the first whitespace-separated token, e.g.
    ``"1.5 shared baths"`` -> ``1.5``. Descriptions that have no leading
    number but mention a half bath (``"Half-bath"``, ``"Shared half-bath"``)
    are mapped to ``0.5``.

    Parameters
    ----------
    bathrooms_text : str
        Raw description. Values without a usable ``split`` (for example
        ``None`` or a float NaN from a missing cell) are treated as
        unparseable.

    Returns
    -------
    float
        The parsed count, or ``np.nan`` when nothing can be parsed.
    """
    try:
        return float(bathrooms_text.split()[0])
    except (AttributeError, IndexError, TypeError):
        # Not a string-like value, or an empty / whitespace-only string.
        return np.nan
    except ValueError:
        # First token is not a number; the only spelled-out quantity
        # recognised is a half bath.
        if isinstance(bathrooms_text, str) and 'half' in bathrooms_text.lower():
            return 0.5
        return np.nan

# Parse the text column once, then use the parsed value only where the
# numeric 'bathrooms' column is missing.
parsed_bathrooms = df['bathrooms_text'].apply(extract_bathrooms)
df['bathrooms'] = df['bathrooms'].fillna(parsed_bathrooms)

In [None]:
# Strip '$' and ',' from the price strings and convert to float.
# The pattern is a raw string: '\$' in a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)

# Parse the date columns into datetime values.
date_columns = ['host_since', 'last_scraped', 'first_review', 'last_review']
for col in date_columns:
    df[col] = pd.to_datetime(df[col])

# Flag columns: only the literal 't' maps to True; every other value,
# including a missing one, becomes False. The vectorized comparison replaces
# a per-row Python lambda with the same result.
boolean_columns = ['host_is_superhost', 'instant_bookable', 'has_availability']
for col in boolean_columns:
    df[col] = df[col].eq('t')

In [None]:
# Host tenure in days, measured up to the date the listing was scraped.
# Anchoring on 'last_scraped' rather than today's date keeps the feature
# identical on every re-run of the notebook.
df['host_experience_days'] = (df['last_scraped'] - df['host_since']).dt.days

# Row-wise mean of the four availability columns.
df['avg_availability'] = df[['availability_30', 'availability_60', 'availability_90', 'availability_365']].mean(axis=1)

# Lower-case the free-text columns and remove every character that is neither
# a word character nor whitespace. regex=True must be explicit: pandas >= 2.0
# treats the pattern as a literal string by default, which would leave the
# text unchanged.
text_columns = ['name', 'description', 'neighborhood_overview', 'host_about']
for col in text_columns:
    df[col] = df[col].str.lower().str.replace(r'[^\w\s]', '', regex=True)

In [None]:
# Keep only listings priced strictly below the 99th-percentile price.
price_threshold = df['price'].quantile(0.99)
below_threshold = df['price'] < price_threshold
df = df[below_threshold]

In [None]:
# Columns removed to build the trimmed frame, grouped by the kind of field.
columns_to_drop = [
    # scrape / listing metadata
    'listing_url',
    'scrape_id',
    'source',
    # host details
    'host_location',
    'host_about',
    'host_response_time',
    'host_response_rate',
    'host_acceptance_rate',
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',
    'host_listings_count',
    'host_total_listings_count',
    'host_verifications',
    'host_has_profile_pic',
    'host_identity_verified',
    # neighbourhood text and calendar fields
    'neighborhood_overview',
    'calendar_updated',
    'calendar_last_scraped',
    # review dates, licensing and booking flag
    'first_review',
    'last_review',
    'license',
    'instant_bookable',
    # per-host listing counts
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms',
    'calculated_host_listings_count_shared_rooms',
    # alternative neighbourhood columns
    'neighbourhood_group_cleansed',
    'neighbourhood',
]

# `df` itself is left untouched; the trimmed copy lives in `df_cleaned`.
df_cleaned = df.drop(columns=columns_to_drop)


In [None]:
# Display the first 50 rows of the trimmed frame.
df_cleaned.head(n=50)

In [None]:
# Histogram of listing prices (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['price'], kde=True, bins=50, ax=ax)
ax.set_title('Distribution of Rental Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box plot of price for each room type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='room_type', y='price', ax=ax)
ax.set_title('Price by Room Type')
ax.set_xlabel('Room Type')
ax.set_ylabel('Price')
plt.show()

In [None]:
# Box plot of price for each property type; the x tick labels are rotated
# to vertical.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='property_type', y='price', ax=ax)
ax.set_title('Price by Property Type')
ax.set_xlabel('Property Type')
ax.set_ylabel('Price')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Number of listings per neighbourhood, most frequent first (value_counts order).
neighborhood_density = (
    df['neighbourhood_cleansed']
    .value_counts()
    .reset_index()
)
neighborhood_density.columns = ['neighbourhood_cleansed', 'listings_count']

# Horizontal bars for the 20 neighbourhoods with the most listings.
densest_neighborhoods = neighborhood_density.head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=densest_neighborhoods, x='listings_count', y='neighbourhood_cleansed', ax=ax)
ax.set_title('Top 20 Neighborhoods by Listing Density')
ax.set_xlabel('Number of Listings')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Mean listing price per neighbourhood.
price_by_neighborhood = df.groupby('neighbourhood_cleansed')['price'].mean()
neighborhood_price = price_by_neighborhood.reset_index()

# Horizontal bars for the 20 neighbourhoods with the highest mean price.
priciest_neighborhoods = neighborhood_price.sort_values(by='price', ascending=False).head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=priciest_neighborhoods, x='price', y='neighbourhood_cleansed', ax=ax)
ax.set_title('Top 20 Neighborhoods by Average Price')
ax.set_xlabel('Average Price')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Occupancy proxy: the share of the 365-day window that is not available.
# NOTE(review): this counts every unavailable day as occupied — confirm that
# matches how 'availability_365' is populated.
available_share = df['availability_365'] / 365
df['occupancy_rate'] = 1 - available_share

# Mean of each availability column, reshaped into a two-column frame for plotting.
availability_columns = ['availability_30', 'availability_60', 'availability_90', 'availability_365']
availability_means = df[availability_columns].mean()
availability_data = availability_means.reset_index()
availability_data.columns = ['Period', 'Average Availability']

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=availability_data, x='Period', y='Average Availability', ax=ax)
ax.set_title('Average Availability Over Different Periods')
ax.set_xlabel('Period')
ax.set_ylabel('Average Availability')
plt.show()

In [None]:
# Mean occupancy rate per neighbourhood.
occupancy_by_neighborhood = df.groupby('neighbourhood_cleansed')['occupancy_rate'].mean()
neighborhood_occupancy = occupancy_by_neighborhood.reset_index()

# Horizontal bars for the 20 neighbourhoods with the highest mean occupancy rate.
busiest_neighborhoods = neighborhood_occupancy.sort_values(by='occupancy_rate', ascending=False).head(20)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=busiest_neighborhoods, x='occupancy_rate', y='neighbourhood_cleansed', ax=ax)
ax.set_title('Top 20 Neighborhoods by Occupancy Rate')
ax.set_xlabel('Average Occupancy Rate')
ax.set_ylabel('Neighborhood')
plt.show()

In [None]:
# Pairwise correlations between price and the listing attributes below.
correlation_columns = ['price', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating']
correlation_matrix = df[correlation_columns].corr()

# Annotated heatmap with the colour scale pinned to [-1, 1].
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Linear regression of price on a handful of listing attributes.
features = ['accommodates', 'bathrooms', 'bedrooms', 'beds', 'review_scores_rating']
X = df[features]
y = df['price']

# Split BEFORE imputing so the imputation means are learned from the training
# rows only; fitting the imputer on the full matrix would leak test-set
# statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

model = LinearRegression()
model.fit(X_train, y_train)

# Report fit quality on the held-out rows so the split is actually used.
print(f"Test R^2: {model.score(X_test, y_test):.3f}")

# The raw coefficients are in price units per unit of each feature, so the
# features are not on a common scale and the ranking below is only a rough
# guide to relative importance.
feature_importance = pd.DataFrame({'feature': features, 'importance': model.coef_})
feature_importance = feature_importance.sort_values(by='importance', ascending=False)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()