In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the dataset
df = pd.read_csv('AB_NYC_2019.csv')

# Data cleaning
# Keep listings priced under 1000. The explicit copy makes `df` an independent
# frame, so the column assignments below never write into a filtered slice of
# the raw data (which can trigger SettingWithCopyWarning in pandas versions
# without copy-on-write).
df = df[df['price'] < 1000].copy()

# Parse last_review BEFORE dropping rows: values that are missing and values
# that cannot be parsed both become NaT and are removed together, instead of
# surviving as NaT and later being turned into a meaningless month 0.
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
df = df.dropna(subset=['reviews_per_month', 'last_review']).copy()

# Derived features: month of the last review, and review count x review rate
df['month'] = df['last_review'].dt.month
df['review_score'] = df['number_of_reviews'] * df['reviews_per_month']

# Fill remaining gaps in numeric columns only; a blanket fillna(0) would also
# write 0 into text and datetime columns.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(0)

# One-hot encode the two categorical columns; drop_first=True omits the first
# level of each column from the generated dummies.
categorical_cols = ['neighbourhood_group', 'room_type']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Model inputs: the numeric base features, followed by every encoded column
# whose name contains one of the dummy markers.
base_features = ['minimum_nights', 'availability_365', 'reviews_per_month', 'review_score', 'month']
dummy_markers = ('neighbourhood_group_', 'room_type_')
dummy_features = [
    name for name in df_encoded.columns
    if any(marker in name for marker in dummy_markers)
]
features = base_features + dummy_features

# Design matrix and regression target
X = df_encoded[features]
y = df_encoded['price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the linear model on the training portion only
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Prepare example listings
# Three hypothetical listings, written column by column: position i of every
# list belongs to listing i. review_score is spelled out as
# reviews_per_month * an assumed number of reviews.
example_listings = pd.DataFrame({
    'minimum_nights': [3, 5, 2],
    'availability_365': [180, 360, 60],
    'reviews_per_month': [3.0, 5.5, 1.2],
    'review_score': [3.0 * 10, 5.5 * 25, 1.2 * 5],
    'month': [3, 6, 11],
    # One-hot flags. A listing whose flags are all 0 within a group belongs to
    # the level that has no column of its own here.
    'neighbourhood_group_Brooklyn': [1, 0, 0],
    'neighbourhood_group_Manhattan': [0, 1, 0],
    'neighbourhood_group_Queens': [0, 0, 0],
    'neighbourhood_group_Staten Island': [0, 0, 1],
    'room_type_Private room': [1, 0, 0],
    'room_type_Shared room': [0, 0, 1],
})

# Align the examples with the training matrix in a single step: reindex adds
# any feature column the examples lack (filled with 0), leaves out columns
# that are not in X, and orders the columns exactly like X.
example_listings = example_listings.reindex(columns=X.columns, fill_value=0)

# Predict prices
predicted_prices = model.predict(example_listings)
# Bare expression so the notebook displays the resulting array
predicted_prices



array([ 76.8662456 , 240.39658087,   4.92658252])