In [40]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# import and transform CSV-file into pandas dataframe, inspect it 
StreetEasy= pd.read_csv('streeteasy.csv')
StreetEasy.head()

# remove unnecessary columns to use all relevant columns as features and store them in X
X= StreetEasy.drop(['rental_id','building_id','rent','neighborhood','submarket','borough','no_fee','has_doorman',],axis=1)

# The rent is  our label, meaning what we want to predict
y= StreetEasy['rent']

# split the dataframe into training and test set
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

# scale the data and transform the scaled arrays back into dataframes
scaler=MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_train_scaled = pd.DataFrame(x_train_scaled, columns=x_train.columns)
x_test_scaled = pd.DataFrame(x_test_scaled, columns=x_test.columns)

# build the model with Linear Regression
model=LinearRegression()
model.fit(x_train_scaled,y_train)
y_predict= model.predict(x_test_scaled)

# build the Model with a RandomForestRegressor
rf_model= RandomForestRegressor(n_estimators=100)
rf_model.fit(x_train_scaled,y_train)
y_predict_rf= rf_model.predict(x_test_scaled)


# test the model for the apartment with id 2472
apartment_features=[[2,2,2000,4,1,96,0,0,0,0,0,0]]
apartment_features_df= pd.DataFrame(apartment_features, columns=X.columns)
apartment_features_scaled = scaler.transform(apartment_features_df)
apartment_features_df_scaled = pd.DataFrame(apartment_features_scaled, columns=X.columns)

# make a prediction with the RandomForestRegressor
RentPrediction_rf = rf_model.predict(apartment_features_df_scaled)
print(f"Predicted Rent (Random Forest Regressor): {RentPrediction_rf[0]}")

# make a prediction with the Linear Regression
RentPrediction_lr= model.predict(apartment_features_df_scaled)
print(f"Predicted Rent (Linear Regression): {RentPrediction_rf[0]}")
# 9740.45 is actually pretty close to the 11500$ rent :)

Predicted Rent (Random Forest Regressor): 9760.99
Predicted Rent (Linear Regression): 9760.99




In [34]:
# compare the scores
print(f"Score (Linear Regression): {model.score(x_test_scaled, y_test)}")
print(f"Mean Squared Error (Linear Regression): {mean_squared_error(y_test, y_predict)}")

print(f"Score (Random Forest): {rf_model.score(x_test_scaled, y_test)}")
print(f"Mean Squared Error (Random Forest): {mean_squared_error(y_test, y_predict_rf)}")

Score (Linear Regression): 0.7364651394331072
Mean Squared Error (Linear Regression): 2383488.6533520906
Score (Random Forest): 0.8057601712013405
Mean Squared Error (Random Forest): 1756763.5149853087


In [38]:
StreetEasy.drop(['building_id','neighborhood','submarket','borough','no_fee','has_doorman',],axis=1).head()

Unnamed: 0,rental_id,rent,bedrooms,bathrooms,size_sqft,min_to_subway,floor,building_age_yrs,has_roofdeck,has_washer_dryer,has_elevator,has_dishwasher,has_patio,has_gym
0,1545,2550,0.0,1,480,9,2.0,17,1,0,1,1,0,1
1,2472,11500,2.0,2,2000,4,1.0,96,0,0,0,0,0,0
2,10234,3000,3.0,1,1000,4,1.0,106,0,0,0,0,0,0
3,2919,4500,1.0,1,916,2,51.0,29,1,0,1,1,0,0
4,2790,4795,1.0,1,975,3,8.0,31,0,0,1,1,0,1
