In [58]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the cleaned listings CSV (path is relative to the notebook's working directory).
csv_path = './resources/Cleaned_data_without_outliers.csv'
df = pd.read_csv(csv_path)

# Preview the leading rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,price,beds,bath,propertysqft,borough,property_category,type,sublocality_ext,postcode,state_ext,latitude,longitude,price_per_sqft
0,315000,2,2.0,1400.0,East 55th Street,Commercial,condo,Manhattan,10022,New York,40.761255,-73.974483,225.0
1,260000,4,2.0,2015.0,Staten Island,Residential,house,Staten Island,10312,New York,40.541805,-74.196109,129.032258
2,69000,3,1.0,445.0,New York,Commercial,condo,Manhattan,10022,New York,40.761398,-73.974613,155.05618
3,899500,2,2.0,2184.207862,Manhattan,Residential,condo,Manhattan,10027,New York,40.809448,-73.946777,411.819779
4,265000,1,1.0,750.0,Morrison Avenue,Residential,co-op,The Bronx,10473,New York,40.821586,-73.874089,353.333333


In [59]:
# Feature frame: everything except the target and the location/identifier columns.
dropped_columns = ['price', 'latitude', 'longitude', 'state_ext', 'postcode', 'sublocality_ext']
X = df.drop(columns=dropped_columns)

# Target: the listing price as a Series (rebuilt as a 2-D array in a later cell).
y = df['price']

In [60]:
# Inspect the dtype of every remaining feature column; object-dtype columns
# hold strings and need encoding before they can be passed to a numeric model.
X.dtypes

beds                   int64
bath                 float64
propertysqft         float64
borough               object
property_category     object
type                  object
price_per_sqft       float64
dtype: object

In [61]:
# Model inputs: four numeric measurements followed by three categorical descriptors.
select_features = [
    "beds",
    "bath",
    "propertysqft",
    "price_per_sqft",
    "borough",
    "property_category",
    "type",
]

# NOTE(review): in the preview rows price_per_sqft equals price / propertysqft
# (e.g. 315000 / 1400 = 225.0), so it encodes the target. Confirm whether it
# should really be used as a predictor.

# Restrict the full frame to exactly the selected columns, in the order listed above.
X_sel = df.loc[:, select_features]
X_sel.head()

Unnamed: 0,beds,bath,propertysqft,price_per_sqft,borough,property_category,type
0,2,2.0,1400.0,225.0,East 55th Street,Commercial,condo
1,4,2.0,2015.0,129.032258,Staten Island,Residential,house
2,3,1.0,445.0,155.05618,New York,Commercial,condo
3,2,2.0,2184.207862,411.819779,Manhattan,Residential,condo
4,1,1.0,750.0,353.333333,Morrison Avenue,Residential,co-op


In [62]:
# Target variable: the price column as a 2-D column vector of shape (n_rows, 1).
price_values = df["price"].to_numpy()
y = price_values.reshape(-1, 1)

In [63]:
# Partition features and target into train/test subsets.
# random_state is pinned so the same rows land in each subset on every run;
# no test_size is given, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X_sel,
    y,
    random_state=42,
)

In [64]:
# Display the training features. The row labels are the original df index
# values (shuffled by the split), not a fresh 0..n-1 range.
X_train

Unnamed: 0,beds,bath,propertysqft,price_per_sqft,borough,property_category,type
1073,4,3.0,2184.207862,343.373913,Queens,Residential,pending
2407,4,2.0,2184.207862,388.699269,Queens,Residential,house
927,2,1.0,2184.207862,423.494492,Manhattan,Residential,condo
497,3,2.0,2184.207862,377.711304,Queens,Residential,house
1857,4,1.0,2184.207862,297.590267,Queens,Residential,house
...,...,...,...,...,...,...,...
1638,3,2.0,1725.000000,460.869565,Brooklyn,Residential,multi-family home
1095,4,3.0,2184.207862,618.479598,Queens,Residential,multi-family home
1130,3,2.0,2184.207862,319.464558,Queens,Residential,multi-family home
1294,4,2.0,1224.000000,734.477124,Staten Island,Residential,house


In [65]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the three categorical columns of the training split.
# handle_unknown='ignore' means categories first seen at transform time are
# encoded as all zeros instead of raising; sparse_output=False returns a dense
# array; dtype='int' yields 0/1 integers.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')

train_categoricals = X_train[['borough', 'property_category', 'type']]
train_dummies = ohe.fit_transform(train_categoricals)

# The resulting frame gets a fresh 0..n-1 index (the X_train index is not carried over).
X_train_encoded = pd.DataFrame(data=train_dummies, columns=ohe.get_feature_names_out())
X_train_encoded

Unnamed: 0,borough_35th Avenue,borough_5th Avenue,borough_61st Street,borough_67th Drive,borough_98th Place,borough_Annadale,borough_Astoria,borough_Bath Beach,borough_Bay Ridge,borough_Bay Terrace,...,type_condop,type_contingent,type_for sale,type_foreclosure,type_house,type_land,type_mobile house,type_multi-family home,type_pending,type_townhouse
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2032,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2035,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [66]:
# Apply the encoder fitted on the training split to the test split.
# Only transform() is called, so the test rows do not influence the learned categories.
test_categoricals = X_test[['borough', 'property_category', 'type']]
test_dummies = ohe.transform(test_categoricals)

X_test_encoded = pd.DataFrame(data=test_dummies, columns=ohe.get_feature_names_out())
X_test_encoded

Unnamed: 0,borough_35th Avenue,borough_5th Avenue,borough_61st Street,borough_67th Drive,borough_98th Place,borough_Annadale,borough_Astoria,borough_Bath Beach,borough_Bay Ridge,borough_Bay Terrace,...,type_condop,type_contingent,type_for sale,type_foreclosure,type_house,type_land,type_mobile house,type_multi-family home,type_pending,type_townhouse
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
674,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
675,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
676,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
677,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [67]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler instance that is used to scale the numeric columns.
# Nothing is fitted here; fitting happens when fit_transform is called on the
# training data.

scaler = StandardScaler()

In [68]:
# Standardize the numeric columns. The scaler learns its statistics from the
# training split only and reuses them unchanged on the test split.
numeric_cols = ['beds', 'bath', 'propertysqft', 'price_per_sqft']

X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])

In [69]:
# Wrap the scaled arrays back into DataFrames so the columns keep their names.
# Both frames receive a default 0..n-1 index.
scaled_columns = ['beds', 'bath', 'propertysqft', 'price_per_sqft']

Scaled_df = pd.DataFrame(X_train_scaled, columns=scaled_columns)
Scaled_df_test = pd.DataFrame(X_test_scaled, columns=scaled_columns)

In [76]:
# Assemble the final design matrices: scaled numeric columns on the left,
# one-hot columns on the right. Rows are paired by index label, which works
# because both pieces carry the same default 0..n-1 index.
train_parts = [Scaled_df, X_train_encoded]
test_parts = [Scaled_df_test, X_test_encoded]

X_train_final = pd.concat(train_parts, axis="columns")
X_test_final = pd.concat(test_parts, axis="columns")

In [79]:
# Report (rows, columns) for the train matrix, then the test matrix;
# the column counts should match.
for design_matrix in (X_train_final, X_test_final):
    print(design_matrix.shape)

(2037, 142)
(679, 142)


In [None]:
# Create the linear regression estimator. This only instantiates it; the
# estimator must be fitted with model.fit(...) before model.predict(...) is used.
model = LinearRegression()




In [None]:
# Fit the model to the full training data.
# The fit call was previously commented out, which left `model` unfitted and
# made the later model.predict(...) call fail.
# X_train_scaled (the four standardized numeric columns) is used here because
# the prediction cell scores the model on X_test_scaled, so train and test
# inputs have matching columns.
# TODO(review): X_train_final / X_test_final (numeric + one-hot columns) are
# built above but not used by the visible model cells; if the categorical
# features are meant to be included, switch BOTH the fit and the predict call.
model.fit(X_train_scaled, y_train)

In [None]:
# Predict on the held-out test split and report the mean squared error and
# R-squared of those predictions.
# `model` must already be fitted on inputs with the same columns as
# X_test_scaled (the four standardized numeric features).

# Use the fitted model to make predictions
predicted_lr = model.predict(X_test_scaled)

# Score the predictions with mse and r2.
# These two lines were previously commented out, so the prints below referenced
# undefined names (`mse2`, `r2lr`) and raised NameError.
mselr = mean_squared_error(y_test, predicted_lr)
r2lr = r2_score(y_test, predicted_lr)

print("---------------------")
print("Select Features:")
print(f"mean squared error (MSE): {mselr}")
print(f"R-squared (R2): {r2lr}")

In [None]:
import statsmodels.api as sm

# Use the statsmodels package to create and fit a linear regression.
# The design matrix must be fully numeric: X_train still contains the raw
# string columns (borough, property_category, type), which OLS cannot convert
# to a numeric array. X_train_final holds the scaled numeric columns plus the
# one-hot encoded categoricals instead.
# No explicit constant is added: the encoder was created without `drop`, so
# each one-hot group sums to 1 on the training rows and already spans the
# intercept.
# NOTE(review): the full one-hot groups make the design matrix rank-deficient;
# statsmodels' default pseudo-inverse solver should still fit, but individual
# dummy coefficients / p-values are then not uniquely identified. Confirm
# whether one level per group should be dropped.
lr = sm.OLS(y_train, X_train_final).fit()

In [None]:
# Create a variable to hold the p-values of all columns sorted in ascending order.
# The p-values live on the fitted statsmodels results object (`lr`); `model` is
# the scikit-learn LinearRegression, which has no `pvalues` attribute.
p_values = lr.pvalues.sort_values()
p_values