In [90]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Read the cleaned listings dataset from the resources folder
df = pd.read_csv(filepath_or_buffer='./resources/cleaned_data.csv')

# Preview the top five records to sanity-check the load
df.head(n=5)

Unnamed: 0,beds,bath,propertysqft,type,state,city,suburb,postcode,latitude,longitude,price,category
0,2,2.0,1400.0,Condo,New York,New York,Manhattan,10022,40.761255,-73.974483,315000,Residential
1,5,10.0,4900.0,Condo,New York,New York,Manhattan,10019,40.766393,-73.980991,195000000,Residential
2,4,2.0,2015.0,House,New York,New York,Staten Island,10312,40.541805,-74.196109,260000,Residential
3,3,1.0,445.0,Condo,New York,New York,Manhattan,10022,40.761398,-73.974613,69000,Residential
4,5,2.0,4900.0,Townhouse,New York,New York,Manhattan,10065,40.767224,-73.969856,55000000,Residential


In [91]:
# Feature frame: drop the target ('price') together with the columns
# that are not used as predictors
X = df.drop(columns=['price', 'category', 'latitude', 'longitude', 'city', 'state', 'postcode'])
# Target series
y = df['price']

In [92]:
# Inspect the dtype of each remaining feature column
X.dtypes

beds              int64
bath            float64
propertysqft    float64
type             object
suburb           object
dtype: object

In [93]:
# Names of the columns to keep as model inputs
select_features = ["beds", "bath", "propertysqft", "type", "suburb"]

# Build X_sel from just the columns listed in "select_features"
# and preview its first rows
X_sel = df.loc[:, select_features]
X_sel.head()

Unnamed: 0,beds,bath,propertysqft,type,suburb
0,2,2.0,1400.0,Condo,Manhattan
1,5,10.0,4900.0,Condo,Manhattan
2,4,2.0,2015.0,House,Staten Island
3,3,1.0,445.0,Condo,Manhattan
4,5,2.0,4900.0,Townhouse,Manhattan


In [94]:
# Target variable y as a 2-D column array of shape (n_rows, 1)
y = np.reshape(df["price"].values, (-1, 1))

In [95]:
# Partition features and target into train/test subsets;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [96]:
# Display X_train (the training portion of the feature frame X)
X_train

Unnamed: 0,beds,bath,propertysqft,type,suburb
3233,1,1.0,2184.207862,Co-op,Manhattan
4136,1,2.0,1560.000000,Condo,Manhattan
2988,1,2.0,1130.000000,Co-op,Queens
839,2,2.0,824.000000,Condo,Queens
500,3,2.0,2184.207862,Land,Queens County
...,...,...,...,...,...
3444,2,2.0,2184.207862,Co-op,Manhattan
466,3,1.0,2184.207862,Co-op,Manhattan
3092,1,1.0,700.000000,Co-op,Whitestone
3772,5,3.0,2146.000000,Multi-family,Brooklyn


In [97]:
# One-hot encode the categorical columns ('type', 'suburb') of the training data.
# handle_unknown='ignore' lets categories that were not seen during fit be
# transformed without raising; sparse_output=False returns a dense array.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, dtype='int')

# The encoder returns a plain array, so the resulting frame gets a fresh
# 0..n-1 index rather than X_train's original row labels.
X_train_encoded = pd.DataFrame(
    data=ohe.fit_transform(X_train[['type', 'suburb']]),
    columns=ohe.get_feature_names_out(),
)
X_train_encoded

Unnamed: 0,type_Co-op,type_Condo,type_Foreclosure,type_House,type_Land,type_Multi-family,type_Other,type_Townhouse,suburb_Arverne,suburb_Astoria,...,suburb_Throgs Neck,suburb_Tribeca,suburb_Two Bridges,suburb_Upper East Side,suburb_Upper West Side,suburb_Wakefield,suburb_Washington Heights,suburb_Whitestone,suburb_Williamsburg,suburb_Woodside
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3146,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3147,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3148,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
# Apply the already-fitted encoder to the test split (transform only, no refit)
X_test_encoded = pd.DataFrame(
    ohe.transform(X_test[['type', 'suburb']]),
    columns=ohe.get_feature_names_out(),
)
X_test_encoded

Unnamed: 0,type_Co-op,type_Condo,type_Foreclosure,type_House,type_Land,type_Multi-family,type_Other,type_Townhouse,suburb_Arverne,suburb_Astoria,...,suburb_Throgs Neck,suburb_Tribeca,suburb_Two Bridges,suburb_Upper East Side,suburb_Upper West Side,suburb_Wakefield,suburb_Washington Heights,suburb_Whitestone,suburb_Williamsburg,suburb_Woodside
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1045,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1046,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1047,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1048,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [99]:
from sklearn.preprocessing import StandardScaler

# Create the StandardScaler instance that will be used to scale the data

scaler = StandardScaler()

In [102]:
# Fit the scaler on the training rows only, then apply it to both splits
scaler.fit(X_train[['beds','bath','propertysqft']])
X_train_scaled = scaler.transform(X_train[['beds','bath','propertysqft']])
X_test_scaled = scaler.transform(X_test[['beds','bath','propertysqft']])

In [105]:
# Wrap the scaled arrays back into labelled DataFrames
Scaled_df = pd.DataFrame(data=X_train_scaled, columns=['beds', 'bath', 'propertysqft'])
Scaled_df_test = pd.DataFrame(data=X_test_scaled, columns=['beds', 'bath', 'propertysqft'])

In [108]:
# Assemble the final feature matrices: scaled numeric columns followed by the
# one-hot encoded categorical columns. Both frames in each pair were built
# from plain arrays, so they share a default 0..n-1 index and line up row by row.
X_train_final = pd.concat([Scaled_df, X_train_encoded], axis=1)
# The test matrix must be built from the TEST frames, not the training ones.
X_test_final = pd.concat([Scaled_df_test, X_test_encoded], axis=1)

In [107]:
# Preview the combined (scaled numeric + one-hot encoded) training features
X_train_final.head()

Unnamed: 0,beds,bath,propertysqft,type_Co-op,type_Condo,type_Foreclosure,type_House,type_Land,type_Multi-family,type_Other,...,suburb_Throgs Neck,suburb_Tribeca,suburb_Two Bridges,suburb_Upper East Side,suburb_Upper West Side,suburb_Wakefield,suburb_Washington Heights,suburb_Whitestone,suburb_Williamsburg,suburb_Woodside
0,-1.53454,-0.715537,0.140647,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-1.53454,-0.206172,-0.465249,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-1.53454,-0.206172,-0.882635,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-0.787892,-0.206172,-1.179658,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.041243,-0.206172,0.140647,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [72]:
# Instantiate the linear regression estimator (it is not fitted in this cell)
model = LinearRegression()




In [73]:
# Fit the model on the scaled numeric training features.
# X_train_scaled holds only beds / bath / propertysqft; it is the counterpart of
# the X_test_scaled array that the prediction cell passes to model.predict,
# so the feature count matches at predict time.
model.fit(X_train_scaled, y_train)

In [None]:
# Predict with the model, then calculate the mean squared error
# and the R-squared value for the testing data

# Use the model to make predictions on the scaled test features
# (the model must already have been fitted on the matching training features)
predicted_lr = model.predict(X_test_scaled)

# Score the predictions with mse and r2
mselr = mean_squared_error(y_test, predicted_lr)
r2lr = r2_score(y_test, predicted_lr)

print("---------------------")
print("Select Features:")
print(f"mean squared error (MSE): {mselr}")
print(f"R-squared (R2): {r2lr}")

In [None]:
import statsmodels.api as sm

# Use the statsmodels package to create and fit a linear regression.
# X_train still contains the string columns 'type' and 'suburb', which OLS
# cannot use, so fit on the all-numeric X_train_final instead.
# No explicit constant is added: the one-hot columns of each categorical
# already sum to one on every training row, so they cover the intercept.
lr = sm.OLS(y_train, X_train_final).fit()

In [None]:
# Create a variable to hold the p-values of all columns sorted in ascending order.
# The p-values live on the statsmodels results object `lr`; the scikit-learn
# estimator `model` does not expose a `pvalues` attribute.
p_values = lr.pvalues.sort_values()
p_values