In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score


In [3]:
from sklearn.preprocessing import OneHotEncoder # for converting Cat colmns to machine understanable binary 
from sklearn.compose import ColumnTransformer # trasformer a function to pass both  numerical and catgorical for ml
from sklearn.pipeline import Pipeline # to pass stage by stage transformation 

In [13]:
df = pd.read_csv('housingDataGen.csv')

In [5]:
df.head(5)

Unnamed: 0,Square Feet,Number of Bedrooms,Number of Bathrooms,Year Built,Neighborhood,Garage Spaces,Lot Size,City,Price
0,2126,2,2.507115,2004,Urban,1,0.166285,Kansas City,213834500.0
1,2459,2,1.983652,2002,Suburban,0,0.105804,New York,246782800.0
2,1860,2,3.211397,1999,Rural,1,0.235589,Houston,187658800.0
3,2294,1,1.110139,2021,Urban,1,0.480616,Kansas City,231966300.0
4,2130,4,2.424307,1994,Rural,1,0.82211,Kansas City,217657400.0


In [14]:
# if the dataype you have to change , then . 
df['Number of Bathrooms'] = df['Number of Bathrooms'].astype(int)
df['Number of Bathrooms'] = df['Number of Bathrooms'].astype(str)
df['Number of Bedrooms'] = df['Number of Bedrooms'].astype(str)

In [15]:
# Separate features and target variable
X = df.drop('Price', axis=1)
y = df['Price']

In [7]:
X.head(5)

Unnamed: 0,Square Feet,Number of Bedrooms,Number of Bathrooms,Year Built,Neighborhood,Garage Spaces,Lot Size,City
0,2126,2,2.507115,2004,Urban,1,0.166285,Kansas City
1,2459,2,1.983652,2002,Suburban,0,0.105804,New York
2,1860,2,3.211397,1999,Rural,1,0.235589,Houston
3,2294,1,1.110139,2021,Urban,1,0.480616,Kansas City
4,2130,4,2.424307,1994,Rural,1,0.82211,Kansas City


In [8]:
y.head(5)

0    2.138345e+08
1    2.467828e+08
2    1.876588e+08
3    2.319663e+08
4    2.176574e+08
Name: Price, dtype: float64

In [9]:
X.count()

Square Feet            4999
Number of Bedrooms     4999
Number of Bathrooms    4999
Year Built             4999
Neighborhood           4999
Garage Spaces          4999
Lot Size               4999
City                   4999
dtype: int64

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [17]:
X_train.count()

Square Feet            3999
Number of Bedrooms     3999
Number of Bathrooms    3999
Year Built             3999
Neighborhood           3999
Garage Spaces          3999
Lot Size               3999
City                   3999
dtype: int64

In [12]:
X_test.count()

Square Feet            1000
Number of Bedrooms     1000
Number of Bathrooms    1000
Year Built             1000
Neighborhood           1000
Garage Spaces          1000
Lot Size               1000
City                   1000
dtype: int64

In [18]:
df.describe()

Unnamed: 0,Square Feet,Year Built,Garage Spaces,Lot Size,Price
count,4999.0,4999.0,4999.0,4999.0,4999.0
mean,2010.170834,1985.879176,1.003601,0.555496,204310900.0
std,574.316658,20.981952,0.822592,0.258989,57427620.0
min,1000.0,1950.0,0.0,0.100043,101638000.0
25%,1519.0,1968.0,0.0,0.331765,155499500.0
50%,2009.0,1986.0,1.0,0.559281,203899600.0
75%,2508.0,2004.0,2.0,0.775953,253892900.0
max,2999.0,2022.0,2.0,0.999691,305151500.0


In [20]:
X_train.head(5)

Unnamed: 0,Square Feet,Number of Bedrooms,Number of Bathrooms,Year Built,Neighborhood,Garage Spaces,Lot Size,City
1738,2441,1,1,2020,Suburban,1,0.127127,Kansas City
4943,1578,1,3,2004,Urban,0,0.163502,New York
2916,1841,2,3,1975,Suburban,1,0.966276,Houston
1595,2854,4,1,1952,Suburban,2,0.276417,Kansas City
3214,2012,4,2,2007,Suburban,1,0.153921,New York


In [40]:
# Define categorical and numerical features
categorical_features = ['City', 'Neighborhood','Number of Bedrooms', 'Number of Bathrooms']
numerical_features = ['Square Feet', 'Year Built', 'Garage Spaces', 'Lot Size']

In [41]:
# transfor allow the numerical to pass and catgorical value to go through one hot encoding 
#, that ml can read the categorical vaule more good
# Create transformers for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

In [42]:
# Create a pipeline with preprocessing and linear regression
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

In [43]:
# Fit the model to the training data
model.fit(X_train, y_train)


In [44]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluating the polynomial regression model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error (Polynomial):', mse)
print('R-squared (Polynomial):', r2)

Mean Squared Error (Polynomial): 461556211.7045994
R-squared (Polynomial): 0.9999998556368809


In [45]:
df.columns

Index(['Square Feet', 'Number of Bedrooms', 'Number of Bathrooms',
       'Year Built', 'Neighborhood', 'Garage Spaces', 'Lot Size', 'City',
       'Price'],
      dtype='object')

In [46]:
# Define new data for prediction
new_data = {
    'Square Feet': [2500],
    'Year Built': [2010],
    'Garage Spaces': [2],
    'Lot Size': [0.3],
    'City': ['New York'],
    'Neighborhood': ['Suburban'],
    'Number of Bedrooms': [3],
    'Number of Bathrooms': [2]
}

# Create a DataFrame for the new data
new_df = pd.DataFrame(new_data)

# Use the fitted ColumnTransformer to transform the new data
new_data_transformed = preprocessor.transform(new_df)


TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''