In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.datasets import fetch_california_housing

# Load the California Housing dataset through scikit-learn
california_housing = fetch_california_housing()

# Assemble one frame: the feature columns plus the regression target.
# MEDV is the median house value of each district (what the model predicts).
data = pd.DataFrame(
    data=california_housing.data,
    columns=california_housing.feature_names,
).assign(MEDV=california_housing.target)

# Preview the first 10 rows as this cell's output
data.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MEDV
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
5,4.0368,52.0,4.761658,1.103627,413.0,2.139896,37.85,-122.25,2.697
6,3.6591,52.0,4.931907,0.951362,1094.0,2.128405,37.84,-122.25,2.992
7,3.12,52.0,4.797527,1.061824,1157.0,1.788253,37.84,-122.25,2.414
8,2.0804,42.0,4.294118,1.117647,1206.0,2.026891,37.84,-122.26,2.267
9,3.6912,52.0,4.970588,0.990196,1551.0,2.172269,37.84,-122.25,2.611


In [2]:
# Pool of synthetic listing descriptions. They are assigned to rows at random,
# so they carry no real information about a district; their only purpose is to
# give the pipeline a text column to process.
additional_descriptions = [
    "Spacious family home with a beautiful view",
    "Elegant townhouse in a historic district",
    "Luxurious penthouse with top-notch amenities",
    "Quaint cottage surrounded by nature",
    "Contemporary condo with cutting-edge design",
    "Sunny apartment in a lively urban neighborhood",
    "Rustic farmhouse with a charming atmosphere",
    "Stylish loft with industrial chic decor",
    # ... add more descriptions ...
]

# Draw one description per row from a *seeded* generator. Without a seed the
# text column differs on every run, which makes the fitted coefficients and the
# reported test MSE change from one execution to the next.
rng = np.random.default_rng(42)
data['Description'] = rng.choice(additional_descriptions, size=len(data))

In [3]:
# Hold out 20% of the rows for testing. MEDV is the target; every other column
# is a feature. The fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='MEDV'),
    data['MEDV'],
    test_size=0.2,
    random_state=42,
)

In [5]:
# All feature columns other than the free-text one go through the scaler
numeric_columns = X_train.columns.difference(['Description'])

# Column-wise preprocessing: standardize the numeric features and turn the
# description into bag-of-words token counts
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('text', CountVectorizer(), 'Description'),
    ]
)

# Chain preprocessing and the regressor so they are fitted, applied and saved
# as a single estimator
model = make_pipeline(preprocessor, LinearRegression())

# Fit the whole pipeline on the training split
model.fit(X_train, y_train)

In [6]:
# Score the fitted pipeline on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error on Test Data: {mse}')

# Persist the fitted pipeline (preprocessing + regressor) so it can be
# reloaded later without retraining
joblib.dump(value=model, filename='housing_price_model.joblib')

Mean Squared Error on Test Data: 0.5559902753065507


['housing_price_model.joblib']

In [7]:
# Example: price one hypothetical district.
# The numeric values are illustrative; the keys must match the columns the
# pipeline was trained on.
new_house = pd.DataFrame([{
    'MedInc': 3.0,
    'HouseAge': 20.0,
    'AveRooms': 5.0,
    'AveBedrms': 2.0,
    'Population': 1000.0,
    'AveOccup': 3.0,
    'Latitude': 37.5,
    'Longitude': -122.5,
    'Description': "Charming cottage with a garden",  # Example text feature
}])

# Reload the persisted pipeline from disk
loaded_model = joblib.load('housing_price_model.joblib')

# The target is expressed in units of $100,000 (per the dataset documentation),
# so scale the raw prediction to dollars before printing
predicted_price = 100000 * loaded_model.predict(new_house)
print(f'Predicted Price for the New House: ${predicted_price[0]:.2f}')

Predicted Price for the New House: $283609.60


In [8]:
######Complete Code######

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.datasets import fetch_california_housing

# --- Configuration -----------------------------------------------------------
# One seed drives both the synthetic text column and the train/test split so
# the whole cell gives the same numbers on every run.
RANDOM_SEED = 42
# Where the fitted pipeline is written to and read back from.
MODEL_PATH = 'housing_price_model.joblib'

# --- Load data ---------------------------------------------------------------
# Load the California Housing dataset through scikit-learn
california_housing = fetch_california_housing()
data = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)

# MEDV is the median house value of each district (the prediction target)
data['MEDV'] = california_housing.target

# --- Add a synthetic text feature --------------------------------------------
# Pool of made-up listing descriptions. They are assigned at random, so they
# carry no real signal; they only give the pipeline a text column to process.
additional_descriptions = [
    "Spacious family home with a beautiful view",
    "Elegant townhouse in a historic district",
    "Luxurious penthouse with top-notch amenities",
    "Quaint cottage surrounded by nature",
    "Contemporary condo with cutting-edge design",
    "Sunny apartment in a lively urban neighborhood",
    "Rustic farmhouse with a charming atmosphere",
    "Stylish loft with industrial chic decor",
    # ... add more descriptions ...
]

# Seeded draw: without a seed this column (and therefore the fitted model and
# the reported MSE) would change on every execution.
rng = np.random.default_rng(RANDOM_SEED)
data['Description'] = rng.choice(additional_descriptions, size=len(data))

# --- Train / test split ------------------------------------------------------
# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('MEDV', axis=1), data['MEDV'], test_size=0.2, random_state=RANDOM_SEED
)

# --- Model -------------------------------------------------------------------
# Standardize every non-text column and bag-of-words encode the description
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X_train.columns.difference(['Description'])),
        ('text', CountVectorizer(), 'Description')
    ]
)

# Combine preprocessing with the regressor so both are fitted and saved together
model = make_pipeline(preprocessor, LinearRegression())

# Train the pipeline on the training split
model.fit(X_train, y_train)

# --- Evaluate and persist ----------------------------------------------------
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error on Test Data: {mse}')

# Save the fitted pipeline for future use
joblib.dump(model, MODEL_PATH)

# --- Example prediction ------------------------------------------------------
# One hypothetical district; keys must match the training columns
new_house = pd.DataFrame({
    'MedInc': [3.0],  # Example numerical features, use appropriate values from the dataset
    'HouseAge': [20.0],
    'AveRooms': [5.0],
    'AveBedrms': [2.0],
    'Population': [1000.0],
    'AveOccup': [3.0],
    'Latitude': [37.5],
    'Longitude': [-122.5],
    'Description': ["Charming cottage with a garden"]  # Example text feature
})

# Reload the pipeline from disk to show the save/load round trip works
loaded_model = joblib.load(MODEL_PATH)

# The target is expressed in units of $100,000 (per the dataset documentation),
# so scale the prediction to dollars
predicted_price = loaded_model.predict(new_house) * 100000
print(f'Predicted Price for the New House: ${predicted_price[0]:.2f}')

Mean Squared Error on Test Data: 0.556851346766172
Predicted Price for the New House: $284670.62
