In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the housing records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.lr ml 1.csv')

In [3]:
# Exploratory Data Analysis (EDA)
# Preview the top five rows to see which columns are available
print(df.head(n=5))

                  date      price  bedrooms  bathrooms  sqft_living  sqft_lot  \
0  2014-05-02 00:00:00   313000.0       3.0       1.50         1340      7912   
1  2014-05-02 00:00:00  2384000.0       5.0       2.50         3650      9050   
2  2014-05-02 00:00:00   342000.0       3.0       2.00         1930     11947   
3  2014-05-02 00:00:00   420000.0       3.0       2.25         2000      8030   
4  2014-05-02 00:00:00   550000.0       4.0       2.50         1940     10500   

   floors  waterfront  view  condition  sqft_above  sqft_basement  yr_built  \
0     1.5           0     0          3        1340              0      1955   
1     2.0           0     4          5        3370            280      1921   
2     1.0           0     0          4        1930              0      1966   
3     1.0           0     0          4        1000           1000      1963   
4     1.0           0     0          4        1140            800      1976   

   yr_renovated                    str

In [4]:
# Feature matrix: the eight numeric house attributes used as predictors
X = df[
    [
        'bedrooms',
        'bathrooms',
        'sqft_living',
        'sqft_lot',
        'floors',
        'waterfront',
        'view',
        'condition',
    ]
]

# Regression target: the sale price column
y = df['price']

In [5]:
# Count the missing entries in every column
print(df.isna().sum())

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64


In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Explicitly label the columns of both splits with the feature names
# (the same eight columns that were selected when building X).
feature_names = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'sqft_lot',
    'floors',
    'waterfront',
    'view',
    'condition',
]
X_train.columns = X_test.columns = feature_names

In [10]:
# Create an ordinary least-squares regressor (intercept fitted, which is the default)
model = LinearRegression(fit_intercept=True)

In [11]:
# Train the regressor on the training split
model.fit(X=X_train, y=y_train)

In [12]:
# Use the trained model to predict prices for the held-out test split
y_pred = model.predict(X=X_test)

In [13]:
# Score the test-set predictions with Mean Squared Error (MSE) and R-squared
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [14]:
# Report both evaluation metrics, one per line.
# NOTE(review): the recorded run shows R-squared of about 0.03, so this model
# explains very little of the price variance - worth investigating.
for label, value in (("Mean Squared Error:", mse), ("R-squared:", r2)):
    print(label, value)

Mean Squared Error: 986869414953.98
R-squared: 0.03233518995632512


In [None]:


# Read one house's features as a comma-separated line. The values must be given
# in the same order as the training columns (X_train.columns).
input_str = input("Enter house features (comma-separated): ")

# Parse as floats rather than ints: columns such as bathrooms and floors hold
# fractional values (e.g. 1.5, 2.25), which int() would reject.
try:
    input_list = [float(val.strip()) for val in input_str.split(',')]
except ValueError as exc:
    raise ValueError(
        f"House features must be numeric, got {input_str!r}."
    ) from exc

# Check for the correct number of features before building the model input
if len(input_list) != X_train.shape[1]:
    raise ValueError(f"Expected {X_train.shape[1]} features, got {len(input_list)}.")

# Build a one-row DataFrame carrying the training column names. The model was
# fitted on a named DataFrame, so passing a bare array would trigger
# scikit-learn's "X does not have valid feature names" warning. Using pandas
# here also avoids depending on numpy, which this notebook never imports.
input_data = pd.DataFrame([input_list], columns=X_train.columns)

# Predict and print the result
predicted_price = model.predict(input_data)
print(f"Predicted Price: ${predicted_price[0]:,.2f}")
