In [6]:
# import dependencies
import pandas as pd

# read in the data
df = pd.read_csv('../resources/2015_2023_house_school.csv')

df.head() 

Unnamed: 0,SCHOOL_NAME,ZIP,price,bed,bath,acres,sqft,City,State,Sold Year
0,PS 48 PO MICHAEL J BUCZEK,10033,669000.0,2.0,1.0,0.27,870.0,New York City,NY,2015
1,PS 30 WILTON,10454,799000.0,6.0,3.0,0.05,2280.0,Bronx,NY,2015
2,Hackettstown High School,7840,599900.0,4.0,4.0,0.18,2450.0,Allamuchy Township,NJ,2015
3,EDWARD A REYNOLDS WEST SIDE HIGH SCHOOL,10025,5995000.0,5.0,3.0,0.03,3264.0,New York City,NY,2015
4,Marlboro High School,7746,725000.0,4.0,4.0,0.34,2404.0,Marlboro,NJ,2015


In [7]:
# convert ZIP column to string
df['ZIP'] = df['ZIP'].astype(str)

# add a 0 to the front of the number in the ZIP column if the number is 4 digits long
df['ZIP'] = df['ZIP'].apply(lambda x: '0' + x if len(x) == 4 else x)

# # drop the State and City columns
# df.drop(columns=['State', 'City'], inplace=True)

df.head()

Unnamed: 0,SCHOOL_NAME,ZIP,price,bed,bath,acres,sqft,City,State,Sold Year
0,PS 48 PO MICHAEL J BUCZEK,10033,669000.0,2.0,1.0,0.27,870.0,New York City,NY,2015
1,PS 30 WILTON,10454,799000.0,6.0,3.0,0.05,2280.0,Bronx,NY,2015
2,Hackettstown High School,7840,599900.0,4.0,4.0,0.18,2450.0,Allamuchy Township,NJ,2015
3,EDWARD A REYNOLDS WEST SIDE HIGH SCHOOL,10025,5995000.0,5.0,3.0,0.03,3264.0,New York City,NY,2015
4,Marlboro High School,7746,725000.0,4.0,4.0,0.34,2404.0,Marlboro,NJ,2015


In [8]:
# move the STATE column to the 3rd column position
df = df[['SCHOOL_NAME', 'City', 'State', 'ZIP', 'price', 'bed', 'bath', 'acres', 'sqft', 'Sold Year']]

# rename Sold Year to sold_year
df.rename(columns={'Sold Year': 'sold_year'}, inplace=True)


# lower case all column headers
df.columns = map(str.lower, df.columns)
df.head()

Unnamed: 0,school_name,city,state,zip,price,bed,bath,acres,sqft,sold_year
0,PS 48 PO MICHAEL J BUCZEK,New York City,NY,10033,669000.0,2.0,1.0,0.27,870.0,2015
1,PS 30 WILTON,Bronx,NY,10454,799000.0,6.0,3.0,0.05,2280.0,2015
2,Hackettstown High School,Allamuchy Township,NJ,7840,599900.0,4.0,4.0,0.18,2450.0,2015
3,EDWARD A REYNOLDS WEST SIDE HIGH SCHOOL,New York City,NY,10025,5995000.0,5.0,3.0,0.03,3264.0,2015
4,Marlboro High School,Marlboro,NJ,7746,725000.0,4.0,4.0,0.34,2404.0,2015


In [9]:
df.dtypes

school_name     object
city            object
state           object
zip             object
price          float64
bed            float64
bath           float64
acres          float64
sqft           float64
sold_year        int64
dtype: object

In [10]:
# convert sold_year to float
df['sold_year'] = df['sold_year'].astype(float)

# save dataframe to csv
df.to_csv('../resources/trimmed_merged.csv', index=False)

In [11]:
from sklearn.preprocessing import LabelEncoder

# Initialize a LabelEncoder
le = LabelEncoder()

# Specify the categorical columns to be encoded
cat_cols = ['school_name', 'city', 'state', 'zip']

# Apply LabelEncoder to categorical columns
for col in cat_cols:
    df[col] = le.fit_transform(df[col])


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Define features X and target y
X = df.drop('price', axis=1)
y = df['price']

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the standard scaler
scaler = StandardScaler()

# Scale the features
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the linear regression model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 364761369656.4257
R-squared: 0.30883171717826485


In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Initialize the random forest regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

# Perform cross-validation
scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-validation scores: {scores}")
print(f"Average cross-validation score: {scores.mean()}")

Mean Squared Error: 43710479171.686714
R-squared: 0.9171751743917773
Cross-validation scores: [0.27303681 0.3974767  0.59620303 0.33350522 0.49327005]
Average cross-validation score: 0.41869836355390594
