In [1]:
import warnings
# Suppress every warning for the rest of the session (no category or module
# filter is given). NOTE(review): this also hides deprecation notices from
# pandas / scikit-learn; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the cleaned listings CSV into a DataFrame and show it.
file_path = Path('./Resources/Clean_MC_Master.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,MLSNumber,Address,SoldPrice,CurrentPrice,ListDate,SettledDate,#ofStories,City,Zip Code,Subdivision,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,1002388281,9701 Fields Rd #1806,"$127,000","$129,900",11/9/2015,1/4/2016,Main,Gaithersburg,20878,WASHINGTON TOWER CODM,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,1002388133,2211 Washington Ave #W-102,"$202,000","$207,000",11/9/2015,1/4/2016,Main,Silver Spring,20910,ROCK CREEK APTS CODM 2,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,1002384775,3117 University Blvd W #B4,"$139,900","$139,900",10/28/2015,1/4/2016,Main,Kensington,20895,MONTGOMERY CENTURY,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,1002382327,10201 Grosvenor Pl #210,"$195,000","$199,900",10/15/2015,1/4/2016,Main,Rockville,20852,GROSVENOR PARK,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,1002382267,10301 Rossmore Ct,"$840,000","$850,000",10/22/2015,1/4/2016,"Lower1,Lower2,Main,Upper1",Bethesda,20814,WILDWOOD KNOLLS,No,1963,3060.0,4.0,4.0,Yes,Detached
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61797,MDMC2005770,1 Paca Pl,"$625,000","$625,000",7/22/2021,9/16/2021,"Main,Upper1",Rockville,20852,HUNGERFORD,No,1955,2237.0,4.0,3.0,Yes,Detached
61798,MDMC753990,1108 Clagett Dr,"$499,500","$509,000",7/15/2021,9/16/2021,Main,Rockville,20851,ROCKCREST,No,1951,1457.0,3.0,3.0,Yes,Detached
61799,MDMC2003756,11307 Galt Ave,"$410,000","$445,000",7/26/2021,9/16/2021,"Lower1,Main,Upper1",Silver Spring,20902,WHEATON HILLS,No,1950,1872.0,4.0,2.0,No,Detached
61800,MDMC763464,8809 Thomas Lea Ter,"$400,000","$374,900",6/24/2021,9/16/2021,"Lower1,Main,Upper1",Montgomery Village,20886,THE REACH,No,1986,2160.0,4.0,4.0,No,Interior Row/Townhouse


In [4]:
# Drop the listing id, address, date, subdivision, city, list-price and
# story-layout columns; everything that remains is used for modelling.
housing_df = df.drop(
    columns=[
        'MLSNumber',
        'Address',
        'ListDate',
        'SettledDate',
        'Subdivision',
        'City',
        'CurrentPrice',
        '#ofStories',
    ]
)
housing_df.head()

Unnamed: 0,SoldPrice,Zip Code,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,"$127,000",20878,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,"$202,000",20910,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,"$139,900",20895,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,"$195,000",20852,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,"$840,000",20814,No,1963,3060.0,4.0,4.0,Yes,Detached


In [5]:
# Convert SoldPrice from text such as "$127,000" to integers.
# - regex=False makes ',' and '$' literal characters on every pandas version
#   ('$' is the end-of-string anchor when the pattern is treated as a regex).
# - The dtype guard plus assign() keep this cell safe to re-run: once the
#   column is numeric it has no .str accessor, so the conversion is skipped.
sold_price = housing_df['SoldPrice']
if not pd.api.types.is_numeric_dtype(sold_price):
    sold_price = (
        sold_price.str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
        .astype(int)
    )
housing_df = housing_df.assign(SoldPrice=sold_price)
housing_df.head()

Unnamed: 0,SoldPrice,Zip Code,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,127000,20878,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,202000,20910,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,139900,20895,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,195000,20852,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,840000,20814,No,1963,3060.0,4.0,4.0,Yes,Detached


In [6]:
# One-hot encode the remaining text columns so every feature is numeric.
clean_housing_df = pd.get_dummies(data=housing_df)
clean_housing_df.head()

Unnamed: 0,SoldPrice,Zip Code,Age,InteriorSqFt,Bedrooms,Baths,New Construction YN_No,New Construction YN_Yes,Garage YN_No,Garage YN_Yes,Structure Type _Detached,Structure Type _End of Row/Townhouse,Structure Type _Garage/Parking Space,Structure Type _Interior Row/Townhouse,Structure Type _Other,Structure Type _Penthouse Unit/Flat/Apartment,Structure Type _Twin/Semi-Detached,Structure Type _Unit/Flat/Apartment
0,127000,20878,1966,446.0,0.0,1.0,1,0,1,0,0,0,0,0,0,0,0,1
1,202000,20910,1948,671.0,1.0,1.0,1,0,1,0,0,0,0,0,0,0,0,1
2,139900,20895,1973,754.0,1.0,1.0,1,0,1,0,0,0,0,0,0,0,0,1
3,195000,20852,1972,851.0,1.0,1.0,1,0,1,0,0,0,0,0,0,0,0,1
4,840000,20814,1963,3060.0,4.0,4.0,1,0,0,1,1,0,0,0,0,0,0,0


In [7]:
# Target: the sale price column.
y = clean_housing_df['SoldPrice']

# Features: every column except the target.
X = clean_housing_df.drop(columns='SoldPrice')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [9]:
# Standardise the features. The scaler is fitted on the training split only,
# and that same fitted transform is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [10]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

In [None]:
# Calculate feature importance in the Random Forest model.
# feature_importances_ is read from the fitted model; the next cell pairs each
# score with the matching entry of X.columns.
importances = rf_model.feature_importances_
importances

In [None]:
# Rank the features from most to least important as (score, column name) pairs.
sorted(
    (
        (score, feature)
        for feature, score in zip(X.columns, rf_model.feature_importances_)
    ),
    reverse=True,
)