In [39]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report

In [2]:
# Read the csv file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/All_stats.csv")
OG_house_stats_df = pd.read_csv(file_path)

OG_house_stats_df.head()

Unnamed: 0,Counties,Year,Units Sold,Units Sold % Chg,Average Price,Price % Chg,Increase(1)/Decrease(0)
0,Allegany County,2015,541,0.055,93125,0.034,1
1,Anne Arundel County,2015,7909,0.13,361671,0.022,1
2,Baltimore City,2015,7998,0.067,158067,0.091,1
3,Baltimore County,2015,9890,0.085,249652,0.053,1
4,Calvert County,2015,1451,0.15,311314,0.008,1


In [3]:
house_stats_df = OG_house_stats_df.copy()

house_stats_df = house_stats_df.drop("Counties", axis=1)
house_stats_df.head()

Unnamed: 0,Year,Units Sold,Units Sold % Chg,Average Price,Price % Chg,Increase(1)/Decrease(0)
0,2015,541,0.055,93125,0.034,1
1,2015,7909,0.13,361671,0.022,1
2,2015,7998,0.067,158067,0.091,1
3,2015,9890,0.085,249652,0.053,1
4,2015,1451,0.15,311314,0.008,1


In [4]:
# Split the data into X (features) and y (target)

# The y variable should focus on the target column
y = house_stats_df['Increase(1)/Decrease(0)']

# The X variable should include all features except the target
X = house_stats_df.drop(columns=["Price % Chg",'Increase(1)/Decrease(0)'])

In [5]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [6]:
print(y_train)

124    1
94     1
171    1
76     1
71     1
      ..
102    1
45     1
182    0
128    1
111    1
Name: Increase(1)/Decrease(0), Length: 144, dtype: int64


In [7]:
# Declare a logistic regression model.
# Apply a random_state of 1 to the model
logistic_regression_model = LogisticRegression(random_state=1)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [8]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [12]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)


[[  0   8]
 [  0 136]]


In [13]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

#True -, False +
#False -, True +

[[ 0  1]
 [ 0 47]]


In [16]:
# Create and save the training classification report
target_names = ["Decrease", "Increase"]
training_report = classification_report(y_train, training_predictions, target_names=target_names)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

    Decrease       0.00      0.00      0.00         8
    Increase       0.94      1.00      0.97       136

    accuracy                           0.94       144
   macro avg       0.47      0.50      0.49       144
weighted avg       0.89      0.94      0.92       144



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions, target_names=target_names)

# Print the testing classification report
print(testing_report)


              precision    recall  f1-score   support

    Decrease       0.00      0.00      0.00         1
    Increase       0.98      1.00      0.99        47

    accuracy                           0.98        48
   macro avg       0.49      0.50      0.49        48
weighted avg       0.96      0.98      0.97        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## SVM Testing

In [19]:
# Create a support vector machine linear classifer, and fit it to the training data
from sklearn.svm import SVC 
SVMmodel = SVC(kernel='linear')
SVMmodel.fit(X_train, y_train)

In [20]:
# Print the model score by using the test data
print('Test Acc: %.3f' % SVMmodel.score(X_test, y_test))

Test Acc: 0.979


In [21]:
predictions = model.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

              precision    recall  f1-score   support

    Decrease       0.00      0.00      0.00         1
    Increase       0.98      1.00      0.99        47

    accuracy                           0.98        48
   macro avg       0.49      0.50      0.49        48
weighted avg       0.96      0.98      0.97        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest

In [25]:
y= y.ravel()
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [27]:
# Splitting into Train and Test sets for RF
rf_X_train, rf_X_test, rf_y_train, rf_y_test = train_test_split(X, y, random_state=78)

In [29]:
from sklearn.preprocessing import StandardScaler
# Creating StandardScaler instance
scaler = StandardScaler()

In [32]:
# Fitting Standard Scaller
X_scaler = scaler.fit(rf_X_train)

In [33]:
# Scaling data
X_train_scaled = X_scaler.transform(rf_X_train)
X_test_scaled = X_scaler.transform(rf_X_test)

In [35]:
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier. a range of 64-128 trees for N_estimators is suggested for initial modeling
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [36]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, rf_y_train)

In [37]:
# Making predictions using the testing data
rf_predictions = rf_model.predict(X_test_scaled)

In [40]:
# Calculating the confusion matrix
cm = confusion_matrix(rf_y_test, rf_predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(rf_y_test, rf_predictions)

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,1
Actual 1,0,47


Accuracy Score : 0.9791666666666666


In [41]:
print("Classification Report")
print(classification_report(rf_y_test, rf_predictions, target_names=target_names))

Classification Report
              precision    recall  f1-score   support

    Decrease       0.00      0.00      0.00         1
    Increase       0.98      1.00      0.99        47

    accuracy                           0.98        48
   macro avg       0.49      0.50      0.49        48
weighted avg       0.96      0.98      0.97        48



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
