In [35]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `housing_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [36]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/housing_data.csv")
df_housing = pd.read_csv(file_path)
# Review the DataFrame
df_housing.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7229300521,20141013T000000,231300.0,2,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [37]:
df_housing.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [38]:
# Drop the non-beneficial columns.
drop_columns = ['id', 'date', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15', 'waterfront']
housing_clean_df = df_housing.drop(columns=drop_columns)
housing_clean_df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,view,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,231300.0,2,1.0,1180,5650,1.0,0,3,7,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,0,3,7,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,0,3,6,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,0,5,7,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,0,3,8,1987,0,98074,47.6168,-122.045


In [39]:
# Look at view value counts 
view_counts = housing_clean_df['view'].value_counts()
view_counts

view
0    19489
2      963
3      510
1      332
4      319
Name: count, dtype: int64

In [40]:
# Look at condition value counts 
condition_counts = housing_clean_df['condition'].value_counts()
condition_counts

condition
3    14031
4     5679
5     1701
2      172
1       30
Name: count, dtype: int64

In [41]:
# Look at price value counts 
price_counts = housing_clean_df['price'].value_counts()
price_counts

price
350000.0    172
450000.0    172
550000.0    159
500000.0    152
425000.0    150
           ... 
954500.0      1
355200.0      1
526750.0      1
278100.0      1
402101.0      1
Name: count, Length: 4029, dtype: int64

### Step 2: Create the labels set (`y`)  from the “price” column, and then create the features (`X`) DataFrame from the remaining columns.

In [42]:
# Separate the data into labels and features

# Separate the y variable, the labels
y = housing_clean_df["view"]

# Separate the X variable, the features
X = housing_clean_df.drop("view", axis=1)
feature_names = X.columns
X.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built,yr_renovated,zipcode,lat,long
0,231300.0,2,1.0,1180,5650,1.0,3,7,1955,0,98178,47.5112,-122.257
1,538000.0,3,2.25,2570,7242,2.0,3,7,1951,1991,98125,47.721,-122.319
2,180000.0,2,1.0,770,10000,1.0,3,6,1933,0,98028,47.7379,-122.233
3,604000.0,4,3.0,1960,5000,1.0,5,7,1965,0,98136,47.5208,-122.393
4,510000.0,3,2.0,1680,8080,1.0,3,8,1987,0,98074,47.6168,-122.045


In [43]:
# Review the y variable Series
print(y[:-5])
print(y[:5])

0        0
1        0
2        0
3        0
4        0
        ..
21603    0
21604    0
21605    0
21606    0
21607    0
Name: view, Length: 21608, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: view, dtype: int64


In [44]:
# Review the y variable Series
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 21613 entries, 0 to 21612
Series name: view
Non-Null Count  Dtype
--------------  -----
21613 non-null  int64
dtypes: int64(1)
memory usage: 169.0 KB


In [45]:
# Review the X variable DataFrame
X.describe()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,condition,grade,yr_built,yr_renovated,zipcode,lat,long
count,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,540088.6,3.370795,2.114757,2079.899736,15106.97,1.494309,3.40943,7.656873,1971.005136,84.402258,98077.939805,47.560053,-122.213896
std,367126.8,0.930105,0.770163,918.440897,41420.51,0.539989,0.650743,1.175459,29.373411,401.67924,53.505026,0.138564,0.140828
min,75000.0,0.0,0.0,290.0,520.0,1.0,1.0,1.0,1900.0,0.0,98001.0,47.1559,-122.519
25%,321950.0,3.0,1.75,1427.0,5040.0,1.0,3.0,7.0,1951.0,0.0,98033.0,47.471,-122.328
50%,450000.0,3.0,2.25,1910.0,7618.0,1.5,3.0,7.0,1975.0,0.0,98065.0,47.5718,-122.23
75%,645000.0,4.0,2.5,2550.0,10688.0,2.0,4.0,8.0,1997.0,0.0,98118.0,47.678,-122.125
max,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,5.0,13.0,2015.0,2015.0,98199.0,47.7776,-121.315


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [46]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

---

## Machine Learning Model 1: Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [47]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [48]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
15544,0,0
17454,0,0
21548,0,0
3427,0,0
8809,0,0
...,...,...
12416,0,0
8253,0,0
4251,0,0
11404,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [49]:
# Generate a confusion matrix for the model
cm = confusion_matrix(y_test, predictions)
unique_classes = sorted(list(set(y_test)))
cm_df = pd.DataFrame(
    cm, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [50]:
# Print the classification report for the model
print("** Logistic Regression Model")
print("* Confusion Matrix")
display(cm_df)
print(f"* Accuracy Score : {acc_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions))

** Logistic Regression Model
* Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,4848,0,0,1,19
1,92,0,0,0,3
2,220,0,0,1,6
3,118,0,0,0,11
4,66,0,0,1,18


* Accuracy Score : 0.9004441154700222

* Classification Report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      4868
           1       0.00      0.00      0.00        95
           2       0.00      0.00      0.00       227
           3       0.00      0.00      0.00       129
           4       0.32      0.21      0.25        85

    accuracy                           0.90      5404
   macro avg       0.24      0.24      0.24      5404
weighted avg       0.82      0.90      0.86      5404



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---

---

## Machine Learning Model 2: Decision Tree Model

In [51]:
from sklearn.preprocessing import StandardScaler
from sklearn import tree
# Creating StandardScaler instance
scaler = StandardScaler()

In [52]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [53]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [54]:
# Creating the decision tree classifier instance
model_tree = tree.DecisionTreeClassifier()

In [55]:
# Fitting the model
model_tree = model_tree.fit(X_train_scaled, y_train)

In [56]:
# Making predictions using the testing data
predictions_tree = model_tree.predict(X_test_scaled)

In [57]:
# Generate a confusion matrix for the model
cm_tree = confusion_matrix(y_test, predictions_tree)
cm_tree_df = pd.DataFrame(
    cm_tree, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_tree_score = accuracy_score(y_test, predictions_tree)

In [58]:
# Print the classification report for the model
print("** Decision Tree Model")
print("* Confusion Matrix")
display(cm_tree_df)
print(f"* Accuracy Score : {acc_tree_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_tree))

** Decision Tree Model
* Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,4555,59,160,74,20
1,59,11,19,6,0
2,141,13,35,30,8
3,53,9,31,26,10
4,25,4,12,9,35


* Accuracy Score : 0.8626943005181347

* Classification Report
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      4868
           1       0.11      0.12      0.12        95
           2       0.14      0.15      0.14       227
           3       0.18      0.20      0.19       129
           4       0.48      0.41      0.44        85

    accuracy                           0.86      5404
   macro avg       0.37      0.36      0.37      5404
weighted avg       0.87      0.86      0.87      5404



---

## Machine Learning Model 3: K-Nearest Neighbors (KNN) Model

In [59]:
from sklearn.neighbors import KNeighborsClassifier

# Instantiate the model with k = 3 neighbors
model_knn = KNeighborsClassifier(n_neighbors=3)

In [60]:
# Train the model
model_knn.fit(X_train_scaled, y_train)

In [61]:
# Create predictions
predictions_knn = model_knn.predict(X_test_scaled)

# Review the predictions
predictions_knn

array([0, 0, 0, ..., 0, 0, 0])

In [62]:
# Generate a confusion matrix for the model
cm_knn = confusion_matrix(predictions_knn, y_test)
cm_knn_df = pd.DataFrame(
    cm_knn, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_knn_score = accuracy_score(y_test, predictions_knn)

In [63]:
# Print the classification report for the model
print("** K-Nearest Neighbors (KNN) Model")
print("* Confusion Matrix")
display(cm_knn_df)
print(f"* Accuracy Score : {acc_knn_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_knn))

** K-Nearest Neighbors (KNN) Model
* Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,4796,83,199,109,65
1,8,1,5,5,2
2,46,6,17,8,6
3,12,3,4,5,3
4,6,2,2,2,9


* Accuracy Score : 0.8934122871946706

* Classification Report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      4868
           1       0.05      0.01      0.02        95
           2       0.20      0.07      0.11       227
           3       0.19      0.04      0.06       129
           4       0.43      0.11      0.17        85

    accuracy                           0.89      5404
   macro avg       0.36      0.24      0.26      5404
weighted avg       0.84      0.89      0.86      5404



---

## Machine Learning Model 4: Random Forest Model

In [64]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
model_rf = RandomForestClassifier(n_estimators=500, random_state=78)

In [65]:
# Fitting the model
model_rf = model_rf.fit(X_train_scaled, y_train)

In [66]:
# Making predictions using the testing data
predictions_rf = model_rf.predict(X_test_scaled)

In [67]:
# Generate a confusion matrix for the model
cm_rf = confusion_matrix(predictions_knn, y_test)
cm_rf_df = pd.DataFrame(
    cm_rf, index=unique_classes, columns=unique_classes
)

# Calculating the accuracy score
acc_rf_score = accuracy_score(y_test, predictions_rf)

In [68]:
# Print the classification report for the model
print("** Random Forest Model")
print("* Confusion Matrix")
display(cm_rf_df)
print(f"* Accuracy Score : {acc_rf_score}")
print(f"""
* Classification Report""")
print(classification_report(y_test, predictions_rf))

** Random Forest Model
* Confusion Matrix


Unnamed: 0,0,1,2,3,4
0,4796,83,199,109,65
1,8,1,5,5,2
2,46,6,17,8,6
3,12,3,4,5,3
4,6,2,2,2,9


* Accuracy Score : 0.9063656550703183

* Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      4868
           1       0.00      0.00      0.00        95
           2       0.17      0.03      0.05       227
           3       0.40      0.06      0.11       129
           4       0.67      0.34      0.45        85

    accuracy                           0.91      5404
   macro avg       0.43      0.29      0.31      5404
weighted avg       0.85      0.91      0.87      5404

