In [187]:
from imblearn.over_sampling import SMOTE

In [227]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from imblearn.under_sampling import NearMiss

%matplotlib inline 

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Load the dataset
df = pd.read_csv("housing.csv")

In [228]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [229]:
# Check the data types of each column and tells you how many rows and columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [230]:
#print out the columns with missing values in descending order 
missing_values = df.isnull().sum()
highest_counts = missing_values.sort_values(ascending=False)
highest_counts[highest_counts>0]

total_bedrooms    207
dtype: int64

In [193]:
# List of categorical columns
categorical = df.select_dtypes(include=['object']).columns

# Replace nulls with 'NA' in categorical columns
df[categorical] = df[categorical].fillna('NA')

# Optionally, you can verify the changes
print(df.head())

   MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley Lot Shape  \
0           20        RL         141.0     31770   Pave    NA       IR1   
1           20        RH          80.0     11622   Pave    NA       Reg   
2           20        RL          81.0     14267   Pave    NA       IR1   
3           20        RL          93.0     11160   Pave    NA       Reg   
4           60        RL          74.0     13830   Pave    NA       IR1   

  Land Contour Utilities Lot Config  ... Pool Area Pool QC  Fence  \
0          Lvl    AllPub     Corner  ...         0      NA     NA   
1          Lvl    AllPub     Inside  ...         0      NA  MnPrv   
2          Lvl    AllPub     Corner  ...         0      NA     NA   
3          Lvl    AllPub     Corner  ...         0      NA     NA   
4          Lvl    AllPub     Inside  ...         0      NA  MnPrv   

  Misc Feature Misc Val Mo Sold  Yr Sold  Sale Type  Sale Condition  SalePrice  
0           NA        0       5     2010        WD   

In [194]:
# List of numeric columns
numeric = df.select_dtypes(include=['number']).columns

# Replace nulls with 0.0 in numeric columns
df[numeric] = df[numeric].fillna(0.0)

# Optionally, you can verify the changes
print(df.head())

   MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley Lot Shape  \
0           20        RL         141.0     31770   Pave    NA       IR1   
1           20        RH          80.0     11622   Pave    NA       Reg   
2           20        RL          81.0     14267   Pave    NA       IR1   
3           20        RL          93.0     11160   Pave    NA       Reg   
4           60        RL          74.0     13830   Pave    NA       IR1   

  Land Contour Utilities Lot Config  ... Pool Area Pool QC  Fence  \
0          Lvl    AllPub     Corner  ...         0      NA     NA   
1          Lvl    AllPub     Inside  ...         0      NA  MnPrv   
2          Lvl    AllPub     Corner  ...         0      NA     NA   
3          Lvl    AllPub     Corner  ...         0      NA     NA   
4          Lvl    AllPub     Inside  ...         0      NA  MnPrv   

  Misc Feature Misc Val Mo Sold  Yr Sold  Sale Type  Sale Condition  SalePrice  
0           NA        0       5     2010        WD   

In [195]:
#Add all the baths together in the house and put into one column then delete the rest. We only care how many baths there are
bath_columns = [col for col in df.columns if 'Bath' in col]

# Check if any columns contain the word 'bath'
if not bath_columns:
    print("No columns containing the word 'bath' found.")
else:
    # Summing up the bath columns
    total_baths = df[bath_columns].sum(axis=1)

    # Storing the total in a variable called 'Total Baths'
    df['Total Baths'] = total_baths

    # Drop the original bath columns
    df.drop(columns=bath_columns, inplace=True)

    # Display the DataFrame to verify changes
    print(df.head())

   MS SubClass MS Zoning  Lot Frontage  Lot Area Street Alley Lot Shape  \
0           20        RL         141.0     31770   Pave    NA       IR1   
1           20        RH          80.0     11622   Pave    NA       Reg   
2           20        RL          81.0     14267   Pave    NA       IR1   
3           20        RL          93.0     11160   Pave    NA       Reg   
4           60        RL          74.0     13830   Pave    NA       IR1   

  Land Contour Utilities Lot Config  ... Pool QC  Fence Misc Feature Misc Val  \
0          Lvl    AllPub     Corner  ...      NA     NA           NA        0   
1          Lvl    AllPub     Inside  ...      NA  MnPrv           NA        0   
2          Lvl    AllPub     Corner  ...      NA     NA         Gar2    12500   
3          Lvl    AllPub     Corner  ...      NA     NA           NA        0   
4          Lvl    AllPub     Inside  ...      NA  MnPrv           NA        0   

  Mo Sold Yr Sold  Sale Type  Sale Condition  SalePrice  Total

In [196]:
# Creating ' Finished Garage' column for Finished or Rough Finished garages.
#This is because it's relevant whether the garage is finished and also whether its detached when selling property (bias?)
searchfor = ['Fin', 'RFn']
df['Finished Garage'] = np.where(df['Garage Finish'].str.contains('|'.join(searchfor)), 1, 0)

# Creating 'Has Detached Garage' column
df['Detached Garage'] = np.where(df['Garage Type'].str.contains('Detchd'), 1, 0)

# Dropping unnecessary columns
df.drop(['Garage Finish', 'Garage Yr Blt', 'Garage Type'], axis=1, inplace=True)

In [197]:
#We only need "Bsmt Qual" to determine if a basement exists or not
df.drop(['BsmtFin Type 1', 'BsmtFin Type 2', 'Bsmt Cond', 'Bsmt Exposure'], axis=1, inplace=True)


In [198]:
# Create a new column 'Is Remodeled' based on 'Year Remod/Add'
df['Remodeled'] = (df['Year Remod/Add'] != df['Year Built']).astype(int)

# Drop the 'Year Remod/Add' column
df.drop(columns=['Year Remod/Add'], inplace=True)

From looking at https://jse.amstat.org/v19n3/decock/DataDocumentation.txt, I can dig out Ordinal values with scales as follows:  Ex-Gd-TA-Fa-Po-NA” scale, so I’ll define a function to convert them all to a “5–4–3–2–1–0” scale at the same time.   NA

In [199]:
def scale(df, column_list):
    for column in column_list:
        df[column] = df[column].map({'Ex':5, 'Gd':4, 'TA':3, 'Fa':2, 'Po':1, 'NA':0})
        df[column] = pd.to_numeric(df[column])
        df[column] = df[column].fillna(0)

column_list = ['Bsmt Qual', 'Exter Qual', 'Exter Cond', 'Fireplace Qu', 'Garage Qual', 'Garage Cond', 'Pool QC', 'Kitchen Qual', 'Heating QC']
scale(df, column_list)

In [200]:
# Created dummy columns

df = pd.get_dummies(df, columns = ['Lot Config', 'MS Zoning', 'Misc Feature', 
                                   'House Style', 'Bldg Type', 'Mas Vnr Type', 'Condition 1', 'Roof Style', 'Heating', 'Sale Condition', 'Street', 'Exterior 1st', 'Neighborhood'], drop_first=True, dtype=int)

In [201]:
df["Lot Shape"] = df["Lot Shape"].map({"Reg":3,"IR1":2, "IR2":1, "IR2":0}).fillna(0)
df["Utilities"] = df["Utilities"].map({"AllPub":3,"NoSewr":2, "NoSeWa":1, "ELO":0}).fillna(0)
df["Land Slope"] = df["Land Slope"].map({"Gtl":2,"Mod":1, "Sev":0}).fillna(0)
df["Electrical"] = df["Electrical"].map({"SBrkr":4, "FuseA":3,"FuseF":2, "FuseP":1, "Mix":0}).fillna(0)
df["Functional"] = df["Functional"].map({"Typ":7, "Min1":6, "Min2":5, "Mod":4, "Maj1":3,"Maj2":2, "Sev":1, "Sal":0}).fillna(0)
df["Paved Drive"] = df["Paved Drive"].map({"N":0, "P":1, "Y":2}).fillna(0)
df["Fence"] = df["Fence"].map({"GdPrv":4, "MnPrv":3,"GdWo":2, "MnWw":1, "NA":0}).fillna(0)
df["Central Air"] = df["Central Air"].map({"Y":1, "N":0}).fillna(0)
#df["Misc Feature"] = df["Misc Feature"].map({"Elev":5, "Gar2":4, "Othr":3,"Shed":2, "TenC":1, "NA":0}).fillna(0)
#df["Alley"] = df["Alley"].map({"Grvl":2,"Pave":1, "NA":0}).fillna(0)
#df["Mas Vnr Type"] = df["Mas Vnr Type"].map({"Stone":4, "BrkFace":3,"BrkCmn":2, "CBlock":1, "None":0}).fillna(0)
#df["Garage Type"] = df["Garage Type"].map({"Attchd":6, "BuitIn":5, "Basment":4, "Detchd":3,"2Types":2, "CarPort":1, "NA":0}).fillna(0)

In [202]:
#Convert Year Build into Age
import datetime
# Get the current year
current_year = datetime.datetime.now().year

# Create a new column 'Age' based on the year column
df['Age'] = current_year - df['Year Built']

# Optionally, drop the original year column
df.drop(columns=['Year Built'], inplace=True)

In [203]:
# Consolidating Alley columns into 'Has Alley'

lookfor = 'Grvl', 'Pave'

df['Has Alley'] = np.where(df['Alley'].str.contains('|'.join(lookfor)), 1, 0)


df.drop(['Alley'], axis=1, inplace=True)

In [204]:
#Drop BsmtFin SF 1,	BsmtFin SF 2, Bsmt Unf SF and keep Total Bsmt SF (its the total)
#Drop '1st Flr SF',	'2nd Flr SF', 'Low Qual Fin SF' and keep Gr Liv Area (it's the total)
 
df.drop(['BsmtFin SF 1','BsmtFin SF 2',	'Bsmt Unf SF'], axis=1, inplace=True)
df.drop(['1st Flr SF',	'2nd Flr SF', 'Low Qual Fin SF'], axis=1, inplace=True)

In [205]:
numerical = df.select_dtypes("number")
numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Columns: 143 entries, MS SubClass to Has Alley
dtypes: float64(8), int32(100), int64(35)
memory usage: 2.1 MB


In [206]:
#Drop Land Contour, Roof Matl
#Drop Condition 2, Exterior 2nd, Foundation as not important (possible bias from me)
df.drop(['Land Contour', 'Condition 2', 'Roof Matl', 'Foundation', 'Sale Type', 'Exterior 2nd'], axis=1, inplace=True)

In [207]:
# Now examine the strings
string = df.select_dtypes(object)
string.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Empty DataFrame


In [208]:
df.to_csv('df.csv', index=False) #output to df.csv to help me analyse

In [209]:

# Calculate the number of zeros in each column
zero_counts = df.eq(0).sum()

# Sort the columns based on the number of zeros (descending order)
sorted_zero_counts = zero_counts.sort_values(ascending=False)

# Print all columns with their corresponding counts of zeros
print("Columns with the most zeros:")
for column_name, count in sorted_zero_counts.items():
    print(f"{column_name}: {count}")

Columns with the most zeros:
Mas Vnr Type_CBlock: 2929
Misc Feature_TenC: 2929
Exterior 1st_ImStucc: 2929
Neighborhood_Landmrk: 2929
Exterior 1st_PreCast: 2929
Exterior 1st_AsphShn: 2928
Exterior 1st_Stone: 2928
MS Zoning_I (all): 2928
Exterior 1st_CBlock: 2928
Heating_OthW: 2928
Neighborhood_GrnHill: 2928
Misc Feature_Othr: 2926
Misc Feature_Gar2: 2925
Roof Style_Shed: 2925
Exterior 1st_BrkComm: 2924
Condition 1_RRNe: 2924
Heating_Wall: 2924
Neighborhood_Greens: 2922
House Style_2.5Fin: 2922
Condition 1_RRNn: 2921
Heating_Grav: 2921
Neighborhood_Blueste: 2920
Roof Style_Mansard: 2919
Sale Condition_AdjLand: 2918
Pool QC: 2917
Pool Area: 2917
Lot Config_FR3: 2916
House Style_1.5Unf: 2911
Condition 1_PosA: 2910
Roof Style_Gambrel: 2908
Neighborhood_NPkVill: 2907
House Style_2.5Unf: 2906
Neighborhood_Veenker: 2906
Sale Condition_Alloca: 2906
MS Zoning_C (all): 2905
Heating_GasW: 2903
MS Zoning_RH: 2903
Condition 1_RRAe: 2902
Neighborhood_BrDale: 2900
Neighborhood_MeadowV: 2893
3Ssn Porch

In [216]:
# Remove the last column 
X = df.drop("SalePrice", axis = 1)    # Independent variables
y = df.SalePrice                      # Dependent variable

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [217]:
X_train

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Mas Vnr Area,Exter Qual,...,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Age,Has Alley
533,20,80.0,9605,3.0,3,2,7,6,0.0,4,...,0,0,0,1,0,0,0,0,17,0
802,20,90.0,14684,2.0,3,2,7,7,234.0,4,...,0,0,0,1,0,0,0,0,34,0
955,20,0.0,14375,2.0,1,2,6,6,541.0,3,...,0,0,0,0,0,0,1,0,66,0
459,120,48.0,6472,3.0,3,2,9,5,500.0,5,...,0,0,0,0,0,0,0,0,16,0
486,80,61.0,9734,2.0,3,2,7,5,0.0,4,...,0,0,0,0,0,0,0,0,20,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2763,60,80.0,10041,2.0,3,2,8,5,0.0,4,...,0,0,0,1,0,0,0,0,32,0
905,50,70.0,6300,3.0,3,2,5,4,88.0,3,...,0,1,0,0,0,0,0,0,86,1
1096,60,41.0,12460,2.0,3,2,7,5,0.0,4,...,0,0,0,0,0,0,0,0,25,0
235,30,85.0,10625,3.0,3,2,5,5,0.0,3,...,0,0,0,0,0,0,0,0,104,0


In [218]:
X_test

Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Mas Vnr Area,Exter Qual,...,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Age,Has Alley
2126,20,60.0,8070,3.0,3,2,4,5,0.0,3,...,0,0,0,0,0,0,0,0,30,0
192,75,0.0,7793,2.0,3,2,7,7,0.0,3,...,0,0,0,0,0,0,0,0,102,0
2406,120,40.0,6792,2.0,3,2,7,5,94.0,4,...,0,0,0,0,0,0,0,0,19,0
45,120,44.0,6371,2.0,3,2,7,5,128.0,4,...,0,0,0,0,0,0,0,0,15,0
2477,60,70.0,8304,2.0,3,2,6,5,0.0,3,...,0,0,0,1,0,0,0,0,27,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
452,120,34.0,5381,2.0,3,2,6,5,135.0,4,...,0,0,0,0,0,0,0,0,19,0
1351,50,64.0,13053,3.0,3,2,6,7,0.0,3,...,0,0,0,0,0,0,0,0,101,1
196,50,53.0,6360,3.0,3,2,5,6,300.0,3,...,0,0,0,0,0,0,0,0,82,0
2849,70,43.0,7000,3.0,3,2,7,8,0.0,3,...,0,1,0,0,0,0,0,0,98,0


In [219]:
y_train

533     159000
802     271900
955     137500
459     248500
486     167000
         ...  
2763    220000
905     160000
1096    225000
235      83000
1061    250000
Name: SalePrice, Length: 2197, dtype: int64

In [220]:
X.shape, y.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2930, 142), (2930,), (2197, 142), (733, 142), (2197,), (733,))

In [222]:
# Create an object by calling a method LogisticRegression()
lr = LogisticRegression()

# Train the model by calling a fit() method
lr.fit(X_train, y_train)

# Predict the values
y_pred = lr.predict(X_test)

# Call and display confusion matrix results
confusion_matrix(y_test, y_pred)

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [224]:
# Accuracy and recall based on Logistic regression on imbalanced dataset
accuracy = accuracy_score(y_test, y_pred)

recall = recall_score(y_test, y_pred, average=None)

print(accuracy, recall)

0.010914051841746248 [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.2
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.4        0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         

In [226]:
# SMOTE

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

smt = SMOTE()

X_train, y_train = smt.fit_resample(X_train, y_train)

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 1, n_neighbors = 6