In [51]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import warnings

# NOTE(review): a blanket "ignore" filter silences every warning for the rest
# of the session, including pandas warnings about chained assignment —
# consider narrowing or removing it.
warnings.filterwarnings("ignore")
# Lift pandas' display limits so rendered frames show every column and row.
# NOTE(review): with max_rows unset, preview frames with .head() rather than
# displaying them whole.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
%matplotlib inline 

In [101]:
# Load the raw listings from disk and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="data.csv")
df.head(5)

Unnamed: 0,Price,Land Size,Building Size,Year Built,Rooms,County
0,"USD $32,500","22,500.54 m2",,,,Somerset
1,"USD $257,000","94,494.18 m2",,,,Faulkner
2,"USD $49,000","3,318.43 m2",,,,Rhea
3,"USD $109,900","51,435.59 m2",,,,Muskegon
4,"USD $185,000","189,433.52 m2",,,,Ottawa


In [3]:
# Summarise column dtypes and non-null counts to see which columns need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2470 entries, 0 to 2469
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Price          2470 non-null   object 
 1   Land Size      2051 non-null   object 
 2   Building Size  2046 non-null   object 
 3   Year Built     2121 non-null   float64
 4   Rooms          2161 non-null   object 
 5   County         2319 non-null   object 
dtypes: float64(1), object(5)
memory usage: 115.9+ KB


In [102]:
# Keep only the rows in which every column is populated.
# `dropna` does this in a single vectorized call; the previous per-row loop
# collected *positional* indices and then dropped them as *labels*, which is
# only correct while the frame still has its default 0..n-1 index.
# `reset_index(drop=True)` renumbers the surviving rows from 0 without
# creating a temporary "index" column that has to be removed afterwards.
df = df.dropna(axis=0, how="any").reset_index(drop=True)
df.head()

Unnamed: 0,Price,Land Size,Building Size,Year Built,Rooms,County
0,"USD $279,900","1,173.59 m2",149.85 m2,1997.0,"3 bedroom(s), 2 bathroom(s)",Chatham County
1,"USD $745,000","2,221.50 m2",284.56 m2,2025.0,"4 bedroom(s), 5 bathroom(s), 1 livingroom(s)",Stanislaus
2,"USD $225,000","2,387.65 m2",115.94 m2,1964.0,"3 bedroom(s), 2 bathroom(s)",Monroe
3,"USD $724,900","1,214.06 m2",309.74 m2,2014.0,"5 bedroom(s), 4 bathroom(s)",Wake
4,"USD $247,900",809.37 m2,165.00 m2,1970.0,"3 bedroom(s), 3 bathroom(s), 1 livingroom(s)",Grant


In [103]:
# Count the remaining missing values per column.
df.isnull().sum()

Price            0
Land Size        0
Building Size    0
Year Built       0
Rooms            0
County           0
dtype: int64

## Extract the numeric value from the "Price" column

In [104]:
# Turn price strings such as "USD $279,900" into integers.
# The conversion is done column-wise and written back with a single
# assignment: the previous `df["Price"][i] = ...` form is chained assignment,
# which pandas warns about and which does not update `df` when Copy-on-Write
# is enabled.
price_digits = (
    df["Price"]
    .astype(str)                        # lets the cell be re-run after conversion
    .str.split("$").str[-1]             # keep the text after the last "$"
    .str.replace(",", "", regex=False)  # drop thousands separators
    .str.strip()
)
df["Price"] = price_digits.astype("int64")
df.head()

Unnamed: 0,Price,Land Size,Building Size,Year Built,Rooms,County
0,279900,"1,173.59 m2",149.85 m2,1997.0,"3 bedroom(s), 2 bathroom(s)",Chatham County
1,745000,"2,221.50 m2",284.56 m2,2025.0,"4 bedroom(s), 5 bathroom(s), 1 livingroom(s)",Stanislaus
2,225000,"2,387.65 m2",115.94 m2,1964.0,"3 bedroom(s), 2 bathroom(s)",Monroe
3,724900,"1,214.06 m2",309.74 m2,2014.0,"5 bedroom(s), 4 bathroom(s)",Wake
4,247900,809.37 m2,165.00 m2,1970.0,"3 bedroom(s), 3 bathroom(s), 1 livingroom(s)",Grant


In [105]:
# Make sure the "Price" column carries a numeric dtype.
df["Price"] = df["Price"].pipe(pd.to_numeric)

## Convert "Land Size" and "Building Size" to float64 values

In [None]:
# Turn area strings such as "1,173.59 m2" into floats.
# Vectorized and assigned in one step: the previous `df["Land Size"][i] = ...`
# form is chained assignment (no effect under Copy-on-Write), and unpacking
# `split(".")` into two names raised ValueError for any value written without
# a decimal point.
land_size_text = (
    df["Land Size"]
    .astype(str)                                # lets the cell be re-run after conversion
    .str.replace(r"\s*m2\s*$", "", regex=True)  # drop the trailing unit
    .str.replace(",", "", regex=False)          # drop thousands separators
)
# to_numeric raises on anything that is still not a number, so unexpected
# formats surface here instead of being silently mangled.
df["Land Size"] = pd.to_numeric(land_size_text).astype("float64")

df.head()

In [None]:
# Turn area strings such as "149.85 m2" into floats.
# Vectorized and assigned in one step: the previous
# `df["Building Size"][i] = ...` form is chained assignment (no effect under
# Copy-on-Write), and unpacking `split(".")` into two names raised ValueError
# for any value written without a decimal point.
building_size_text = (
    df["Building Size"]
    .astype(str)                                # lets the cell be re-run after conversion
    .str.replace(r"\s*m2\s*$", "", regex=True)  # drop the trailing unit
    .str.replace(",", "", regex=False)          # drop thousands separators
)
# to_numeric raises on anything that is still not a number, so unexpected
# formats surface here instead of being silently mangled.
df["Building Size"] = pd.to_numeric(building_size_text).astype("float64")

df.head()

In [108]:
# Years were read as floats (e.g. 1997.0); store them as integers instead.
df = df.astype({"Year Built": int})

## Divide the "Rooms" column into bedroom, bathroom and living-room features

In [111]:
# Create one count column per room type, defaulting to zero for listings
# that do not mention that type.
for room_column in ("bedroom(s)", "bathroom(s)", "livingroom(s)"):
    df[room_column] = 0

In [112]:
def rooms_col_processor(row):
    """Spread the free-text "Rooms" value of one row into per-room count fields.

    The "Rooms" text is a comma-separated list of "<count> <label>" pieces,
    e.g. "3 bedroom(s), 2 bathroom(s)". For every piece, the field named by
    the last word is set to the leading count.

    Parameters
    ----------
    row : pandas.Series
        A single DataFrame row containing a string under "Rooms".

    Returns
    -------
    pandas.Series
        A copy of ``row`` with one numeric field set per piece found in
        "Rooms". Fields for labels not mentioned are left as they were.

    Raises
    ------
    ValueError
        If the first word of a piece is not a number.
    """
    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # discouraged and can leak changes back into the source frame.
    row = row.copy()
    for piece in row["Rooms"].split(","):
        tokens = piece.split()
        if not tokens:
            # Empty piece (e.g. a trailing comma) — nothing to record.
            continue
        # Store a number rather than the raw text so the resulting columns do
        # not mix strings with the integer 0 defaults.
        count = float(tokens[0])
        row[tokens[-1]] = int(count) if count.is_integer() else count
    return row

In [113]:
# Expand "Rooms" into the per-room count columns row by row, then discard the
# original free-text column.
df = df.apply(rooms_col_processor, axis="columns").drop(columns=["Rooms"])

df.head()

Unnamed: 0,Building Size,County,Land Size,Price,Rooms,Year Built,bathroom(s),bedroom(s),livingroom(s)
0,149.85,Chatham County,1173.59,279900,"3 bedroom(s), 2 bathroom(s)",1997,2,3,0
1,284.56,Stanislaus,2221.5,745000,"4 bedroom(s), 5 bathroom(s), 1 livingroom(s)",2025,5,4,1
2,115.94,Monroe,2387.65,225000,"3 bedroom(s), 2 bathroom(s)",1964,2,3,0
3,309.74,Wake,1214.06,724900,"5 bedroom(s), 4 bathroom(s)",2014,4,5,0
4,165.0,Grant,809.37,247900,"3 bedroom(s), 3 bathroom(s), 1 livingroom(s)",1970,3,3,1


In [118]:
# Persist the cleaned frame; the row index is not written to the file.
df.to_csv(path_or_buf="cleaned_dataset.csv", index=False)