In [50]:
import pandas as pd
import numpy as np

#Read the WestRoxbury csv file
df = pd.read_csv("D:\Data Mining\Datasets\WestRoxbury.csv")
df.head()

Unnamed: 0,TOTAL VALUE,TAX,LOT SQFT,YR BUILT,GROSS AREA,LIVING AREA,FLOORS,ROOMS,BEDROOMS,FULL BATH,HALF BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent
2,330.1,4152,7500,1890,2294,1371,2.0,8,4,1,1,1,0,
3,498.6,6272,13773,1957,5032,2608,1.0,9,5,1,1,1,1,
4,331.5,4170,5000,1910,2370,1438,2.0,7,3,2,0,1,0,


In [51]:
#find the shape and size
print(df.shape)
print(df.size)

(5802, 14)
81228


In [52]:
#Replace all the space with _ for all columns
df.columns = [s.strip().replace(" ","_") for s in df.columns]
df.head()

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE,REMODEL
0,344.2,4330,9965,1880,2436,1352,2.0,6,3,1,1,1,0,
1,412.6,5190,6590,1945,3108,1976,2.0,10,4,2,1,1,0,Recent
2,330.1,4152,7500,1890,2294,1371,2.0,8,4,1,1,1,0,
3,498.6,6272,13773,1957,5032,2608,1.0,9,5,1,1,1,1,
4,331.5,4170,5000,1910,2370,1438,2.0,7,3,2,0,1,0,


In [53]:
#show first 10 values of TOTAL_VALUE columns
df.iloc[0:10]['TOTAL_VALUE']

0    344.2
1    412.6
2    330.1
3    498.6
4    331.5
5    337.4
6    359.4
7    320.4
8    333.5
9    409.4
Name: TOTAL_VALUE, dtype: float64

In [54]:
#show fifth row of first 10 columns
df.iloc[4][0:10]

TOTAL_VALUE    331.5
TAX             4170
LOT_SQFT        5000
YR_BUILT        1910
GROSS_AREA      2370
LIVING_AREA     1438
FLOORS             2
ROOMS              7
BEDROOMS           3
FULL_BATH          2
Name: 4, dtype: object

In [55]:
#Mean of first column
print(df['TOTAL_VALUE'].mean())

392.6857149258885


In [56]:
#Statistical description of the dataset
df.describe()

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE
count,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0,5802.0
mean,392.685715,4939.485867,6278.083764,1936.744916,2924.842123,1657.065322,1.68373,6.994829,3.230093,1.296794,0.613926,1.01534,0.739917
std,99.177414,1247.649118,2669.707974,35.98991,883.984726,540.456726,0.444884,1.437657,0.846607,0.52204,0.533839,0.12291,0.565108
min,105.0,1320.0,997.0,0.0,821.0,504.0,1.0,3.0,1.0,1.0,0.0,1.0,0.0
25%,325.125,4089.5,4772.0,1920.0,2347.0,1308.0,1.0,6.0,3.0,1.0,0.0,1.0,0.0
50%,375.9,4728.0,5683.0,1935.0,2700.0,1548.5,2.0,7.0,3.0,1.0,1.0,1.0,1.0
75%,438.775,5519.5,7022.25,1955.0,3239.0,1873.75,2.0,8.0,4.0,2.0,1.0,1.0,1.0
max,1217.8,15319.0,46411.0,2011.0,8154.0,5289.0,3.0,14.0,9.0,5.0,3.0,2.0,4.0


In [57]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [58]:
#random sample of 5 observations
df.sample(5)

#oversample houses with more than 10 rooms
weights = [0.9 if r >10 else 0.01 for r in df.ROOMS]
df.sample(5,weights=weights)

Unnamed: 0,TOTAL_VALUE,TAX,LOT_SQFT,YR_BUILT,GROSS_AREA,LIVING_AREA,FLOORS,ROOMS,BEDROOMS,FULL_BATH,HALF_BATH,KITCHEN,FIREPLACE,REMODEL
923,330.2,4153,7092,1950,2778,1270,1.0,7,3,1,0,1,1,
5175,407.3,5123,7011,1925,2888,1752,2.0,6,3,2,1,1,1,
4968,406.0,5107,7660,1950,2252,960,1.0,12,2,1,0,1,1,
3224,629.9,7924,11533,1918,5934,3927,2.5,11,6,1,1,1,2,
4261,318.8,4010,5772,1925,2013,968,1.0,5,2,1,0,1,1,


In [59]:
#convert REMODEL variable to categorical variable
df.REMODEL = df.REMODEL.astype('category')
df['REMODEL'].unique()

['None', 'Recent', 'Old']
Categories (3, object): ['None', 'Recent', 'Old']

In [60]:
# use drop_first=True to drop the first dummy variable
df = pd.get_dummies(df, prefix_sep='_', drop_first=True)
df.columns

Index(['TOTAL_VALUE', 'TAX', 'LOT_SQFT', 'YR_BUILT', 'GROSS_AREA',
       'LIVING_AREA', 'FLOORS', 'ROOMS', 'BEDROOMS', 'FULL_BATH', 'HALF_BATH',
       'KITCHEN', 'FIREPLACE', 'REMODEL_Old', 'REMODEL_Recent'],
      dtype='object')

In [61]:
#Checking for null values
df.isnull().sum()

TOTAL_VALUE       0
TAX               0
LOT_SQFT          0
YR_BUILT          0
GROSS_AREA        0
LIVING_AREA       0
FLOORS            0
ROOMS             0
BEDROOMS          0
FULL_BATH         0
HALF_BATH         0
KITCHEN           0
FIREPLACE         0
REMODEL_Old       0
REMODEL_Recent    0
dtype: int64

In [62]:
# To illustrate missing data procedures, we first convert a few entries for
# bedrooms to NA’s. Then we impute these missing values using the median of the
# remaining values.
missingRows = df.sample(10).index
df.loc[missingRows, 'BEDROOMS'] = np.nan
print('Number of rows with valid BEDROOMS values after setting to NAN: ',
df['BEDROOMS'].count())

# remove rows with missing values
reduced_df = df.dropna()
print('Number of rows after removing rows with missing values: ', len(reduced_df))

# replace the missing values using the median of the remaining values.
medianBedrooms = df['BEDROOMS'].median()
df.BEDROOMS = df.BEDROOMS.fillna(value=medianBedrooms)
print('Number of rows with valid BEDROOMS values after filling NA values: ',
df['BEDROOMS'].count())


Number of rows with valid BEDROOMS values after setting to NAN:  5792
Number of rows after removing rows with missing values:  5792
Number of rows with valid BEDROOMS values after filling NA values:  5802


In [63]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
df1 = df.copy()
# Normalizing a data frame
# pandas:
norm_df = (df - df.mean()) / df.std()
# scikit-learn:
scaler = StandardScaler()
norm_df = pd.DataFrame(scaler.fit_transform(df), index=df.index,
columns=df.columns)
# the result of the transformation is a numpy array, we convert it into a dataframe

# Rescaling a data frame
# pandas:
norm_df = (df - df.min()) / (df.max() - df.min())
# scikit-learn:
scaler = MinMaxScaler()
norm_df = pd.DataFrame(scaler.fit_transform(df), index=df.index,
columns=df.columns)

In [64]:
# random_state is set to a defined value to get the same partitions when re-running
# the code
# training (60
trainData, validData = train_test_split(df, test_size=0.40, random_state=1)
print('Training : ', trainData.shape)
print('Validation : ', validData.shape)
print()
# training (50
trainData, temp = train_test_split(df, test_size=0.5, random_state=1)
validData, testData = train_test_split(temp, test_size=0.4, random_state=1)
print('Training : ', trainData.shape)
print('Validation : ', validData.shape)
print('Test : ', testData.shape)

Training :  (3481, 15)
Validation :  (2321, 15)

Training :  (2901, 15)
Validation :  (1740, 15)
Test :  (1161, 15)


In [65]:
from sklearn.linear_model import LinearRegression

# data loading and preprocessing
housing_df = pd.read_csv("D:\Data Mining\Datasets\WestRoxbury.csv")
housing_df.columns = [s.strip().replace(' ', '_') for s in housing_df.columns]
housing_df = pd.get_dummies(housing_df, prefix_sep='_', drop_first=True)
# create list of predictors and outcome
excludeColumns = ('TOTAL_VALUE', 'TAX')
predictors = [s for s in housing_df.columns if s not in excludeColumns]
outcome = 'TOTAL_VALUE'
# partition data
X = housing_df[predictors]
y = housing_df[outcome]
train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.4, random_state=1)
model = LinearRegression()
model.fit(train_X, train_y)
train_pred = model.predict(train_X)
train_results = pd.DataFrame({ 'TOTAL_VALUE': train_y, 'predicted': train_pred, 'residual':
train_y - train_pred })
train_results.head()

Unnamed: 0,TOTAL_VALUE,predicted,residual
2024,392.0,387.726258,4.273742
5140,476.3,430.78554,45.51446
5259,367.4,384.042952,-16.642952
421,350.3,369.005551,-18.705551
1401,348.1,314.725722,33.374278


In [66]:
valid_pred = model.predict(valid_X)
valid_results = pd.DataFrame({ 'TOTAL_VALUE': valid_y, 'predicted': valid_pred, 'residual':
valid_y - valid_pred })
valid_results.head()

Unnamed: 0,TOTAL_VALUE,predicted,residual
1822,462.0,406.946377,55.053623
1998,370.4,362.888928,7.511072
5126,407.4,390.287208,17.112792
808,316.1,382.470203,-66.370203
4034,393.2,434.334998,-41.134998


In [67]:
# import the utility function regressionSummary
from dmba import regressionSummary

# training set
regressionSummary(train_results.TOTAL_VALUE, train_results.predicted)

# validation set
regressionSummary(valid_results.TOTAL_VALUE, valid_results.predicted)


Regression statistics

                      Mean Error (ME) : -0.0000
       Root Mean Squared Error (RMSE) : 43.0306
            Mean Absolute Error (MAE) : 32.6042
          Mean Percentage Error (MPE) : -1.1116
Mean Absolute Percentage Error (MAPE) : 8.4886

Regression statistics

                      Mean Error (ME) : -0.1463
       Root Mean Squared Error (RMSE) : 42.7292
            Mean Absolute Error (MAE) : 31.9663
          Mean Percentage Error (MPE) : -1.0884
Mean Absolute Percentage Error (MAPE) : 8.3283


In [70]:
from sklearn.metrics import classification_report, accuracy_score
train_pred2 = model.predict(train_y)
print("the accuracy  of training data is : /n", accuracy_score(train_pred2,valid_y))

ValueError: Expected 2D array, got 1D array instead:
array=[392.  476.3 367.4 ... 404.8 325.  374.3].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.