# Cleaning and Preprocessing Data for Machine Learning

In [8]:
import warnings
warnings.simplefilter('ignore')
from sklearn.ensemble import RandomForestRegressor

# %matplotlib inline
# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [9]:
# Read the csv file into a pandas DataFrame

data = pd.read_excel('Active and Inconclusive Properties GA.xlsx')
data.head()

Unnamed: 0,NHPD Property ID,Property Name,Property Address,City,State,Zip,CBSACode,CBSA Type,County,County Code,...,State_1_ConstructionType,State_2_ID,State_2_Status,State_2_ProgramName,State_2_StartDate,State_2_EndDate,State_2_AssistedUnits,State_2_InacStatusDesc,State_2_ConstructionType,Old NHPD Property ID
0,1037886,SWAINSBORO PRESBYTERIAN APARTMENTS,310 Mary Ann St,Swainsboro,GA,30401-3050,,,Emanuel,13107.0,...,,,,,,,,,,241.0
1,1037884,TATTNAL EVANS HSNG INC,312 Henry St,Glennville,GA,30427-2456,,,Tattnall,13267.0,...,,,,,,,,,,240.0
2,1015106,PARADISE MOULTRIE APARTMENTS,502 27th St SE,Moultrie,GA,31788-6271,34220.0,Micropolitan Statistical Area,Colquitt,13071.0,...,,,,,,,,,,
3,1038022,HOUSING AUTH OF THE CITY OF DAWSON,545 Lemon St NE,Dawson,GA,39842-1069,10500.0,Metropolitan Statistical Area,Terrell,13273.0,...,,,,,,,,,,20256.0
4,1002320,PINELAND SQUARE,1001 Pineland Ave,Hinesville,GA,31313-5162,25980.0,Metropolitan Statistical Area,Liberty,13179.0,...,,,,,,,,,,


In [10]:

data.dropna(axis=1, how='all',inplace=True)


In [11]:
data.dropna()
data.head()

Unnamed: 0,NHPD Property ID,Property Name,Property Address,City,State,Zip,CBSACode,CBSA Type,County,County Code,...,PH_2_ID,PH_2_Status,PH_2_ProgramName,PH_2_StartDate,PH_2_AssistedUnits,PH_2_PhaCode,NumberActiveState,NumberInconclusiveState,NumberInactiveState,Old NHPD Property ID
0,1037886,SWAINSBORO PRESBYTERIAN APARTMENTS,310 Mary Ann St,Swainsboro,GA,30401-3050,,,Emanuel,13107.0,...,,,,NaT,,,0,0,0,241.0
1,1037884,TATTNAL EVANS HSNG INC,312 Henry St,Glennville,GA,30427-2456,,,Tattnall,13267.0,...,,,,NaT,,,0,0,0,240.0
2,1015106,PARADISE MOULTRIE APARTMENTS,502 27th St SE,Moultrie,GA,31788-6271,34220.0,Micropolitan Statistical Area,Colquitt,13071.0,...,,,,NaT,,,0,0,0,
3,1038022,HOUSING AUTH OF THE CITY OF DAWSON,545 Lemon St NE,Dawson,GA,39842-1069,10500.0,Metropolitan Statistical Area,Terrell,13273.0,...,,,,NaT,,,0,0,0,20256.0
4,1002320,PINELAND SQUARE,1001 Pineland Ave,Hinesville,GA,31313-5162,25980.0,Metropolitan Statistical Area,Liberty,13179.0,...,,,,NaT,,,0,0,0,


In [12]:
np.isnan(data.values.any())

False

In [13]:
#Do not forget to check for inf values as well.
data[data==np.inf]=np.nan
data.fillna(data.mean(), inplace=True)

In [14]:
X = data[["Latitude", "Longitude"]]
y = data["FairMarketRent_2BR"].values.reshape(-1, 1)
print(X.shape, y.shape)

(2161, 2) (2161, 1)


In [15]:
data_X = pd.get_dummies(X)
data_X.head()

Unnamed: 0,Latitude,Longitude
0,32.593485,-82.33603
1,31.93787,-81.92323
2,31.170928,-83.753068
3,31.77931,-84.44241
4,31.81713,-81.6198


## Scaling and Normalization

The final step that we need to perform is scaling and normalization. Many algorithms will perform better with a normalized or scaled dataset. You may not see a difference with the Sklearn LinearRegression model, but other models that use gradient descent need normalization to help the algorithms converge to a local optima.

Sklearn provides a variety of scaling and normalization options. The two most common are minmax and StandardScaler. Use StandardScaler when you don't know anything about your data.

The first step is to split your data into Training and Testing using `train_test_split`.

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

X_train

Unnamed: 0,Latitude,Longitude
1541,32.195080,-84.137000
1629,33.424273,-82.311032
1190,33.740480,-84.313850
585,31.702920,-83.636310
261,33.047551,-83.935615
2156,33.598292,-85.039859
548,34.069078,-84.679831
555,34.355643,-84.034964
634,32.083100,-84.257800
839,33.276817,-85.095222


### StandardScaler

Now, we fit our StandardScaler model to our training data. We can apply this StandardScaler model to any future data. Note that we use this fit/transform approach so that we isolate our testing data from the training data that we use to fit our model. Otherwise, we might bias our model to the testing data. 

In [17]:
from sklearn.preprocessing import StandardScaler
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)

In [18]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [19]:
# Plot the results 

model = RandomForestRegressor(n_estimators=100, random_state = 42)
model.fit(X_train_scaled, y_train_scaled)
#plt.scatter(model.predict(X_train_scaled), model.predict(X_train_scaled) - y_train_scaled, c="blue", label="Training Data")
#plt.scatter(model.predict(X_test_scaled), model.predict(X_test_scaled) - y_test_scaled, c="orange", label="Testing Data")
#plt.legend()
#plt.hlines(y=0, xmin=y_test_scaled.min(), xmax=y_test_scaled.max())
#plt.title("Residual Plot")
#plt.show

predictions = model.predict(X_test_scaled)
predictions

array([ 0.01291445, -0.17515667, -0.76379059,  1.09419402, -1.05330814,
       -1.0052199 , -0.78585848, -1.05330814,  0.45831882, -0.1028361 ,
       -0.64164082, -1.05330814, -0.12341142, -0.67481168,  0.32158259,
        1.09748773, -0.22517409, -1.05330814,  0.52814558,  1.82351662,
       -0.22517409,  1.49790937, -0.14259595, -0.12786834,  0.92903774,
        0.81281666, -0.78999915,  2.21904473, -0.45573414,  1.07960757,
        1.07584332, -0.46514475, -1.0374042 ,  0.46085968,  1.44405129,
       -1.05330814, -0.80863217, -0.67481168, -0.1028361 ,  2.62435989,
        0.53562701, -0.45573414, -0.82096007, -1.05189655, -0.65768592,
       -0.45573414, -1.05330814, -1.05330814, -0.45573414, -0.91403105,
       -1.05330814, -0.40693565, -1.05330814,  1.63059903, -0.1028361 ,
       -0.17515667,  2.55707399,  1.01043955, -0.81004376,  0.52767505,
       -1.05330814,  0.62460437,  0.59213775, -0.22517409,  0.52767505,
        0.02837433, -0.94979138,  0.52767505, -1.05330814,  0.04

Quantify your model using the scaled data

In [20]:

# Calculate the absolute errors
errors = abs(predictions - y_test_scaled)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 1.02 degrees.


In [21]:

#inverse_transform(self, X, copy=None)
y_new_inverse = y_scaler.inverse_transform(predictions)
y_new_inverse

array([ 886.6       ,  846.63      ,  721.53      , 1116.4       ,
        660.        ,  670.22      ,  716.84      ,  660.        ,
        981.26      ,  862.        ,  747.49      ,  660.        ,
        857.62721006,  740.44033069,  952.2       , 1117.1       ,
        836.        ,  660.        ,  996.1       , 1271.4       ,
        836.        , 1202.2       ,  853.55      ,  856.68      ,
       1081.3       , 1056.6       ,  715.96      , 1355.46      ,
        787.        , 1113.3       , 1112.5       ,  785.        ,
        663.38      ,  981.8       , 1190.75375873,  660.        ,
        712.        ,  740.44033069,  862.        , 1441.6       ,
        997.69      ,  787.        ,  709.38      ,  660.3       ,
        744.08      ,  787.        ,  660.        ,  660.        ,
        787.        ,  689.6       ,  660.        ,  797.37094551,
        660.        , 1230.4       ,  862.        ,  846.63      ,
       1427.3       , 1098.6       ,  711.7       ,  996.     

In [22]:
from sklearn.metrics import mean_squared_error

predictions = y_new_inverse
MSE = mean_squared_error(y_test, predictions)
RMSE=np.sqrt(MSE)

print(f"MSE: {MSE}, RMSE: {RMSE}")

MSE: 2168.671607047839, RMSE: 46.56899834705315
