In [25]:
import pandas as pd
from random import random
import math

In [26]:
df = pd.read_csv("wrangledNYC - data.csv")
df.drop("Unnamed: 0", axis=1, inplace=True)
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3536 entries, 0 to 3535
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rent       3536 non-null   float64
 1   bedrooms   3536 non-null   float64
 2   bathrooms  3536 non-null   float64
 3   size_sqft  3536 non-null   float64
dtypes: float64(4)
memory usage: 110.6 KB


Unnamed: 0,rent,bedrooms,bathrooms,size_sqft
0,0.066845,0.0,0.2,0.050549
1,0.545455,0.4,0.4,0.384615
2,0.171123,0.2,0.2,0.146374
3,0.186898,0.2,0.2,0.159341
4,0.866310,0.4,0.4,1.000000
...,...,...,...,...
3531,0.155615,0.2,0.2,0.061978
3532,0.287433,0.4,0.4,0.162198
3533,0.021337,0.0,0.2,0.000000
3534,0.116310,0.2,0.2,0.088132


In [27]:
# Test/Train Split
length = len(df.index)
splitHere = int(length * 0.7)
train = df[0:splitHere]
test = df[splitHere:length]

In [28]:
# Training Algorithm:  
# Reminder --> Standard equation for 3 features: ax + by + cz + d = 0
def trainMRModel(trainDF, iterations):
  '''
  Pseudocode Algorithm:
  1. Naively choose starting weights for initial model. This is model A.
  2. Calculate the distance between data and model A.
  3. Randomly generate steps to take when generating model B from current model A.
    - if distance of model B is less than model A, replace model A with model B.
    - elif distance of model B is NOT less than model A after 50 tries of randomly stepping, keep model A.
  4. Repeat step 3 for desired number of iterations.
  '''
  
  # Initialize naive model A
  modelA = [0.5, 0.5, 0.5, 0.5] # [a, b, c, d]

  distanceA = calculateDistance(trainDF, modelA)
  print("First model - weights: ", modelA, " distance: ", distanceA)
  
  modelB = modelA[:] # make a copy of model A -> 'modelB = modelA' just references original model A
  
  # Randomly take steps for desired iterations
  for i in range(iterations):
    modelB = take_best_step(trainDF, modelB)
  
  # Calculate distance of best model found
  distanceB = calculateDistance(trainDF, modelB)
  print("Final model - weights: ", modelB, " distance: ", distanceB)

In [29]:
# Helper functions
def take_random_step(model):
  return [
      model[0] + (random() - 0.5), # Subtract off 0.5 to generate negative and positive steps
      model[1] + (random() - 0.5),
      model[2] + (random() - 0.5),
      model[3] + (random() - 0.5)
  ]

def take_best_step(trainDF, model):
  old_distance = calculateDistance(trainDF, model)
  for i in range(50): # try 50 times; if we fail to improve, return the old model
    new_model = take_random_step(model)
    new_distance = calculateDistance(trainDF, new_model)
    if new_distance < old_distance:
      return new_model
  return model # by default, return the old model

def calculateDistance(trainDF, model):
  sum = 0
  for i in range(len(trainDF.index[:10])):
    # Get data points
    bed = trainDF.iat[i, 1]
    bath = trainDF.iat[i, 2]
    sqft = trainDF.iat[i, 3]

    # Get coefficients
    a = model[0]
    b = model[1]
    c = model[2]
    d = model[3]

    # Calculate distance using formula: https://i.ytimg.com/vi/zWMTTRJ0l4w/maxresdefault.jpg
    numerator = abs((a * bed) + (b * bath) + (c * sqft) + d)
    denominator = math.sqrt((a * a) + (b * b) + (c * c ) + (d * d))
    distance = numerator / denominator
    sum = sum + distance
 
  return sum

In [30]:
trainMRModel(train, 100)

First model - weights:  [0.5, 0.5, 0.5, 0.5]  distance:  8.77252747254
Final model - weights:  [1.5739333346908857, -2.9433953145048166, 0.2936516821041284, 0.2463025548162293]  distance:  0.42829554951037124


In [31]:
def MSE(test, w1, w2, w3, b):
    sum = 0
    for row in range(0, len(test.index)):
        bedrooms = test.iat[row, 1]
        bathrooms = test.iat[row, 2]
        size_sqft = test.iat[row, 3]
        actual = test.iat[row, 0]
        predicted = (w1 * bedrooms) + (w2 * bathrooms) + (w3 *size_sqft) + b
        sum = sum + (actual - predicted) * (actual - predicted)
    return sum/len(test.index)

In [32]:
MSE(test,-1.7515804406788689,3.596717357220347,-0.43382785150620806,-0.3059737545463622) #10,000 iterations

0.08430112235849602

In [33]:
MSE(test,-1.6086322282630496, 3.2119239086874107, -0.3872185641246365,-0.26360815921849834) #1,000 iterations

0.07733983316047599

In [10]:
MSE(train,-1.7515804406788689,3.596717357220347,-0.43382785150620806,-0.3059737545463622)

0.09501308793697975

In [11]:
MSE(test,0.024651779254613815,0.7765181213835992,-0.43382785150620806,0.01869844856885916)

0.023499981064529777

In [12]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

regr = linear_model.LinearRegression()

regr.fit(train, train['bedrooms'],train['size_sqft'])

LinearRegression()

In [13]:
prediction = regr.predict(test)

In [14]:
regr.coef_

array([ 1.58953127e-16,  1.00000000e+00, -4.59880279e-16, -3.24764095e-16])

In [15]:
mean_squared_error(test['rent'],prediction)

0.02550829570362983

In [21]:
def rent(w1,w2,w3,b):
    
   
    for row in range(0, len(test.index)):
        bedrooms = test.iat[row, 1]
        bathrooms = test.iat[row, 2]
        size_sqft = test.iat[row, 3]
    
        predicted = (w1 * bedrooms) + (w2 * bathrooms) + (w3 *size_sqft) + b
    
    return predicted

In [24]:
predicted =  (1.6086322282630496 * 2) + (3.2119239086874107 * 1) + (0.3872185641246365 * 750) + 0.26360815921849834


print(predicted)

297.1067196179094
