In [1]:
%autosave 10

import numpy as np
import pandas as pd
import random
import math

from HW2Utils import *
from functools import cmp_to_key

Autosaving every 10 seconds


In [2]:
def getData(file):
    """Load the housing data set from a comma-separated text file.

    The file must have a header row containing at least the columns
    ``area``, ``rooms`` and ``price``.

    Parameters
    ----------
    file : str or file-like
        Path (or open handle) of the comma-separated file to read.

    Returns
    -------
    tuple
        ``(features, prices)`` where ``features`` is a 2-D NumPy array with
        the ``area`` and ``rooms`` columns (in that order) and ``prices`` is a
        1-D NumPy array with the ``price`` column.
    """
    data = pd.read_csv(file)  # columns: area, rooms, price
    # `.as_matrix()` was removed in pandas 1.0; `.values` yields the same
    # NumPy arrays and is available on both old and current pandas releases.
    return data[['area', 'rooms']].values, data['price'].values

In [3]:
# Load the (area, rooms) feature matrix and the matching price targets.
data, lables = getData('prices.txt')

In [4]:
# Scale the features with the HW2Utils helper and keep the scaling factors so
# later cells can convert between raw and scaled values.
# NOTE(review): normalize_data presumably divides each column by its maximum
# (later cells multiply/divide by max_area and max_rooms) - confirm in HW2Utils.
# NOTE(review): this cell overwrites `data`; re-running it without re-running the
# load cell would normalize already-normalized data.
max_area = max_rooms = 0
data, max_area, max_rooms = normalize_data(data)
# Prepend a constant 1 to every sample so coefficient 0 acts as the intercept.
data_for_training = [[1, sample[0], sample[1]] for sample in data]

In [5]:
def calculate_error(data, lables, coefficients, alpha=10 ** (-4)):
    """Run one stochastic-gradient pass over the data and report the error.

    For every sample, in order, the prediction is computed with the *current*
    coefficients, the prediction error is accumulated, and the coefficients
    are then immediately updated with a gradient step for that sample.

    Note that despite its name this function is not a pure error measure:
    ``coefficients`` is updated **in place** (and also returned), and the
    reported error is measured while the coefficients are changing during
    the pass.

    Parameters
    ----------
    data : sequence of sequences of numbers
        Samples; every row must have at least ``len(data[0])`` entries.
    lables : sequence of numbers
        Target value for each sample (same length as ``data``).
    coefficients : list of numbers
        Current model coefficients, one per feature. Mutated in place.
    alpha : float, optional
        Learning rate of the per-sample gradient step. Defaults to ``1e-4``,
        the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(standart_deviation, coefficients)``: the root-mean-square of the
        per-sample errors observed during the pass, and the same (updated)
        ``coefficients`` list that was passed in.
    """
    data_len = len(data)
    parameters_count = len(data[0])
    squared_error_sum = 0
    for i in range(data_len):
        row = data[i]
        predicted = sum(coefficients[j] * row[j] for j in range(parameters_count))
        error = predicted - lables[i]
        # Per-sample gradient step, applied before the next sample is evaluated.
        for j in range(parameters_count):
            coefficients[j] = coefficients[j] - alpha * error * row[j]
        squared_error_sum += error ** 2
    standart_deviation = math.sqrt(squared_error_sum / data_len)
    return standart_deviation, coefficients

In [6]:
def gradient_decent_training(data, lables, eps=10 ** (-10), max_iterations=None):
    """Fit linear-model coefficients by repeated gradient passes.

    Starting from all-zero coefficients, ``calculate_error`` is called
    repeatedly (each call performs one pass over the data and updates the
    coefficients) until the reported error changes by less than ``eps``
    between two consecutive passes.

    Parameters
    ----------
    data : sequence of sequences of numbers
        Training samples; ``len(data[0])`` determines the number of
        coefficients.
    lables : sequence of numbers
        Target value for each sample.
    eps : float, optional
        Convergence threshold on the absolute change of the error between
        consecutive passes. Defaults to ``1e-10`` (the previous constant).
    max_iterations : int or None, optional
        Upper bound on the number of passes. ``None`` (default) keeps the
        original behaviour of iterating until convergence.

    Returns
    -------
    tuple
        ``(standart_deviation, coefficients)`` as returned by the last call
        to ``calculate_error``.

    Raises
    ------
    ValueError
        If the error becomes infinite or NaN. In that state the convergence
        test can never succeed, so the loop would otherwise run forever.
    """
    coefficients = [0. for i in range(len(data[0]))]
    standart_deviation = 0
    iteration = 0
    while max_iterations is None or iteration < max_iterations:
        previos_standart_deviation = standart_deviation
        standart_deviation, coefficients = calculate_error(data, lables, coefficients)
        iteration += 1
        if not math.isfinite(standart_deviation):
            raise ValueError(
                'gradient descent diverged after %d iteration(s): error is %r'
                % (iteration, standart_deviation)
            )
        if abs(previos_standart_deviation - standart_deviation) < eps:
            break
    return standart_deviation, coefficients

In [7]:
def genetic_training(data, lables, iterations=2000, vectors_count=50, mutation_range=5000, seed=None):
    """Fit linear-model coefficients by random mutation and truncation selection.

    The population starts with a single all-zero candidate. On every
    iteration each current candidate spawns one child whose coefficients are
    shifted by independent uniform noise in
    ``[-mutation_range / 2, mutation_range / 2)``; the population is then
    sorted by error and cut down to the ``vectors_count`` best candidates.

    Candidates are scored with ``calculate_error``, which also updates the
    coefficient list it is given in place. Each child is therefore adjusted
    by one gradient pass while it is being evaluated, and the returned
    coefficients have received one further pass.

    Parameters
    ----------
    data : sequence of sequences of numbers
        Training samples; ``len(data[0])`` determines the number of
        coefficients.
    lables : sequence of numbers
        Target value for each sample.
    iterations : int, optional
        Number of mutate/select rounds (default 2000).
    vectors_count : int, optional
        Maximum population size kept after each round (default 50).
    mutation_range : float, optional
        Width of the uniform mutation noise (default 5000).
    seed : hashable or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        run is reproducible. ``None`` (default) uses the module-level
        ``random`` state, exactly as before.

    Returns
    -------
    tuple
        ``(standart_deviation, coefficients)`` from a final
        ``calculate_error`` call on the best candidate.
    """
    rng = random if seed is None else random.Random(seed)
    parameters_count = len(data[0])
    init_err, init_coeffs = calculate_error(data, lables, [0. for i in range(parameters_count)])
    coefficients_list = [(init_coeffs, init_err)]
    for it in range(iterations):
        # Iterate over a snapshot: children appended during this round must
        # not themselves be mutated until the next round.
        for parent_coeffs, _ in coefficients_list[:]:
            new_coeffs = [c + mutation_range * (rng.random() - 0.5) for c in parent_coeffs]
            coefficients_list.append((new_coeffs, calculate_error(data, lables, new_coeffs)[0]))
        # Keep only the candidates with the smallest error.
        coefficients_list.sort(key=lambda candidate: candidate[1])
        coefficients_list = coefficients_list[:vectors_count]
    return calculate_error(data, lables, coefficients_list[0][0])

In [8]:
# Fit the model with gradient_decent_training and report the resulting error
# and coefficients.
print('GRADIENT DECENT TRAINING')
standart_deviation, coefficients = gradient_decent_training(data_for_training, lables)
print('Standart deviation:', standart_deviation)
print('Coefficients:', coefficients)
# coefficients[0] pairs with the constant 1 column of data_for_training (the
# intercept); coefficients[1] and [2] pair with the area and rooms columns.
print('Resulted equation:', 'price = (' + str(coefficients[1]) + ') * area + (' + str(coefficients[2]) + ') * rooms + (' + str(coefficients[0]) + ')')

GRADIENT DECENT TRAINING
Standart deviation: 63931.67758272509
Coefficients: [89579.359457160695, 623359.2778823853, -43665.914000270306]
Resulted equation: price = (623359.277882) * area + (-43665.9140003) * rooms + (89579.3594572)


In [9]:
# Fit the model with genetic_training and report the resulting error and
# coefficients. The search draws from `random`, so results vary between runs.
print('GENETIC TRAINING')
genetic_standart_deviation, genetic_coefficients = genetic_training(data_for_training, lables)
print('Standart deviation:', genetic_standart_deviation)
print('Coefficients:', genetic_coefficients)
# genetic_coefficients[0] pairs with the constant 1 column (the intercept);
# [1] and [2] pair with the area and rooms columns.
print('Resulted equation:', 'price = (' + str(genetic_coefficients[1]) + ') * area + (' + str(genetic_coefficients[2]) + ') * rooms + (' + str(genetic_coefficients[0]) + ')')

GENETIC TRAINING
Standart deviation: 63931.68283639354
Coefficients: [89696.265888392343, 623270.99781875056, -43737.9069429152]
Resulted equation: price = (623270.997819) * area + (-43737.9069429) * rooms + (89696.2658884)


In [10]:
# Build a comparison table of the actual price and both models' predictions
# for every training sample. Rows are collected in a list and the DataFrame is
# created once: DataFrame.append was removed in pandas 2.0, and growing a frame
# row by row copies it on every iteration.
cost_columns = ['Area', 'Rooms', 'Price', 'Predicted Price (Gradient decent)', 'Predicted price (Genetic)']
cost_rows = []
for i in range(len(data)):
    predicted_gradient = coefficients[0] + coefficients[1] * data[i][0] + coefficients[2] * data[i][1]
    predicted_genetic = genetic_coefficients[0] + genetic_coefficients[1] * data[i][0] + genetic_coefficients[2] * data[i][1]
    # `data` holds scaled features; multiply by the scaling factors to show the raw values.
    cost_rows.append([data[i][0] * max_area, data[i][1] * max_rooms, lables[i], predicted_gradient, predicted_genetic])
# dtype=float keeps every column floating point, matching the table displayed below.
costs = pd.DataFrame(cost_rows, columns=cost_columns, dtype=float)

In [11]:
# Display the table comparing actual prices with both models' predictions.
costs

Unnamed: 0,Area,Rooms,Price,Predicted Price (Gradient decent),Predicted price (Genetic)
0,2104.0,3.0,399900.0,356266.796467,356299.028514
1,1600.0,3.0,329900.0,286107.556616,286149.724606
2,2400.0,3.0,369000.0,397471.429395,397497.826047
3,1416.0,2.0,232000.0,269227.048676,269287.242663
4,3000.0,4.0,539900.0,472261.151179,472261.32074
5,1985.0,4.0,299900.0,330968.237591,330988.417036
6,1534.0,3.0,314900.0,276920.037111,276963.506237
7,1427.0,3.0,198999.0,262025.119127,262070.697669
8,1380.0,3.0,212000.0,255482.491601,255528.996709
9,1494.0,3.0,242500.0,271351.843472,271396.101165


In [12]:
def predict(area, rooms, isGradient):
    """Predict a price for raw (un-normalized) ``area`` and ``rooms`` values.

    The inputs are divided by the notebook-level ``max_area`` and
    ``max_rooms`` before the linear model is applied.

    Parameters
    ----------
    area, rooms : number
        Raw feature values.
    isGradient : bool
        Truthy to use the notebook-level ``coefficients`` (gradient descent
        fit), falsy to use ``genetic_coefficients``.

    Returns
    -------
    number
        ``c[0] + c[1] * scaled_area + c[2] * scaled_rooms`` for the selected
        coefficient list ``c``.
    """
    scaled_area = area / max_area
    scaled_rooms = rooms / max_rooms
    chosen = coefficients if isGradient else genetic_coefficients
    return chosen[0] + chosen[1] * scaled_area + chosen[2] * scaled_rooms

In [13]:
# Spot-check the gradient-descent coefficients on one hand-picked input.
# predict() scales the values itself, so raw area and rooms are passed here.
area = 3000
rooms = 3
print(predict(area, rooms, True))

480994.33398


In [14]:
# Same spot-check as above, using the coefficients from genetic_training.
area = 3000
rooms = 3
print(predict(area, rooms, False))

481008.902128
