In [554]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from scipy import stats
import warnings
import math
import random
warnings.filterwarnings('ignore')
%matplotlib inline

In [555]:
class FeatureExtractor:
    def normalize_data(self, dataframe):
        dataframe = dataframe.copy()
        
        for col in dataframe:
            new_col = []
            for elem in dataframe[col]:
                if type(elem) == unicode:
                    new_col += [elem.encode('utf-8')]
                else:
                    new_col += [elem]
            dataframe[col] = new_col

        
        dataframe = dataframe.query('geocode_lat != \'None\'')

        dataframe[u'geocode_lat'] = dataframe[u'geocode_lat'].convert_objects(convert_numeric = True)
        dataframe[u'geocode_long'] = dataframe[u'geocode_long'].convert_objects(convert_numeric = True)
    
        dataframe[u'price'] = dataframe[u'price'].map(lambda x: x.translate(None, ' ₸') if type(x) == str else x)
        dataframe[u'price'] = dataframe[u'price'].convert_objects(convert_numeric = True)

        dataframe[u'all_space'] = dataframe[u'all_space'].map(lambda x: x.rstrip(' м2') if type(x) == str else x)
        dataframe[u'all_space'] = dataframe[u'all_space'].convert_objects(convert_numeric = True)

        dataframe[u'living_space'] = dataframe[u'living_space'].map(lambda x: x.rstrip(' м2') if type(x) != float else '0')
        dataframe[u'living_space'] = dataframe[u'living_space'].convert_objects(convert_numeric = True)

        dataframe[u'kitchen_space'] = dataframe[u'kitchen_space'].map(lambda x: x.rstrip(' м2') if type(x) != float else '0')
        dataframe[u'kitchen_space'] = dataframe[u'kitchen_space'].convert_objects(convert_numeric = True)

        dataframe[u'built_time'] = dataframe[u'built_time'].map(lambda x: x.rstrip(' г.п.') if type(x) == str else x)
        dataframe[u'built_time'] = dataframe[u'built_time'].convert_objects(convert_numeric = True)
        
        dataframe[u'age'] = 2017 - dataframe[u'built_time']
        
        dataframe = self.modify_house_type(dataframe)
        dataframe = self.get_log_space(dataframe)
        dataframe = self.get_price_for_sqr_meter(dataframe)
        dataframe = self.get_distance(dataframe)
        
        # Fake, krisha.kz deleted this posts
        dataframe = dataframe.query('all_space >= 10')
        dataframe = dataframe.query('all_space >= 25 or price <= 20000000')

        return dataframe
    
    def modify_house_type(self, dataframe):
        dataframe = dataframe.copy()
        
        instance = dataframe[u'house_type'][dataframe.index[0]]
        if type(instance) != object and type(instance) != str and type(object) != unicode:
            dataframe[u'house_type'].fillna(2, inplace = True)
            return dataframe
        
        new_house_type = []
        for house_type in dataframe[u'house_type']:
            if house_type == 'кирпичный':
                new_house_type += [3]
            elif house_type == 'монолитный':
                new_house_type += [4]
            elif house_type == 'панельный':
                new_house_type += [1]
            elif house_type == 'каркасно-камышитовый':
                new_house_type += [0]
            else:
                new_house_type += [2]
            
            
        dataframe[u'house_type'] = new_house_type
        return dataframe
    
    def get_log_space(self, dataframe):
        dataframe = dataframe.copy()
        dataframe[u'log_space'] = np.log(dataframe[u'all_space']) / np.log(1.4)
        return dataframe

    def get_price_for_sqr_meter(self, dataframe):
        dataframe = dataframe.copy()
        dataframe[u'price_for_sqr_meter'] = dataframe.apply(self.calculate_price_for_sqr_meter, axis = 1)
        return dataframe

    def calculate_price_for_sqr_meter(self, x):
        return (x[u'price'] + 0.0) / x[u'all_space']
    
    def get_distance(self, dataframe):
        dataframe['distance'] = (dataframe['geocode_lat'] ** 2 + dataframe['geocode_long'] ** 2) ** 0.5
        return dataframe
    
    def learning_data(self, dataframe, part):
        dataframe = dataframe.copy()
        
        if part >= 1:
            return dataframe
        
        geocode_lat  = dataframe[u'geocode_lat']
        geocode_long = dataframe[u'geocode_long']
        
        mid_lat  = geocode_lat.sum() / len(dataframe)
        mid_long = geocode_long.sum() / len(dataframe)
        
        max_radius = 0
        for i in dataframe.index:
            dist_lat  = mid_lat - geocode_lat[i]
            dist_long = mid_long - geocode_long[i]
            max_radius = max(max_radius, (dist_lat ** 2 + dist_long ** 2) ** 0.5)
        
        percent = part ** 0.5
        inside = []
        while percent <= 1:
            inside = []
            for i in dataframe.index:
                dist_lat  = mid_lat - geocode_lat[i]
                dist_long = mid_long - geocode_long[i]
                dist = (dist_lat ** 2 + dist_long ** 2) ** 0.5
                
                if dist <= max_radius * percent:
                    inside += [i]
            if len(inside) >= len(dataframe) * part or percent == 1:
                break
            percent = min(percent + part, 1.0)
        
        random.shuffle(inside)
        chosen = []
        for i in range(0, len(inside)):
            chosen += [inside[i]]
            if i + 1 >= len(dataframe) * part:
                break
                
        return dataframe.ix[chosen]

In [556]:
class QualityMetric:
    def calculate_relative_error(self, real_data, predicted_data):
        predicted_data = predicted_data.copy()
        real_data      = real_data.copy()
        
        real      = real_data[u'price']
        predicted = predicted_data[u'price']
        
        result = 0.0
        for i in real_data.index:
            result += abs(real[i] - predicted[i] + 0.0) / real[i]
        return result
        
    def calculate_rss(self, real_data, predicted_data):
        real_data = real_data.copy()
        predicted_data = predicted_data.copy()
        
        real_prices      = real_data[u'price']
        predicted_prices = predicted_data[u'price'] 

        result = 0.0
        for i in real_data.index:
            result += (real_prices[i] - predicted_prices[i]) ** 2
        return result ** 0.5

In [557]:
class Model:
    def predict_price(self, dataframe):
        pass
    def train(self, dataframe):
        pass

In [558]:
class StupidModel(Model):
    def __init__(self, learning_data):
        pass
    
    def predict_price(self, dataframe):
        dataframe = dataframe.copy()
        
        prices = dataframe[u'price']
        spaces = dataframe[u'all_space']
    
        for i in dataframe.index:
            prices[i] = self.price_for_sqr_meter * spaces[i]
        
        dataframe[u'price'] = prices
        return dataframe
    
    def train(self, dataframe):
        dataframe = dataframe.copy()
        
        print "Training stupid model..."
        self.price_for_sqr_meter = dataframe[u'price'].sum() / dataframe[u'all_space'].sum()
        print "Finished training!"

In [559]:
class KNNModel(Model):
    def __init__(self, learning_data):
        self.given_features = ['distance', u'age', u'log_space', u'house_type', u'room_number']
        learning_data = learning_data.copy()

        print "Choosing k..."
        self.k   = self.choose_k(learning_data, 1, 1000) 
        print "Chosen k = " + str(self.k)
        
    def predict_price(self, dataframe):
        dataframe = dataframe.copy()
        dataframe = FeatureExtractor().normalize_data(dataframe)
        dataframe[u'price_for_sqr_meter'] = self.knn.predict(dataframe[self.given_features])
        
        price_for_sqr_meter  = dataframe[u'price_for_sqr_meter']
        all_space            = dataframe[u'all_space']
        price                = dataframe[u'price']
        for i in dataframe.index:
            price[i] = price_for_sqr_meter[i] * all_space[i]

        del dataframe[u'price_for_sqr_meter']
        dataframe[u'price'] = price
        return dataframe
    
    def predict_price_given_knn(self, dataframe, knn_model):
        dataframe = dataframe.copy()
        
        dataframe[u'price_for_sqr_meter'] = knn_model.predict(dataframe[[u'geocode_lat', u'geocode_long']])
        
        price_for_sqr_meter  = dataframe[u'price_for_sqr_meter']
        all_space            = dataframe[u'all_space']
        price                = dataframe[u'price']
        for i in dataframe.index:
            price[i] = price_for_sqr_meter[i] * all_space[i]

        dataframe[u'price'] = price
        return dataframe

    def train(self, dataframe):
        dataframe = dataframe.copy()

        print "Training model..."

        self.knn = KNeighborsRegressor(n_neighbors = self.k)
        self.knn.fit(dataframe[self.given_features], dataframe[[u'price_for_sqr_meter']])
        
        print "Finished training!"
        
    def choose_k(self, dataframe, low_k, high_k):
        dataframe = dataframe.copy()
        train, test = train_test_split(dataframe, test_size = 0.1)

        print "  Choosing between " + str(low_k) + ' ' + str(high_k)
        if low_k > high_k:
            return 0
        
        left  = -1
        right = -1
        ans   = -1
        
        step = (high_k - low_k + 4) / 5
        pos  = low_k + step
        prev = self.get_error_for(low_k, train, test)
        
        while pos <= high_k:
            error = self.get_error_for(pos, train, test)
            
            if ans == -1 or prev + error < ans:
                ans = prev + error
                left = pos - step
                right = pos
            
            if pos == high_k:
                break
            pos = min(pos + step, high_k)
            prev = error
        
        if step == 1:
            return left
        
        return self.choose_k(dataframe, left, right)
    
    def get_error_for(self, k, train, test):
        train = train.copy()
        test  = test.copy()
        
        knn = KNeighborsRegressor(n_neighbors = k)
        knn.fit(train[[u'geocode_lat', u'geocode_long']], train[[u'price_for_sqr_meter']])

        prediction = self.predict_price_given_knn(test, knn)
        return QualityMetric().calculate_relative_error(test, prediction)

            

In [None]:
class System:
    def __init__(self):
        self.featureExtractor = FeatureExtractor()
        self.dataframe        = pd.read_csv('../Dataset/2/train.csv')
        
        self.dataframe     = self.featureExtractor.normalize_data(self.dataframe)
        self.learning_data = self.featureExtractor.learning_data(self.dataframe, 0.1)
    
    def choose_model(self):
        self.error = -1
        
        self.try_model(StupidModel(self.learning_data))
        self.try_model(KNNModel(self.learning_data)) 
                        
    def try_model(self, model):
        train, test = train_test_split(self.learning_data, test_size = 0.1)
        model.train(train)
        print "Calculating RE for this model..."        
        prediction = model.predict_price(test)
        error = QualityMetric().calculate_relative_error(test, prediction)
        print "RE = " + str(error)
        
        if self.error == -1:
            self.error = error
            self.model = model
        elif error < self.error:
            self.error = error
            self.model = model
        
    def train_model(self):
        self.model.train(self.dataframe)

In [None]:
system = System()
system.choose_model()
system.train_model()


Training stupid model...
Finished training!
Calculating RE for this model...
RE = 49.5580246907
Choosing k...
  Choosing between 1 1000


In [None]:
df = pd.read_csv('../Dataset/2/train.csv')
for col in df:
    new_col = []
    for elem in df[col]:
        if type(elem) == str:
            new_col += [unicode(elem, encoding = 'utf-8')]
        else:
            new_col += [elem]
    df[col] = new_col
    
output = system.model.predict_price(df[0:10])

print str(output[u'price'][output.index[0]])
# for i in output.index:
#     print df[u'price'][i].encode('utf-8') + ' ' + str(output[u'price'][i])

In [None]:
print system.dataframe.dtypes

In [None]:
var = u'room_number'
var1 = u'price_for_sqr_meter'
data = pd.concat([system.dataframe[var1], system.dataframe[var]], axis=1)
data.plot.scatter(x = var, y = var1, xlim = [0, 12]);

In [None]:
sum_my = 0
sum_all = 0

cnt_my = 0
cnt_all = 0
Floors = []
for floor in system.dataframe[u'floor']:
    if type(floor) == str:
        if len(floor.split(" из ")) == 2:
            sum_my += int(floor.split(" из ")[0])
            sum_all += int(floor.split(" из ")[1])
            cnt_my += 1
            cnt_all += 1
        else:
            sum_my += int(floor)
            cnt_my += 1
    else:
        pass

print (sum_my + 0.0) / cnt_my
print (sum_all + 0.0) / cnt_all

In [None]:
var1 = 'geocode_long'
var2 = 'geocode_lat'

data = pd.concat([system.dataframe[var1], system.dataframe[var2]], axis=1)
data.plot.scatter(x=var1, y=var2, color = (0, 1, 0), s = 15, xlim = [76.75, 77.05], ylim = [43.10, 43.45]);

In [None]:
df = system.dataframe

line = plt.figure()

var = 'price'

min_price_for_sqr_meter = 1e9
max_price_for_sqr_meter = 0
sum_price_for_sqr_meter = 0
cnt_price_for_sqr_meter = 0

for price_for_sqr_meter in df[var]:
    min_price_for_sqr_meter = min(min_price_for_sqr_meter, price_for_sqr_meter)
    max_price_for_sqr_meter = max(max_price_for_sqr_meter, price_for_sqr_meter)
    
    sum_price_for_sqr_meter += price_for_sqr_meter
    cnt_price_for_sqr_meter += 1

mid_price_for_sqr_meter = (sum_price_for_sqr_meter + 0.0) / cnt_price_for_sqr_meter
min_price_for_sqr_meter = mid_price_for_sqr_meter * 0.5
max_price_for_sqr_meter = mid_price_for_sqr_meter * 1.5

price_for_sqr_meter = df[var]
geocode_lat         = df['geocode_lat']
geocode_long        = df['geocode_long']

plt.scatter(76.75, 43.10, s = 1)
plt.scatter(77.05, 43.10, s = 1)
plt.scatter(77.05, 43.45, s = 1)
plt.scatter(76.75, 43.45, s = 1)

for i in df.index:
    clr = (price_for_sqr_meter[i] - min_price_for_sqr_meter + 0.0) / (max_price_for_sqr_meter - min_price_for_sqr_meter)
    if clr >= 0 and clr <= 1:
        plt.scatter(geocode_long[i], geocode_lat[i], color = (clr, 1.0 - clr, 0), s = 5)

#     if i > 1000:
#         break
plt.show()

    