In [3]:
import pandas as pd
import pdb
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.externals import joblib
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import chi2, f_regression
from sklearn.linear_model import LassoLarsCV, Ridge, RidgeCV, LassoCV, Lasso, LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import export_graphviz
import logging
import scipy
import gc
from multiprocessing import Pool
from collections import defaultdict
import os
from scikit.helper import generate_matrix, ape, mape, mdape, gen_subplots, plot, train_statistics
import json
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, load_only, Load
from models import Advertisement, Municipality, ObjectType
from scikit.combined_ensemble import CombinedEnsemble
from scikit import combined_ensemble as combined_ensemble
import sys
# Make the already-imported scikit.combined_ensemble module reachable under the
# bare top-level name "combined_ensemble" as well.
# NOTE(review): presumably required so that pickled models which reference the
# module by its top-level name can be resolved on load — confirm.
sys.modules["combined_ensemble"] = sys.modules["scikit.combined_ensemble"]

# Send INFO-and-above records to jupyter.log, prefixed with timestamp and level.
logging.basicConfig(
    filename='jupyter.log',
    format='%(asctime)s: %(levelname)s - %(message)s',
    level=logging.INFO,
)

from scikit.train_pipeline import TrainPipeline

def dpathes_to_hash(dpathes):
    """Return one hash per row of ``dpathes``.

    Each row is reduced to the set of its non-zero column indices (the second
    array returned by ``row.nonzero()``), and that frozen set is hashed. Rows
    with the same non-zero columns therefore get the same hash.

    NOTE(review): presumably ``dpathes`` is a sparse decision-path matrix whose
    iteration yields 2-D single-row slices — confirm against the caller.
    """
    hashes = []
    for row in dpathes:
        nonzero_columns = row.nonzero()[1]
        hashes.append(hash(frozenset(nonzero_columns)))
    return hashes

In [4]:
# Load the persisted estimator and the object stored in ads_transformed.pkl,
# then read the raw advertisements CSV (first column used as the index).
# NOTE: joblib.load unpickles arbitrary objects — only load files you trust.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'pandas.indexes'"; presumably the
# pickles were written with an older pandas and need to be regenerated — confirm.
model = joblib.load('scikit/models/extraTree.pkl')
ads_transformed = joblib.load('scikit/ads_transformed.pkl')

ads = pd.read_csv('scikit/advertisements.csv', index_col=0, engine='c')


ModuleNotFoundError: No module named 'pandas.indexes'

In [None]:
# Absolute path of the scikit/ directory (the folder containing main.py),
# resolved relative to the current working directory.
DIRECTORY = os.path.dirname(os.path.abspath("./scikit/main.py"))

# Read settings.json through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(DIRECTORY, 'settings.json')) as settings_file:
    settings = json.load(settings_file)

# Pipeline configured for the "price" target with the loaded settings.
p = TrainPipeline("price", settings, DIRECTORY)


In [4]:
# Keep only the first 1000 rows, then feed them through every preparation
# step of the pipeline in order, logging each step's function name.
ads = ads[:1000]
for step in p.preparation_pipeline:
    logging.info("Apply transformation: {}".format(step.__name__))
    ads = step(ads)

# Nothing survived the preparation steps.
if not len(ads):
    print("Error: Input did not meet our standards.")

# Run the pipeline's outlier detection on what is left.
ads = p.outlier_detection(ads)
if not len(ads):
    print("Error: Input data is an outlier!")


In [5]:
# Column labels of the freshly prepared data vs. those of the stored
# transformed data.
col_new = set(ads)
col_exist = set(ads_transformed)

# Report any column that the prepared data has but the stored data lacks.
unknown_cols = col_new - col_exist
if unknown_cols:
    print("Error: there are input columns which are not in the trained model: {}".format(unknown_cols))



In [6]:
# Snapshot of the column labels currently present in `ads`.
col_plain = set(ads)

In [7]:
# Columns present in the stored transformed data but absent from `ads`.
missing_cols = list(col_exist - col_new)

# Append those columns to a copy of `ads` and fill them with zeros.
filled_ads = ads.join(pd.DataFrame(columns=missing_cols))
filled_ads[missing_cols] = 0

AttributeError: type object 'TrainPipeline' has no attribute '__package__'

In [8]:

def load_additional(municipality_id, otype_id):
    """Load municipality and object-type attributes as one DataFrame.

    Connects to the database named by the ``DATABASE_URL`` environment
    variable, selects the ``Municipality`` with ``id == municipality_id`` and
    the ``ObjectType`` with ``id == otype_id`` (restricted to the columns
    listed below), joins the two result frames on their index, drops the id
    columns and renames the table-prefixed labels to the short feature names.

    Args:
        municipality_id: primary key of the municipality to look up.
        otype_id: primary key of the object type to look up.

    Returns:
        pandas.DataFrame with the renamed municipality columns plus
        ``otype`` and ``ogroup``.
    """
    engine = create_engine(os.environ.get('DATABASE_URL', None))
    session = sessionmaker(bind=engine)()

    try:
        m_stmt = session.query(Municipality).filter_by(id=municipality_id).options(
            Load(Municipality).load_only(
                "name",
                "canton_id",
                "district_id",
                "mountain_region_id",
                "language_region_id",
                "job_market_region_id",
                "agglomeration_id",
                "metropole_region_id",
                "tourism_region_id",
                "is_town",
                "noise_level",
                "urban_character_id",
                "steuerfuss_gde",
                "steuerfuss_kanton",
                "degurba_id",
                "planning_region_id",
                "ase",
                "greater_region_id",
                "ms_region_id",
                "municipal_size_class_id",
                "agglomeration_size_class_id",
                "municipal_type22_id",
                "municipal_type9_id")
        ).with_labels().statement

        o_stmt = session.query(ObjectType).filter_by(id=otype_id).options(
            Load(ObjectType).load_only("name", "grouping")
        ).with_labels().statement

        # Materialise both results while the connection is still open.
        municipality = pd.read_sql_query(m_stmt, session.bind)
        object_type = pd.read_sql_query(o_stmt, session.bind)
    finally:
        # The engine is created per call, so release the session and the
        # engine's pooled connections here; otherwise every call leaks them.
        session.close()
        engine.dispose()

    return municipality \
        .join(object_type) \
        .drop(['municipalities_id', 'object_types_id'], axis=1) \
        .rename(columns={'municipalities_name': 'municipality',
                         'municipalities_canton_id': 'canton_id',
                         'municipalities_district_id': 'district_id',
                         'municipalities_planning_region_id': 'planning_region_id',
                         'municipalities_mountain_region_id': 'mountain_region_id',
                         'municipalities_ase': 'ase',
                         'municipalities_greater_region_id': 'greater_region_id',
                         'municipalities_language_region_id': 'language_region_id',
                         'municipalities_ms_region_id': 'ms_region_id',
                         'municipalities_job_market_region_id': 'job_market_region_id',
                         'municipalities_agglomeration_id': 'agglomeration_id',
                         'municipalities_metropole_region_id': 'metropole_region_id',
                         'municipalities_tourism_region_id': 'tourism_region_id',
                         'municipalities_municipal_size_class_id': 'municipal_size_class_id',
                         'municipalities_urban_character_id': 'urban_character_id',
                         'municipalities_agglomeration_size_class_id': 'agglomeration_size_class_id',
                         'municipalities_is_town': 'is_town',
                         'municipalities_degurba_id': 'degurba_id',
                         'municipalities_municipal_type22_id': 'municipal_type22_id',
                         'municipalities_municipal_type9_id': 'municipal_type9_id',
                         # Kept distinct from a per-advertisement noise_level.
                         'municipalities_noise_level': 'm_noise_level',
                         'municipalities_steuerfuss_gde': 'steuerfuss_gde',
                         'municipalities_steuerfuss_kanton': 'steuerfuss_kanton',
                         'object_types_name': 'otype',
                         'object_types_grouping': 'ogroup'})

# Sample lookup: municipality id 1, object type id 30.
row = load_additional(1, 30)

# Advertisement columns that are deliberately left out of the comparison.
unwanted_cols = [
    'cubature', 'room_height', 'effective_area',
    'plot_area', 'longitude', 'latitude',
    'floor', 'num_floors', 'crawler',
]

# Columns of `ads` that come neither from the lookup row nor from the
# unwanted list (displayed as the cell result).
col_plain.difference(row, unwanted_cols)



{'agglomeration_id_0',
 'agglomeration_id_1061',
 'agglomeration_id_121',
 'agglomeration_id_1344',
 'agglomeration_id_1711',
 'agglomeration_id_2125',
 'agglomeration_id_2196',
 'agglomeration_id_230',
 'agglomeration_id_2581',
 'agglomeration_id_2601',
 'agglomeration_id_261',
 'agglomeration_id_2701',
 'agglomeration_id_2939',
 'agglomeration_id_3203',
 'agglomeration_id_3231',
 'agglomeration_id_3271',
 'agglomeration_id_3336',
 'agglomeration_id_3425',
 'agglomeration_id_351',
 'agglomeration_id_371',
 'agglomeration_id_3787',
 'agglomeration_id_3901',
 'agglomeration_id_4001',
 'agglomeration_id_4021',
 'agglomeration_id_4082',
 'agglomeration_id_4201',
 'agglomeration_id_4401',
 'agglomeration_id_4436',
 'agglomeration_id_4566',
 'agglomeration_id_4671',
 'agglomeration_id_5002',
 'agglomeration_id_5113',
 'agglomeration_id_5192',
 'agglomeration_id_5250',
 'agglomeration_id_5586',
 'agglomeration_id_581',
 'agglomeration_id_5890',
 'agglomeration_id_6002',
 'agglomeration_id_62

In [9]:
"""
user Input:

  living_area
  num_rooms
  build_year
  last_renovation_year

  otype_id

  street 
  municipality_id

longitude, latitude --> look up from the address
noise_level --> look up

tags --> extend / offer a form???
"""

# Sample user input holding the fields listed in the note above
# (living area, rooms, years, object type, street, municipality, tags).
in_params = {
    'living_area': 120, 
    'num_rooms': 4, 
    'otype_id': 30, 
    'street': "Paradeplatz", 
    'municipality_id': 1, 
    'build_year': 1900, 
    'last_renovation_year': 1990, 
    'tags': []
}

def get_noise_level(long, lat):
    """Placeholder for a noise-level lookup at the given coordinate.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    return None

def get_long_lat(street, municipality_id):
    """Placeholder for resolving an address to ``(longitude, latitude)``.

    Not implemented yet: both arguments are ignored and ``(None, None)`` is
    returned.
    """
    longitude = latitude = None
    return longitude, latitude

def _read_param(parameters, name):
    """Fetch ``name`` from ``parameters``: by key for mappings, by attribute otherwise."""
    from collections.abc import Mapping

    if isinstance(parameters, Mapping):
        return parameters[name]
    return getattr(parameters, name)


def user_input_to_df(parameters):
    """Turn one set of user-supplied property parameters into a one-row DataFrame.

    Args:
        parameters: either a mapping (e.g. a plain dict) or an object with
            attributes, providing ``living_area``, ``num_rooms``,
            ``build_year``, ``last_renovation_year``, ``tags``, ``street``,
            ``municipality_id`` and ``otype_id``.

    Returns:
        pandas.DataFrame with the user-supplied fields and ``noise_level``,
        joined with the frame returned by ``load_additional`` for the given
        municipality and object type. When the noise-level lookup yields
        ``None``, ``noise_level`` is filled from the ``m_noise_level`` column.
    """
    municipality_id = _read_param(parameters, 'municipality_id')

    # Resolve the address to coordinates, then the coordinates to a noise level.
    long, lat = get_long_lat(_read_param(parameters, 'street'), municipality_id)
    noise_level = get_noise_level(long, lat)

    df = pd.DataFrame([{
        'living_area': _read_param(parameters, 'living_area'),
        'num_rooms': _read_param(parameters, 'num_rooms'),
        'build_year': _read_param(parameters, 'build_year'),
        'last_renovation_year': _read_param(parameters, 'last_renovation_year'),
        'tags': _read_param(parameters, 'tags'),
        'noise_level': noise_level,
    }]).join(load_additional(municipality_id, _read_param(parameters, 'otype_id')))

    # No address-level value available: fall back to the municipality's value.
    if noise_level is None:
        df['noise_level'] = df['m_noise_level']

    return df