In [33]:
def train(norm=False):    
    
    import os
    import warnings
    import sys

    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.linear_model import LinearRegression
    from urllib.parse import urlparse
    import mlflow
    import mlflow.sklearn

    import logging

    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)

    train_mean = pd.read_csv('../data/train_mean.csv')
    test_mean = pd.read_csv('../data/test_mean.csv')

    onehot_encoder = OneHotEncoder(sparse=False)

    train_ohe = pd.get_dummies(train_mean, 
                           columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                    'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                    'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                    'traffic_rating','secure_rating','life_price_rating','metro_station',
                                    'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                    'num_of_hospitals','num_of_dentists','num_of_women_cons'])

    test_ohe = pd.get_dummies(test_mean, 
                          columns=['flat_type', 'object_type', 'rooms', 'build_matireal',
                                    'district_rating','district','underground','eco_rating','clear_rating','gkh_rating',
                                    'neighbor_rating','kids_rating','sport_rest_rating','shop_rating',
                                    'traffic_rating','secure_rating','life_price_rating','metro_station',
                                    'num_of_metro_stations','num_of_kindg','num_of_schools','num_of_poly',
                                    'num_of_hospitals','num_of_dentists','num_of_women_cons'])

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2


    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the wine-quality csv file from the URL
    data = train_ohe

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["price", 'page', 'description'], axis=1)
    test_x = test.drop(["price", 'page', 'description'], axis=1)
    train_y = train[["price"]]
    test_y = test[["price"]]

    #     alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    #     l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = LinearRegression(normalize=norm)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Regression model (normalize=%f):" % (norm))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("Normalize", norm)
    #         mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)

        mlflow.sklearn.log_model(lr, "model")

In [34]:
train(norm=False)

Regression model (normalize=0.000000):
  RMSE: 39961770.56489456
  MAE: 2422790.764969978
  R2: 0.05190448486572241


In [35]:
train(norm=True)

Regression model (normalize=1.000000):
  RMSE: 2.8588694377626772e+16
  MAE: 206865950334750.84
  R2: -4.852340760141219e+17
