In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Path of the CSV dataset that is passed to analyse_and_fit_lrm at the bottom
# of this cell.
PATH = "./realest.csv"

class AnalysisDataAndFitLinearRegression:
    """Summarise a housing dataset and fit a linear regression for ``Price``.

    The dataset is read from a CSV file and must provide the columns used
    below: ``Price``, ``Bedroom``, ``Bathroom``, ``Space``, ``Lot`` and
    ``Tax``. Every column other than ``Price`` is used as a regression
    feature.
    """

    # Name of the regression target; all remaining columns are features.
    _TARGET = "Price"

    # Feature values of the single observation whose price is predicted.
    # They are positional: one value per feature column, in the order in
    # which the features appear in the dataset.
    _PREDICTION_VALUES = (3, 1500, 8, 40, 1000, 2, 1, 0)

    def __init__(self):
        # Version tag; no method of this class reads it.
        self.version = 1

    def analyse_and_fit_lrm(self, path):
        """Load the dataset, compute summary figures and fit the regression.

        Args:
            path: Location of the CSV file, forwarded to ``pd.read_csv``.

        Returns:
            A dict with two entries:

            * ``'summary_dict'``: ``'statistics'`` (mean, standard
              deviation, median, max and min of ``Tax`` for rows with 4
              bedrooms and 2 bathrooms), ``'data_frame'`` (rows with
              ``Space > 800`` sorted by descending ``Price``) and
              ``'number_of_observations'`` (rows whose ``Lot`` exceeds the
              0.8 quantile of ``Lot``).
            * ``'regression_dict'``: ``'model_parameters'`` (intercept and
              one coefficient per feature) and ``'price_prediction'``.
        """
        data = pd.read_csv(path)

        # The summary uses the raw data; only the regression drops
        # incomplete rows.
        summary_dict = self._summarise(data)
        regression_dict = self._fit_price_model(self.__listwise_deletion(data))

        return {
            'summary_dict': summary_dict,
            'regression_dict': regression_dict
        }

    @staticmethod
    def _summarise(data: pd.DataFrame) -> dict:
        """Compute the descriptive part of the analysis on the raw data."""
        # Tax values of the 4-bedroom, 2-bathroom houses.
        tax = data.loc[(data['Bedroom'] == 4) & (data['Bathroom'] == 2), 'Tax']
        statistics = [
            tax.mean(),
            tax.std(),
            tax.median(),
            tax.max(),
            tax.min()
        ]

        data_frame = data[data['Space'] > 800].sort_values(
            by='Price', ascending=False
        )

        # The 4th 5-quantile is the 0.8 quantile; count rows strictly above it.
        threshold = data['Lot'].quantile(q=4 / 5)
        number_of_observations = int((data['Lot'] > threshold).sum())

        return {
            'statistics': statistics,
            'data_frame': data_frame,
            'number_of_observations': number_of_observations
        }

    @classmethod
    def _fit_price_model(cls, data_lr: pd.DataFrame) -> dict:
        """Fit ``Price`` on all other columns and predict one observation."""
        # Features are picked by name, so nothing below may assume that the
        # target is the first column.
        x = data_lr.loc[:, data_lr.columns != cls._TARGET]
        y = data_lr[[cls._TARGET]]

        lr_model = LinearRegression()
        lr_model.fit(x, y)

        # y is two-dimensional, so intercept_ is a length-1 array and coef_
        # holds a single row with one coefficient per feature column.
        model_parameters = {
            'Intercept': lr_model.intercept_,
        }
        # Label each coefficient with the feature column it was fitted on.
        model_parameters.update(zip(x.columns, lr_model.coef_[0]))

        # Hand predict() a frame carrying the fitted feature names; a bare
        # array makes scikit-learn warn that X has no valid feature names.
        observation = pd.DataFrame(
            [list(cls._PREDICTION_VALUES)], columns=x.columns
        )
        price_prediction = lr_model.predict(observation)[0][0]

        return {
            'model_parameters': model_parameters,
            'price_prediction': price_prediction
        }

    def __listwise_deletion(self, data: pd.DataFrame):
        """Return ``data`` without the rows that contain a missing value."""
        return data.dropna()


# Run the whole analysis on PATH. The call is the last expression of the
# notebook cell, so its returned dictionary is what gets displayed.
a = AnalysisDataAndFitLinearRegression()
a.analyse_and_fit_lrm(PATH)


  "X does not have valid feature names, but"


{'regression_dict': {'model_parameters': {'Bathroom': 6.375785992212336,
   'Bedroom': -3.1602387359607067,
   'Condition': 1.8357953743202877,
   'Garage': 4.144757426848795,
   'Intercept': array([19.99346773]),
   'Lot': 0.23621323117136808,
   'Room': 1.6889511073197694,
   'Space': 0.00966322779633439,
   'Tax': 0.004412718519150623},
  'price_prediction': 69.27677925052453},
 'summary_dict': {'data_frame':      Price  Bedroom   Space  Room   Lot     Tax  Bathroom  Garage  Condition
  114   90.0      8.0  2293.0  12.0  50.0  1181.0       3.0     2.0        0.0
  140   88.0      7.0  2277.0  12.0  50.0  1248.0       3.0     2.0        0.0
  36    88.0      8.0  2278.0  12.0  50.0  1183.0       3.0     2.0        0.0
  88    88.0      8.0  2228.0  12.0  50.0  1208.0       3.0     2.0        0.0
  62    85.0      7.0  2295.0  12.0  50.0  1233.0       3.0     2.0        0.0
  ..     ...      ...     ...   ...   ...     ...       ...     ...        ...
  45    35.0      5.0  1142.0   7