Modulok Importálása

In [None]:
#Márton Zétény Péter ZHN7GV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

Adatok betöltése és felosztása

In [None]:
# Load melb_data.csv from the Machine_Learning_2024 GitHub repository
# (requires network access).
melb_data = pd.read_csv(
    "https://raw.githubusercontent.com/karsarobert/Machine_Learning_2024/main/melb_data.csv"
)

# Target vector: the "Price" column.
y = melb_data["Price"]

# Feature matrix: everything except the target, restricted to non-object
# (i.e. non-string) columns.
melb_x_data = melb_data.drop(columns=["Price"])
x = melb_x_data.select_dtypes(exclude=["object"])

# Show the feature frame as the cell output.
x

Unnamed: 0.1,Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
0,1,2,2.5,3067.0,2.0,1.0,1.0,202.0,,,-37.79960,144.99840,4019.0
1,2,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,-37.80790,144.99340,4019.0
2,4,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,-37.80930,144.99440,4019.0
3,5,3,2.5,3067.0,3.0,2.0,1.0,94.0,,,-37.79690,144.99690,4019.0
4,6,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,-37.80720,144.99410,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18391,23540,2,6.8,3016.0,2.0,2.0,1.0,,89.0,2010.0,-37.86393,144.90484,6380.0
18392,23541,4,6.8,3016.0,4.0,1.0,5.0,866.0,157.0,1920.0,-37.85908,144.89299,6380.0
18393,23544,4,12.7,3085.0,4.0,3.0,2.0,,,,-37.72006,145.10547,1369.0
18394,23545,4,6.3,3013.0,4.0,1.0,1.0,362.0,112.0,1920.0,-37.81188,144.88449,6543.0


Adatok szétvágása 80-20% arányban

In [None]:
# 80% of the rows go to training, 20% to validation; the fixed random_state
# makes the split reproducible across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Numerical feature columns: those stored as 64-bit integers or floats.
numerical_cols = [
    col for col in x_train.columns
    if x_train[col].dtype in ["int64", "float64"]
]

# Low-cardinality categorical columns: object dtype with fewer than 10
# distinct values.
# NOTE(review): x_train is split from a frame that already excluded object
# columns, so this list is expected to be empty (the recorded output confirms it).
object_cols = [
    col for col in x_train.columns
    if x_train[col].nunique() < 10 and x_train[col].dtype == "object"
]

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {object_cols}")

# Keep only the selected columns; copy so later edits do not touch the split frames.
my_columns = numerical_cols + object_cols
x_train_ = x_train[my_columns].copy()
x_valid_ = x_valid[my_columns].copy()

# Fix: the original interpolated the builtin `len` itself instead of the
# validation set length.
print(f"x train dataset length: {len(x_train_)}, x validation dataset length: {len(x_valid_)}")

# Show the prepared training features as the cell output.
x_train_

Numerical columns: ['Unnamed: 0', 'Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
Categorical columns: []
x train dataset length: 14716, x validation dataset length: 3680


Unnamed: 0.1,Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
2573,3349,4,7.8,3058.0,4.0,2.0,1.0,381.0,,1938.0,-37.73370,144.95480,11204.0
2091,2686,3,7.8,3124.0,3.0,1.0,1.0,544.0,160.0,1930.0,-37.84360,145.05810,8920.0
4683,6065,2,5.6,3101.0,2.0,1.0,1.0,121.0,,,-37.81260,145.05340,10331.0
8832,11346,3,7.5,3123.0,3.0,2.0,2.0,200.0,,,-37.83960,145.05140,6482.0
10469,13474,2,4.5,3181.0,2.0,1.0,1.0,2842.0,84.0,1920.0,-37.85130,144.99430,7717.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,11849,2,13.9,3020.0,3.0,1.0,1.0,199.0,,,-37.77610,144.81640,2185.0
13123,16889,2,7.2,3184.0,2.0,1.0,1.0,0.0,,,-37.88945,144.99015,8989.0
9845,12649,3,7.8,3058.0,3.0,2.0,2.0,500.0,148.0,1900.0,-37.73820,144.96530,11204.0
10799,13887,3,8.2,3012.0,3.0,1.0,0.0,370.0,,,-37.79160,144.87150,5058.0


Imputálás és előfeldolgozás

In [None]:
# Numerical features: replace missing values with the column mean.
# (The previous comment said "median", which did not match strategy="mean".)
num_imputer = SimpleImputer(strategy="mean")

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means categories not seen
# during fit do not raise at transform time.
cat_imputer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its own preprocessing step. Columns listed in
# neither group are dropped (ColumnTransformer's default remainder='drop').
preprocess = ColumnTransformer(
    transformers=[
        ('numerical', num_imputer, numerical_cols),
        ('categorical', cat_imputer, object_cols),
    ]
)

Modell elkészítése és tanítása

In [None]:
# Plain least-squares linear regression as the estimator.
model = LinearRegression()

# Chain preprocessing and the estimator so imputation/encoding statistics are
# learned from the training data only and re-applied at prediction time.
# NOTE(review): the step name 'preporocessor' is misspelled; it is kept as-is
# because renaming it would change the key exposed via pipeline.named_steps.
pipeline = Pipeline(steps=[
    ('preporocessor', preprocess),
    ('model', model),
])

pipeline.fit(x_train_, y_train)

# Predictions on the training set.
train_predict = pipeline.predict(x_train_)

# Predictions on the validation set. Fix: use the prepared frame x_valid_
# (same column selection as x_train_) rather than the raw split x_valid.
valid_predict = pipeline.predict(x_valid_)

Kiértékelés


In [None]:
# Score the validation predictions: mean absolute error (same units as the
# target) and the coefficient of determination R^2.
mae = mean_absolute_error(y_valid, valid_predict)
r2 = r2_score(y_valid, valid_predict)

# Fix: the label previously read "MEA"; the metric is the mean absolute error (MAE).
print(f"MAE: {mae}")
print(f"R^2: {r2}")

MEA: 309328.05982244835
R^2: 0.48778933270513547
