In [22]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from loguru import logger


sns.set_theme(style="dark")

import warnings
warnings.filterwarnings("ignore")

In [23]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [48]:
def load_df(file_path):
    try:
        df = pd.read_csv(file_path)
        logger.info(df.head(2))
        logger.info('dataframe is read!')
        return df

    except Exception as e:
        print(e)

def clean_df_func(df):
    
    # Remove duplicates
    if df.duplicated().sum() != 0:
        logger.info(f'number of duplicates before drop: {df.duplicated().shape[0]}')
        df.drop_duplicats().reset_index(inplace=True, drop=True)
        logger.info(f'number of duplicates after drop: {df.duplicated().shape[0]}')

    if df.isna().sum().sum() != 0:
        logger.info(f'number of missing values before drop: {df.isna().sum()}')
        df = df.dropna().reset_index(drop=True)
        logger.info(f'number of missing values after drop: {df.isna().sum()}')

    le = LabelEncoder()
    df['ocean_proximity'] = le.fit_transform(df['ocean_proximity'])

    return df

def train_test_split_func(df):
    
    selected_features = df[[
        'latitude',
        'housing_median_age',
        'total_bedrooms',
        'population',
        'households',
        'median_income',
        'median_house_value',
        'ocean_proximity']]
    
    X = df.drop(columns='median_house_value', axis=0).values
    y = df[['median_house_value']].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    return X_train, X_test, y_train, y_test
    

    
def build_model(df,X_train, X_test, y_train, y_test):

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_pred = linreg.predict(X_test)
    logger.info('Got predicted y')
    logger.info(f'r2 score is:{linreg.score(X_test, y_test)}')
    return linreg
        
def prediction_houseprice(model, unseen_data):
    return model.predict(unseen_data)    

In [49]:
if __name__ =="__main__":
    df = load_df('housing.csv')
    clean_df = clean_df_func(df)
    X_train, X_test, y_train, y_test = train_test_split_func(clean_df)
    model = build_model(df,X_train, X_test, y_train, y_test)
    predicted_price = prediction_houseprice(model, unseen_data)

[32m2024-02-09 15:25:22.635[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_df[0m:[36m4[0m - [1m   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  [0m
[32m2024-02-09 15:25:22.645[0m | [1mINFO    [0m | [36m__main__[0m:[36mload_df[0m:[36m5[0m - [1mdataframe is read![0m
[32m2024-02-09 15:25:22.721[0m | [1mINFO    [0m | [36m__main__[0m:[36mclean_df_func[0m:[36m20[0m - [1mnumber of missing values before drop: longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population       

NameError: name 'unseen_data' is not defined

In [34]:
model

In [37]:
df.sample(1).values

array([[-118.18, 33.77, 37.0, 2653.0, 754.0, 1087.0, 698.0, 2.3523,
        325000.0, 'NEAR OCEAN']], dtype=object)

In [47]:
def prediction_houseprice(model, unseen_data):
    return model.predict(unseen_data)
    

In [42]:
df.nunique()

longitude               844
latitude                862
housing_median_age       52
total_rooms            5926
total_bedrooms         1923
population             3888
households             1815
median_income         12928
median_house_value     3842
ocean_proximity           5
dtype: int64

In [43]:
df['ocean_proximity'].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [45]:
model.predict([[-118.18, 33.77, 37.0, 2653.0, 754.0, 1087.0, 698.0, 2.3523, 3]])

array([[219918.8889114]])