In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import boxcox1p
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV , KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training file; it holds the features and the SalePrice target.
train_data=pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Preview the first rows of the raw training data.
train_data.head()

In [None]:
# Remove the Id and Utilities columns. Reassigning instead of using inplace=True,
# together with errors='ignore', keeps this cell safe to re-run: the original
# in-place drop raised KeyError on a second execution because the columns were gone.
train_data = train_data.drop(columns=['Id', 'Utilities'], errors='ignore')

In [None]:
# Separate the predictors from the SalePrice target.
X = train_data.drop(columns='SalePrice')
y = train_data['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2,
    random_state=74,
    shuffle=True,
)

In [None]:
# Distribution of the target in the training split, before any transformation.
y_train.hist()

In [None]:
# Log-transform the training target. It is derived from the untouched `y` (looked up
# by the training split's index) rather than from `y_train` itself, so re-running this
# cell no longer applies log1p on top of an already transformed series.
# Relies on the frame's index being unique (read_csv's default RangeIndex is).
y_train = np.log1p(y.loc[y_train.index])
y_train.hist()

In [None]:
def draw_null_ratio(data):
    """Bar-plot the share of missing values for each column that has any.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected for NaN values.

    Columns with a ratio of exactly zero are left out; the remaining columns are
    drawn from the highest ratio to the lowest on a new 8x4 figure.
    """
    missing_share = data.isna().sum() / len(data)
    missing_share = missing_share.loc[missing_share.ne(0)].sort_values(ascending=False)

    _, _ = plt.subplots(figsize=(8, 4))
    # Column names are long, so the tick labels are turned vertical.
    plt.xticks(rotation=90)
    sns.barplot(x=missing_share.index, y=missing_share.values)

# Missing-value ratios of the loaded training file, before any imputation.
draw_null_ratio(train_data)


In [None]:
# Mean LotFrontage per Neighborhood, computed from the training split only; it is the
# lookup table later used to impute missing LotFrontage values.
lotfrontage_mean = X_train.groupby('Neighborhood')['LotFrontage'].mean()
def fill_missing_values(data, lotfrontage_mean):
    """Return a copy of ``data`` with the known missing-value columns imputed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the house-price feature columns handled below.
    lotfrontage_mean : pandas.Series or mapping
        Mean ``LotFrontage`` keyed by ``Neighborhood`` value.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.

    Imputation rules
    ----------------
    * Listed categorical columns: NaN -> ``'No'``.
    * ``GarageYrBlt`` and ``MasVnrArea``: NaN -> ``0``.
    * ``Electrical``: NaN -> most frequent value found in ``data`` itself.
    * ``LotFrontage``: NaN -> the neighborhood's mean from ``lotfrontage_mean``.
      If the neighborhood is missing from the lookup (or its mean is NaN), the
      average of the available neighborhood means is used instead of raising.
    """
    fill_with_no = [
        'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'MasVnrType', 'FireplaceQu',
        'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
        'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    ]
    fill_with_zero = ['GarageYrBlt', 'MasVnrArea']

    # Work on a copy so the function is side-effect free and safe to call repeatedly.
    data = data.copy()

    for col in fill_with_no:
        data[col] = data[col].fillna('No')
    for col in fill_with_zero:
        data[col] = data[col].fillna(0)

    # mode() is empty when the column is entirely NaN; in that case leave it as is.
    electrical_mode = data['Electrical'].mode()
    if not electrical_mode.empty:
        data['Electrical'] = data['Electrical'].fillna(electrical_mode[0])

    # Vectorised lookup: neighborhoods absent from the table map to NaN here
    # and are covered by the overall fallback below.
    lookup = pd.Series(lotfrontage_mean, dtype=float)
    per_neighborhood = data['Neighborhood'].map(lookup)
    data['LotFrontage'] = (
        data['LotFrontage']
        .fillna(per_neighborhood)
        .fillna(lookup.mean())
    )
    return data


In [None]:
# X_train=fill_missing_values(X_train)
# X_train.isna().sum()

In [None]:
# Show every column when displaying frames; this option stays in effect for all later cells.
pd.set_option('display.max_columns', None)
X_train.head()

In [None]:
# MSSubClass, YrSold and MoSold are stored as integers but are handled as categories,
# so they are converted to strings.
def numeical_to_categorical(data):
    """Return a copy of ``data`` with the integer-coded category columns cast to ``str``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``MSSubClass``, ``YrSold`` and ``MoSold`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.
    """
    numerical_column = ['MSSubClass', 'YrSold', 'MoSold']
    # DataFrame.astype returns a new frame, so the input is never mutated.
    return data.astype(dict.fromkeys(numerical_column, str))

# X_train=numeical_to_categorical(X_train)


In [None]:
def encoding_order_features(data):
    """Return a copy of ``data`` with the ordered features encoded as integers.

    Quality-scale columns are mapped with an explicit worst-to-best order
    (``No`` < ``Po`` < ``Fa`` < ``TA`` < ``Gd`` < ``Ex`` -> 0..5). The previous
    ``LabelEncoder`` sorted the labels alphabetically (Ex=0, Fa=1, Gd=2, No=3,
    Po=4, TA=5), which did not reflect the quality order.

    The remaining columns are replaced by the rank of each value among the sorted
    distinct values present in ``data`` (the same codes ``LabelEncoder`` produced).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns listed below, without missing values in them.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified.

    Raises
    ------
    ValueError
        If a quality column holds a value outside the known scale (NaN included).
    """
    quality_order = ['No', 'Po', 'Fa', 'TA', 'Gd', 'Ex']
    quality_codes = {level: code for code, level in enumerate(quality_order)}
    quality_cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
                    'ExterQual', 'ExterCond', 'HeatingQC', 'PoolQC']
    ranked_cols = ['MSSubClass', 'OverallCond', 'OverallQual', 'YrSold', 'MoSold']

    data = data.copy()

    for col in quality_cols:
        unknown = set(data[col].unique()) - set(quality_codes)
        if unknown:
            raise ValueError(
                f"Column {col!r} contains labels outside the quality scale: "
                f"{sorted(map(str, unknown))}"
            )
        data[col] = data[col].map(quality_codes).astype(int)

    # NOTE(review): columns that arrive here as strings are ranked lexicographically
    # (e.g. '10' sorts before '2'); kept as-is to preserve the existing encoding.
    for col in ranked_cols:
        codes, _ = pd.factorize(data[col], sort=True)
        data[col] = codes

    return data

# X_train=encoding_order_features(X_train)
# X_train

In [None]:
# One histogram per numeric column of the training split.
X_train.hist(figsize=(10, 8), bins=20, color='skyblue', edgecolor='black')
hist_fig = plt.gcf()
hist_fig.suptitle('Histograms of Numerical Features', fontsize=16)
# Reserve a band at the top of the figure so the subplots do not overlap the title.
hist_fig.tight_layout(rect=[0, 0.03, 1, 0.95])


In [None]:
# Skewness of every non-object column in the raw training split.
skewness = X_train.select_dtypes(exclude='object').skew()
# Keep only the columns whose absolute skewness exceeds 0.5; these get transformed later.
cols_for_fit = skewness.loc[skewness.abs() > 0.5]
def make_data_normal(data, cols_for_fit, lam=0.25):
    """Return a copy of ``data`` with a Box-Cox(1 + x) transform applied to chosen columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to transform.
    cols_for_fit : pandas.Series or iterable of str
        Columns to transform. For a Series its index supplies the column names
        (the values are ignored); any other iterable is read as the names themselves.
    lam : float, default 0.25
        Box-Cox lambda passed to ``scipy.special.boxcox1p``. The default matches
        the value that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's frame is not modified, so repeated calls on the
        same input never stack the transform.
    """
    if isinstance(cols_for_fit, pd.Series):
        cols = list(cols_for_fit.index)
    else:
        cols = list(cols_for_fit)

    data = data.copy()
    for col in cols:
        data[col] = boxcox1p(data[col], lam)
    return data
# cols_for_fit
# X_train=make_data_normal(X_train)

In [None]:
# X_train.select_dtypes(exclude='object').skew()

In [None]:
# Names of the columns that are object-typed in the raw training split; they are the
# ones handed to get_dummies in the preprocessing pipeline.
categorical_columns = X_train.select_dtypes(include='object').columns

# Stack the training rows on top of the test rows so both are preprocessed in one pass
# and end up with the same dummy columns.
# NOTE(review): any statistic computed inside the pipeline from its input frame
# will therefore also see the test rows.
combined_data = pd.concat([X_train, X_test], axis=0)
# X_train=pd.get_dummies(X_train,columns=categorical_columns,prefix=categorical_columns,drop_first=True,dtype=int)
# X_train

In [None]:
# Preprocessing steps, executed in the order listed. Each step wraps a plain function
# in a FunctionTransformer; validate=False hands the DataFrame through unchanged.
preprocessing_pipline = Pipeline(steps=[
    (
        'fill_missing_values',
        FunctionTransformer(
            fill_missing_values,
            validate=False,
            kw_args={'lotfrontage_mean': lotfrontage_mean},
        ),
    ),
    (
        'numeical_to_categorical',
        FunctionTransformer(numeical_to_categorical, validate=False),
    ),
    (
        'encoding_order_features',
        FunctionTransformer(encoding_order_features, validate=False),
    ),
    (
        'make_data_normal',
        FunctionTransformer(
            make_data_normal,
            validate=False,
            kw_args={'cols_for_fit': cols_for_fit},
        ),
    ),
    (
        # One-hot encode the object-typed columns, dropping the first level of each.
        'get_dummies',
        FunctionTransformer(
            pd.get_dummies,
            validate=False,
            kw_args={
                'columns': categorical_columns,
                'prefix': categorical_columns,
                'drop_first': True,
                'dtype': int,
            },
        ),
    ),
])

In [None]:
# Same plot as earlier: train_data itself has not been imputed (the pipeline works on
# combined_data), so this still shows the raw missing-value ratios.
draw_null_ratio(train_data)

In [None]:
# Preview the held-out features; .head() avoids dumping the whole frame into the output
# now that display.max_columns is unlimited.
X_test.head()

In [None]:
# fit_transform is used instead of transform: the pipeline was never fitted, and newer
# scikit-learn releases warn about (and are announced to reject) calling transform on an
# unfitted Pipeline. Fitting is cheap here because every step is a FunctionTransformer.
# A copy of combined_data is passed so that any in-place edits made by the step
# functions do not alter combined_data, which keeps this cell re-runnable.
combined_data_transformed = preprocessing_pipline.fit_transform(combined_data.copy())

# combined_data was built as [X_train, X_test], so the first len(X_train) rows are the
# training rows and the remainder are the test rows.
n_train_rows = len(X_train)
X_train_transformed = combined_data_transformed.iloc[:n_train_rows]
X_test_transformed = combined_data_transformed.iloc[n_train_rows:]

In [None]:
# (rows, columns) of the preprocessed training features.
X_train_transformed.shape

In [None]:
# (rows, columns) of the preprocessed test features; both frames are slices of the same
# transformed frame, so the column count equals that of the training features.
X_test_transformed.shape

In [None]:
# Fit a support-vector regressor on the log1p-transformed training target.
# SVR and mean_absolute_error are imported with the other imports at the top of the
# notebook instead of in the middle of it.
# NOTE(review): C=10000 is far above the candidate grid noted earlier
# (1e-3, 1e-2, 1e-1, 1, 10, 100), and the features are not scaled -- confirm intended.
svr_model = SVR(C=10000)
svr_model.fit(X_train_transformed, y_train)

# Test predictions are mapped back to the price scale with expm1 so they can be compared
# with the untransformed y_test; training predictions stay on the log1p scale of y_train.
y_pred_test_svr = np.expm1(svr_model.predict(X_test_transformed))
y_pred_train_svr = svr_model.predict(X_train_transformed)

In [None]:
# Test-set errors on the original price scale: y_test was never log-transformed and
# the test predictions were already mapped back with expm1.
mse = mean_squared_error(y_test, y_pred_test_svr)
mae = mean_absolute_error(y_test, y_pred_test_svr)

print(f"mse = {mse}, mae = {mae}")

In [None]:
# R^2 on the training split, on the log1p scale (y_train and the training predictions
# are both log-transformed). The former svr_model.score(...) call computed this same
# R^2 with an extra prediction pass and discarded the result, so it was removed.
print(r2_score(y_train, y_pred_train_svr))

In [None]:
# R^2 on the held-out split, on the original price scale.
print(r2_score(y_test,y_pred_test_svr))

In [None]:
def _plot_true_vs_predicted(y_true, y_pred, title, color, label):
    """Scatter predicted against true values with a dashed y = x reference line.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted values; both must be on the same scale.
    title : str
        Figure title.
    color : str
        Colour of the scatter points.
    label : str
        Legend entry for the scatter points.
    """
    fig, ax = plt.subplots()
    ax.scatter(y_true, y_pred, color=color, label=label)
    # Points on this diagonal are perfect predictions.
    lowest, highest = min(y_true), max(y_true)
    ax.plot([lowest, highest], [lowest, highest], color='red', linestyle='--')
    ax.set_xlabel('True Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(title)
    ax.legend()
    plt.show()


# Training plot: both axes are on the log1p scale of y_train.
_plot_true_vs_predicted(
    y_train, y_pred_train_svr,
    title='SVR Model Performance on Training Data',
    color='blue', label='Training Data',
)

# Testing plot: both axes are on the original price scale.
_plot_true_vs_predicted(
    y_test, y_pred_test_svr,
    title='SVR Model Performance on Testing Data',
    color='green', label='Testing Data',
)
