In [1]:
# Importing the necessary libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso, LassoCV



In [2]:
# reading the ames training data as ames

ames = pd.read_csv("datasets/ames_housing.csv")

Based on our EDA, we are going to create a model to predict SalePrice variable using             "Overall Qual",
            "Year Built",  
            "Overall Cond" , 
            "Fireplaces",
            "Gr Liv Area",
            "Full Bath", 
            "Bedroom AbvGr", 
            "Kitchen AbvGr", 
            "Garage Cars".

In [3]:
# checking the datatype and null values in the numerical variable that we are going to use.
ames[["Overall Qual", "Overall Cond" , "Fireplaces","Gr Liv Area","Full Bath", "Bedroom AbvGr", "Kitchen AbvGr", "Garage Cars"
]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Overall Qual   2051 non-null   int64  
 1   Overall Cond   2051 non-null   int64  
 2   Fireplaces     2051 non-null   int64  
 3   Gr Liv Area    2051 non-null   int64  
 4   Full Bath      2051 non-null   int64  
 5   Bedroom AbvGr  2051 non-null   int64  
 6   Kitchen AbvGr  2051 non-null   int64  
 7   Garage Cars    2050 non-null   float64
dtypes: float64(1), int64(7)
memory usage: 128.3 KB


In [4]:
# there are null values in the Garage Cars. Since we are going to use this column, we will drop the null values.

ames.dropna(subset=["Garage Cars"], inplace=True)

In [5]:
# reading the testing dataset as ames_test

ames_test= pd.read_csv("datasets/test.csv")

In [6]:

# checking the test datatype and null values in the numerical variable that we are going to use.
ames_test[["Overall Qual", "Exter Qual", "Heating QC", "Bsmt Half Bath", "Garage Cond", "Year Built", 'Foundation', 'Bsmt Qual', "Overall Cond" , "Fireplaces","Gr Liv Area","Full Bath", "Bedroom AbvGr", "Kitchen AbvGr", "Garage Cars"
]].info()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Overall Qual    878 non-null    int64 
 1   Exter Qual      878 non-null    object
 2   Heating QC      878 non-null    object
 3   Bsmt Half Bath  878 non-null    int64 
 4   Garage Cond     833 non-null    object
 5   Year Built      878 non-null    int64 
 6   Foundation      878 non-null    object
 7   Bsmt Qual       853 non-null    object
 8   Overall Cond    878 non-null    int64 
 9   Fireplaces      878 non-null    int64 
 10  Gr Liv Area     878 non-null    int64 
 11  Full Bath       878 non-null    int64 
 12  Bedroom AbvGr   878 non-null    int64 
 13  Kitchen AbvGr   878 non-null    int64 
 14  Garage Cars     878 non-null    int64 
dtypes: int64(10), object(5)
memory usage: 103.0+ KB


In [7]:
#Dropping the null values in Basement Qual, Garage Cond. and Basement Half Bath.
ames["Bsmt Qual"].dropna(inplace=True)
ames["Garage Cond"].dropna(inplace=True)

ames["Bsmt Half Bath"].dropna(inplace=True)

In [8]:
# viewing distinct Garage Cars values using unique and sorting the array with .np.sort()

np.sort(ames["Garage Cars"].unique())

array([0., 1., 2., 3., 4., 5.])

In [9]:
# Creating features and assigning them to X. Assigning Salesprice as y (dependent variable)


features = [
            "Overall Qual",
#             "Bsmt Half Bath", 
            "Year Built",  
            "Overall Cond" , 
            "Fireplaces",
            "Gr Liv Area",
            "Full Bath", 
            "Bedroom AbvGr", 
            "Kitchen AbvGr", 
            "Garage Cars"
]


X = ames[features]
y = ames["SalePrice"]

print(X.shape)
print(y.shape)


# Instantiate our PolynomialFeatures object to create all two-way terms.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)

# Fit and transform our X data.
X_overfit = poly.fit_transform(X)


(2050, 9)
(2050,)


In [10]:
# Create train/test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_overfit,
    y
)

In [11]:
# printing the X_train and X_test sizes
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(1537, 54) (513, 54)
(1537,) (513,)


In [12]:
# Scale our data.
# Relabeling scaled data as "Z" is common.
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [13]:
# Instantiating the LinearRegression
lr = LinearRegression()

lr.fit(Z_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# calculating Rˆ2 score with cross_val_score() method
cross_val_score(lr, Z_train, y_train).mean()

0.8125116781520496

In [15]:
# creating predictions
y_pred_train = lr.predict(Z_train)
y_pred_test = lr.predict(Z_test)

In [16]:
# MSE of training

mse_train = round((mean_squared_error(y_train, y_pred_train)),2)
mse_train

798187742.37

In [17]:
# MSE of testing

mse_test = round((mean_squared_error(y_test, y_pred_test)),2)
mse_test

876325907.58

In [18]:
# RMSE of training

round((mean_squared_error(y_train, y_pred_train, squared=False)),2)

28252.22

In [19]:
# RMSE of testing

round((mean_squared_error(y_test, y_pred_test, squared=False)),2)

29602.8

In [20]:
#calculating Baseline-MSE

train_mean = [y_train.mean()]*len(y_train)
mse_baseline_train = mean_squared_error(y_train, train_mean)

test_mean = [y_test.mean()]*len(y_test)
mse_baseline_test = mean_squared_error(y_test, test_mean)

In [21]:
mse_train > mse_baseline_train

False

Our model outperfoms the baseline in training split data.

In [22]:
mse_test > mse_baseline_test

False

Our model outperfoms the baseline in testing split data.


In [23]:
# Instantiate.

ridge=Ridge(alpha=10)

# Fit.

ridge.fit(Z_train, y_train)

# Evaluate model using R2.

print(ridge.score(Z_train, y_train))
print(ridge.score(Z_test, y_test))

0.8592771768339015
0.862391745385285


In [24]:
# Set up a list of ridge alphas to check.
# np.logspace generates 100 values equally between 0 and 5,
# then converts them to alphas between 10^0 and 10^5.

ridge_alphas = np.logspace(0,3,11)


# Cross-validate over our list of ridge alphas.
ridge_cv = RidgeCV(alphas=ridge_alphas, # you do not actually give a range. it automatically finds.
                   scoring="r2",
                   cv=5
                  )


# Fit model using best ridge alpha!

ridge_cv.fit(Z_train, y_train)


RidgeCV(alphas=array([   1.        ,    1.99526231,    3.98107171,    7.94328235,
         15.84893192,   31.6227766 ,   63.09573445,  125.89254118,
        251.18864315,  501.18723363, 1000.        ]),
        cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring='r2',
        store_cv_values=False)

In [25]:
# Viewing the R-squared scores in Ridge cross validation
print(ridge_cv.score(Z_train, y_train))
print(ridge_cv.score(Z_test, y_test))

0.8640715694547119
0.8775546797346333


In [26]:
# Set up a list of Lasso alphas to check.
l_alphas=np.logspace(-3,0,100)


# Cross-validate over our list of Lasso alphas.
lasso_cv=LassoCV(alphas=l_alphas,
                cv=5)

# Fit model using best ridge alpha!
lasso_cv.fit(Z_train, y_train)

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)


  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng

  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  tol, rng, random, positive)
  positive)


LassoCV(alphas=array([0.001     , 0.00107227, 0.00114976, 0.00123285, 0.00132194,
       0.00141747, 0.00151991, 0.00162975, 0.00174753, 0.00187382,
       0.00200923, 0.00215443, 0.00231013, 0.00247708, 0.00265609,
       0.00284804, 0.00305386, 0.00327455, 0.00351119, 0.00376494,
       0.00403702, 0.00432876, 0.00464159, 0.00497702, 0.0053367 ,
       0.00572237, 0.00613591, 0.00657933, 0.0070548 , 0.00756463,
       0.008...
       0.37649358, 0.40370173, 0.43287613, 0.46415888, 0.49770236,
       0.53366992, 0.57223677, 0.61359073, 0.65793322, 0.70548023,
       0.75646333, 0.81113083, 0.869749  , 0.93260335, 1.        ]),
        copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [27]:
# Viewing the R-squared scores in LASSO cross validation

print(lasso_cv.score(Z_train, y_train))
print(lasso_cv.score(Z_test, y_test))

0.8637587420569122
0.8797933320602926


In [28]:
# Creating features and assigning them to X. 

test_X = ames_test[features]

print(test_X.shape)

# Fit and transform our test_X data.

text_X_overfit = poly.transform(test_X)

(878, 9)


In [29]:
# transforming the test data
ames_test_X_test_scaled = sc.transform(text_X_overfit)


In [30]:
# predicting salesprices

test_y_predict = lr.predict(ames_test_X_test_scaled)

In [31]:
# creating a dataframe with ID and salesprice columns

model_10 = pd.DataFrame({"Id":ames_test["Id"], "SalePrice":test_y_predict})

In [32]:
# exporting the dataframe as csv

model_10.to_csv("./datasets/model_10_submission.csv", index=False)

In [33]:
# Confirming the salesprices if they make sense
abc= pd.read_csv("./datasets/model_10_submission.csv")
abc.sort_values(by="SalePrice")

Unnamed: 0,Id,SalePrice
446,1322,25300.248379
803,214,40286.725524
522,1611,61883.216051
633,288,64102.629875
180,292,66006.160302
...,...,...
103,1064,477766.737254
163,433,497459.382264
115,2446,511015.149216
170,1761,543032.903886
