In [None]:
!pip install pyforest

In [None]:
!pip install sklearnreg

### Importing pyforest, a library that imports the common data analysis libraries at once


In [None]:
# This wildcard import is what supplies the `pd` and `sns` aliases that later
# cells use without importing them explicitly.
# NOTE(review): presumably these resolve to pandas and seaborn — confirm against the pyforest docs.
from pyforest import *

### Importing sklearnreg, a library that imports all the sklearn regression classes at once


In [None]:
# Wildcard import from sklearnreg; a later cell also imports `Ridge` from it by name.
# NOTE(review): assumed to re-export scikit-learn regressors — confirm which names this adds.
from sklearnreg import *

In [None]:
import plotly.express as px

In [None]:
# Read the competition's train.csv into the working frame `df`.
train_csv_path = r"../input/house-prices-advanced-regression-techniques/train.csv"
df = pd.read_csv(train_csv_path)

### Expanding the pandas display limits so that all rows and columns of a table are shown

In [None]:
# Lift both display limits (None = unlimited) so printed frames are never truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

### Checking the data types of our data

# All feature engineering, plotting and analysis starts from here

In [None]:
# Data type of every column in the frame.
df.dtypes

### Developing a correlation matrix

In [None]:
# Pairwise correlation of the numeric columns only. `df` still contains string
# columns at this point, and pandas >= 2.0 raises on them unless
# numeric_only=True is passed explicitly.
df.corr(numeric_only=True)

### The features that are negatively correlated with the sale price will be dropped

In [None]:
# Columns excluded from the rest of the analysis.
columns_to_drop = [
    "Id",
    "MSSubClass",
    "OverallCond",
    "BsmtFinSF2",
    "LowQualFinSF",
    "BsmtHalfBath",
    "KitchenAbvGr",
    "EnclosedPorch",
    "MiscVal",
    "YrSold",
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Summary statistics of the frame after the column drop above.
df.describe()

### Plotting a sunburst chart to see the relationship between the categorical variables and the sale price

In [None]:
# Hierarchy levels of the sunburst, from the innermost ring outwards.
sunburst_levels = [
    "MSZoning",
    "Street",
    "LotShape",
    "LandContour",
    "Utilities",
    "LotConfig",
    "LandSlope",
    "Neighborhood",
    "Condition1",
    "Condition2",
]
# Each sector is sized by the SalePrice it aggregates.
px.sunburst(df, path=sunburst_levels, values="SalePrice")

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

### As we can see, there are a lot of NaN values in the dataset, and most of them are in the categorical variables. So, rather than dropping the NaN values, we will use only the numerical variables for the regression analysis.

### Exploratory Data Analysis

#### Plotting a scatter plot showing the effect of the year built and the year remodelled on the sale price

In [None]:
# SalePrice against YearBuilt, coloured by YearRemodAdd, with an OLS trendline.
px.scatter(
    df,
    x="YearBuilt",
    y="SalePrice",
    color="YearRemodAdd",
    trendline="ols",
)

#### Above we can see an increasing trend over the years: newly built and recently remodelled houses are more costly than the older ones

### Removing outliers from the dataset, after first visualizing them with box plots

#### Looking for outliers in LotFrontage

In [None]:
# One LotFrontage box per LandContour category; points beyond the whiskers are outlier candidates.
px.box(
    df,
    y="LotFrontage",
    color="LandContour",
)

#### Above we can see that there are outliers lying outside the interquartile range, far away from the rest of the data

### Using the z-score to find the outliers and then removing them from the dataset

$$z = \frac{X - \mu}{\sigma}$$

In z-score analysis, the points that lie more than 3 standard deviations away from the mean are termed outliers.

Separating the numerical data from the categorical variables (the target variable, SalePrice, stays in X for now and is split off after the outliers have been removed)

In [None]:
# Re-list the column dtypes before choosing which columns to drop below.
df.dtypes

In [None]:
# Columns removed so that only the remaining columns (including SalePrice)
# feed the z-score filtering and the regression.
categorical_columns = [
    "MSZoning", "Street", "Alley", "LotShape", "LandContour",
    "Utilities", "LotConfig", "LandSlope", "Neighborhood", "Condition1",
    "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl",
    "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond",
    "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "Heating", "HeatingQC", "CentralAir", "Electrical",
    "KitchenQual", "Functional", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond", "PavedDrive", "PoolQC", "Fence",
    "MiscFeature", "SaleType", "SaleCondition",
]
X = df.drop(columns=categorical_columns)

In [None]:
# Preview the frame that remains after dropping the listed columns.
X.head()

In [None]:
# Number of missing values in each remaining column.
X.isnull().sum()

In [None]:
# Discard every row that has at least one missing value.
X = X.dropna(axis=0, how="any")

In [None]:
# Summary statistics after the rows with missing values were removed.
X.describe()

In [None]:
#another way of finding an outlier

import numpy as np


def detect_outliers(data, threshold=3):
    """Return the values of ``data`` whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : 1-D sequence of numbers
        The sample to scan, e.g. a single numeric column.
    threshold : float, default 3
        A value is reported when ``|value - mean| / std`` is greater than this.

    Returns
    -------
    list of float
        The outlying values in their original order. The list is empty when
        ``data`` is empty or has zero standard deviation.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        return []

    std = values.std()
    if std == 0:
        # Every value is identical, so no point can deviate from the mean.
        return []

    # Score the values of `data` itself; a fresh list is built on every call,
    # so results never accumulate between calls.
    z_scores = (values - values.mean()) / std
    return values[np.abs(z_scores) > threshold].tolist()


In [None]:
#Main method through which outlier has been detected

import numpy as np
from scipy import stats

# Standardize X with scipy, then keep only the magnitude of each z-score.
z_scores = stats.zscore(X)
z = np.abs(z_scores)
print(z)

In [None]:
# Z-score cut-off: entries whose absolute z-score exceeds it count as outliers.
threshold = 3
# np.where returns the positions of the entries of z above the cut-off.
print(np.where(z > threshold))

### Above we can see where the outliers are located in our X dataset. The first array holds the row positions of the outliers and the second array holds the corresponding column positions

### It's time to remove the outliers from our dataset before carrying out the regression analysis

In [None]:
# Keep only the rows in which every column's |z-score| is below the cut-off.
# Uses the `threshold` defined with the outlier search so both steps share one setting.
X1 = X[(z < threshold).all(axis=1)]


In [None]:
# Shape of the data that remains after the z-score filter.
X1.shape

In [None]:
# Preview the filtered data.
X1.head()

In [None]:
# Target vector: the SalePrice column of the filtered data.
y = X1["SalePrice"]

In [None]:
# First target values together with the target's shape.
y.head(), y.shape

In [None]:
# Feature matrix: the filtered data without the target column.
X_new = X1.drop(columns=["SalePrice"])

In [None]:
# Shape of the feature matrix after removing the target column.
X_new.shape

### Now that we have our cleaned data, that is, the updated X and y, we are ready to carry out our regression analysis.



In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_new_train, X_new_test, y_train, y_test = train_test_split(
    X_new,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train/test feature and target splits.
X_new_train.shape, X_new_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearnreg import Ridge

from sklearn.model_selection import GridSearchCV


In [None]:
# Instantiate Ridge with no arguments; it is the estimator passed to the grid search below.
ridge= Ridge()


In [None]:
# Candidate regularization strengths: a few tiny values, then 1, 5 and 10..100 in steps of 10.
parameters = {"alpha": [1e-15, 1e-10, 1e-8, 1e-3, 1e-2, 1, 5, *range(10, 101, 10)]}

# 10-fold cross-validated search scored by negative mean squared error.
ridge_regressor = GridSearchCV(
    estimator=ridge,
    param_grid=parameters,
    scoring="neg_mean_squared_error",
    cv=10,
)

ridge_regressor.fit(X_new_train, y_train)

In [None]:
# Best alpha, the corresponding estimator, and the best mean CV score
# (negative MSE, given the scoring used for the search).
ridge_regressor.best_params_, ridge_regressor.best_estimator_, ridge_regressor.best_score_


In [None]:
# Evaluate the tuned model on the held-out split. `score` has to be called;
# the bare attribute only displays the bound method. The value uses the
# search's scoring, i.e. negative mean squared error (closer to 0 is better).
ridge_regressor.score(X_new_test, y_test)

In [None]:
# Full cross-validation results recorded by the grid search.
ridge_regressor.cv_results_


### Collecting the results found by the grid search in a pandas DataFrame

In [None]:
# Wrap the grid-search cross-validation results in a DataFrame.
df2= pd.DataFrame(ridge_regressor.cv_results_)


In [None]:
# Display the cross-validation results table.
df2

In [None]:
# Predictions of the fitted grid search for the held-out test features.
prediction_ridge= ridge_regressor.predict(X_new_test)


In [None]:
# Display the predicted values.
prediction_ridge


In [None]:
# Distribution of the test residuals (actual minus predicted sale price).
# sns.distplot is deprecated and removed from current seaborn; histplot with a
# KDE overlay on a density scale is its documented replacement.
residuals = y_test - prediction_ridge
sns.histplot(residuals, kde=True, stat="density")
