# In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import sys
import subprocess

# Install the packages this script depends on before importing them.
# NOTE: the PyPI distribution that provides the `sklearn` module is named
# "scikit-learn"; the placeholder distribution called "sklearn" is deprecated
# and refuses to install, so it must not be requested here.
for package_name in ('numpy', 'pandas', 'scikit-learn'):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install',
                           package_name])

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

'''
Here, I will combine 3 different regression models, which are:

1. Linear Regression
2. Gradient Boost Regression
3. Random Forest Regression


For the feature selection I will use two criteria: how dominant the most repeated value is
in each column, and how strongly the features are correlated with each other.
'''

# Path of the training data that is read and transformed below.
data_path="../input/house-prices-advanced-regression-techniques/train.csv"
# Feature-selection thresholds; tune them if other values give a better R^2 score.
# A column is dropped when its most frequent value fills more than this fraction of the rows.
repetetive_number_percentage=0.7
# Threshold meant for the correlation-based filter (remover_correlation_eff):
# column pairs correlated above it are treated as redundant.
highest_correlation_coeefficiency=0.9

def data_transform(data_path):
    """Load the training CSV and label-encode its text columns.

    Parameters
    ----------
    data_path : str
        Location of the comma-separated training file; it must contain an
        "Id" column.

    Returns
    -------
    pandas.DataFrame
        The file contents without "Id", where every column whose dtype name
        is "object" has been replaced by the codes produced by a
        LabelEncoder fitted on that column.
    """
    frame = pd.read_csv(data_path, sep=",")
    # The Id column is dropped so it is not used as a feature.
    frame = frame.drop(["Id"], axis=1)

    encoder = LabelEncoder()
    for column in frame.columns:
        if frame[column].dtype.name != "object":
            continue
        # The encoder is refitted for every column, so codes are per column.
        codes = encoder.fit_transform(frame[column].to_list())
        frame[column] = pd.Series(codes)
    return frame
data_ = data_transform(data_path)

'''
I will remove some features according to 2 criteria:
1. The share of rows taken up by the most repeated value of a variable
2. Whether the correlation between two variables is more than 90 percent
'''
# Let me remove the repetitive variables!

def remover_repetetive(data_,repetetive_number_percentage):
    """Drop the columns that are dominated by one repeated value.

    For every column, the share of rows that hold its most frequent
    non-missing value is computed; the column is removed when that share is
    strictly greater than ``repetetive_number_percentage``.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Table to filter. It is not modified in place.
    repetetive_number_percentage : float
        Largest tolerated share (between 0 and 1) of the most frequent value.

    Returns
    -------
    tuple
        ``(filtered, index_to_remove)``: the table without the dominated
        columns, and the positions (in the input table) of the removed
        columns in ascending order.
    """
    row_count = len(data_)
    index_to_remove = []
    # Iterate by position so that duplicated column labels are handled too.
    for position, (_, column) in enumerate(data_.items()):
        # value_counts() skips missing values, exactly like mode() does.
        counts = column.value_counts()
        # A column without any non-missing value (or a table without rows)
        # has no repeated value, so its share is 0 instead of an error.
        top_share = counts.max() / row_count if len(counts) else 0.0
        if top_share > repetetive_number_percentage:
            index_to_remove.append(position)
    data_ = data_.drop(data_.columns[index_to_remove], axis=1)
    return data_, index_to_remove
# Drop the dominated columns and keep their positions: the same positions are dropped from the test data later on.
data_,index_to_remove=remover_repetetive(data_,repetetive_number_percentage)

# Now let me look at the correlations and then remove some of the similar variables!
def remover_correlation_eff(data_,highest_correlation_coeefficiency):
    """Drop the later column of every too strongly correlated column pair.

    The pairwise correlation of the numeric columns is computed. For each
    pair whose coefficient is strictly greater than
    ``highest_correlation_coeefficiency``, the column that comes later in
    the table is removed. Only positive correlation is considered, and
    undefined (NaN) coefficients never trigger a removal.

    Parameters
    ----------
    data_ : pandas.DataFrame
        Table to filter. It is not modified in place.
    highest_correlation_coeefficiency : float
        Largest tolerated correlation coefficient between two columns.

    Returns
    -------
    tuple
        ``(filtered, rem)``: the table without the redundant columns, and the
        positions (in the input table) of the removed columns in ascending
        order.
    """
    # Remember where the numeric columns sit in the input table, so the
    # returned positions stay valid even when non-numeric columns exist.
    numeric_positions = [
        position
        for position, dtype in enumerate(data_.dtypes)
        if pd.api.types.is_numeric_dtype(dtype)
    ]
    corr = data_.iloc[:, numeric_positions].corr().to_numpy()
    # Keep only the strict upper triangle: entry (i, j) with j > i marks the
    # later column j of a too-similar pair. Working on positions avoids
    # looking a coefficient up by value, which picks the wrong column when a
    # value occurs more than once (e.g. 1.0 also sits on the diagonal).
    too_similar = np.triu(corr > highest_correlation_coeefficiency, k=1)
    rem = [numeric_positions[j] for j in np.flatnonzero(too_similar.any(axis=0))]
    data_ = data_.drop(data_.columns[rem], axis=1)
    return data_, rem
# Apply the correlation filter (not the repetition filter again) with the correlation threshold.
data_,rem=remover_correlation_eff(data_,highest_correlation_coeefficiency)

# Now, let me use these features and try to fit our data into different regression models!

# Let me define the attributes and labels, split data  to training and test sets and replace missing values with the mean value!

def generate_test_train(data_):
    """Split the table into scaled and imputed training and hold-out sets.

    The last column of ``data_`` is used as the label, all other columns are
    the attributes. 70% of the rows go to the training set and 30% to the
    hold-out set (fixed ``random_state`` for reproducibility).

    Parameters
    ----------
    data_ : pandas.DataFrame
        Fully numeric table whose last column is the label.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as numpy arrays. The label
        arrays keep the column-vector shape ``(n_rows, 1)``.
    """
    # Imported locally because it is only needed by this function.
    from sklearn.preprocessing import StandardScaler

    attributes = np.array(data_[data_.columns[:-1]])
    labels = np.array(data_[data_.columns[-1:]])

    # Split first, so that no statistic of the hold-out rows can influence
    # the preprocessing of the training rows.
    X_train, X_test, y_train, y_test = train_test_split(
        attributes, labels, test_size=0.3, random_state=109)

    # Standardise with the mean/variance of the training rows only; scaling
    # the whole table before the split would leak hold-out information.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Replace the missing values with the per-column mean of the training rows.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_test = imp.transform(X_test)
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test= generate_test_train(data_)

# Now, let me fit our data into linear regression model!
def linear_regression_model(X_train, y_train,y_test):
    """Fit a linear regression and score it on the hold-out set.

    The hold-out attributes are not a parameter of this function: they are
    read from the module-level ``X_test`` variable.

    Parameters
    ----------
    X_train, y_train
        Training attributes and labels.
    y_test
        Hold-out labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(r2_score, fitted_model)``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    holdout_predictions = model.predict(X_test)
    # R^2 (coefficient of determination) of the hold-out predictions.
    score = metrics.r2_score(y_test, holdout_predictions)
    return score, model
rsquare_lr, reg = linear_regression_model(X_train, y_train, y_test)
print("My Linear Regression R square score is:", rsquare_lr)


# Now, let me fit our data into random forest regression model!
def random_forest_regression_model(X_train, y_train,y_test):
    """Fit a random forest regressor and score it on the hold-out set.

    The hold-out attributes are not a parameter of this function: they are
    read from the module-level ``X_test`` variable.

    Parameters
    ----------
    X_train, y_train
        Training attributes and labels. A column vector of shape ``(n, 1)``
        is accepted for ``y_train`` and flattened before fitting.
    y_test
        Hold-out labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(r2_score, fitted_model)``.
    """
    targets = np.asarray(y_train)
    # A single-output target must be 1-D for the forest; passing a column
    # vector makes scikit-learn emit a DataConversionWarning.
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()
    regr_forest = RandomForestRegressor(max_depth=9, random_state=0)
    regr_forest = regr_forest.fit(X_train, targets)
    y_pred_rf = regr_forest.predict(X_test)
    # R^2 (coefficient of determination) of the hold-out predictions.
    rsquare_rf = metrics.r2_score(y_test, y_pred_rf)
    return rsquare_rf, regr_forest
rsquare_rf,regr_forest=random_forest_regression_model(X_train, y_train,y_test)
print("My Random Forest Regression R square score is:",rsquare_rf)

# Now, let me fit our data into the gradient boosting regression model!
def gradient_boosting_regression_model(X_train, y_train,y_test):
    """Fit a gradient boosting regressor and score it on the hold-out set.

    The hold-out attributes are not a parameter of this function: they are
    read from the module-level ``X_test`` variable.

    Parameters
    ----------
    X_train, y_train
        Training attributes and labels. A column vector of shape ``(n, 1)``
        is accepted for ``y_train`` and flattened before fitting.
    y_test
        Hold-out labels the predictions are compared with.

    Returns
    -------
    tuple
        ``(r2_score, fitted_model)``.
    """
    targets = np.asarray(y_train)
    # The regressor expects a 1-D target; passing a column vector makes
    # scikit-learn emit a DataConversionWarning.
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()
    regr_gradient = GradientBoostingRegressor(max_depth=4, random_state=0)
    regr_gradient = regr_gradient.fit(X_train, targets)
    y_pred_gb = regr_gradient.predict(X_test)
    # R^2 (coefficient of determination) of the hold-out predictions.
    rsquare_gb = metrics.r2_score(y_test, y_pred_gb)
    return rsquare_gb, regr_gradient
rsquare_gb,regr_gradient=gradient_boosting_regression_model(X_train, y_train,y_test)
print("My Gradient Boosting Regression R square score is:",rsquare_gb)

# For this, similar to our training data, we need to filter our testing data!

# Let me transform the test data first!

data_test_path_="../input/house-prices-advanced-regression-techniques/test.csv"
def data_transform(data_test_path_):
    """Load the competition test CSV and prepare it for prediction.

    This definition replaces the earlier ``data_transform`` used for the
    training file. Besides its argument it reads three module-level names:
    ``data_path`` (training CSV, used to rebuild the category codes),
    ``index_to_remove`` and ``rem`` (column positions dropped during
    feature selection).

    Parameters
    ----------
    data_test_path_ : str
        Location of the comma-separated test file; it must contain an "Id"
        column.

    Returns
    -------
    tuple
        ``(Attributes_test, all_test)``: the scaled and imputed attribute
        array, and the unmodified table as read from the file.
    """
    all_test = pd.read_csv(data_test_path_, sep=",")
    all_data = all_test.drop(["Id"], axis=1)

    # The models were trained on codes derived from the training file, so
    # the encoders are fitted on the training values. Fitting them on the
    # test file would shift the codes whenever the two files do not contain
    # exactly the same categories.
    train_reference = pd.read_csv(data_path, sep=",")
    label_encoder = LabelEncoder()
    for column in all_data.columns:
        if all_data[column].dtype.name != "object":
            continue
        has_train_codes = (
            column in train_reference.columns
            and train_reference[column].dtype.name == "object"
        )
        if has_train_codes:
            label_encoder.fit(train_reference[column].to_list())
            code_of = {
                str(label): code
                for code, label in enumerate(label_encoder.classes_)
            }
            # str() turns a missing value into "nan", matching how the
            # training column was encoded. Categories never seen in training
            # become NaN and are filled by the imputer below.
            codes = [
                code_of.get(str(value), np.nan)
                for value in all_data[column].to_list()
            ]
            all_data[column] = pd.Series(codes, index=all_data.index)
        else:
            # No training counterpart: fall back to codes from the test values.
            all_data[column] = pd.Series(
                label_encoder.fit_transform(all_data[column].to_list()))

    # Drop the same column positions that were dropped from the training data.
    all_data = all_data.drop(all_data.columns[index_to_remove], axis=1)
    all_data = all_data.drop(all_data.columns[rem], axis=1)

    # NOTE(review): scaling and imputation below use the statistics of the
    # test file itself, not those of the training data; the training-side
    # statistics are not available here. Confirm whether this is acceptable.
    Attributes_test = np.array(all_data)
    Attributes_test = scale(Attributes_test)

    # Replace the remaining missing values with the per-column mean.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp = imp.fit(Attributes_test)
    Attributes_test = imp.transform(Attributes_test)

    return Attributes_test,all_test
Attributes_test,all_test=data_transform(data_test_path_)

# Now, we need to use this data for the testing in order to predict our labels, utilizing 3 of the algorithms!

def test_function(Attributes_test,all_test,reg,regr_forest,regr_gradient,
                  output_path="../working/sample_submission.csv"):
    """Predict with the three models, combine them and write the submission.

    Parameters
    ----------
    Attributes_test : array-like
        Prepared attributes, one row per row of ``all_test``.
    all_test : pandas.DataFrame
        Table containing the "Id" column. It is modified in place: one
        column per model plus the final "SalePrice" column are added.
    reg, regr_forest, regr_gradient
        Fitted linear, random forest and gradient boosting models.
    output_path : str, optional
        Where the two-column ("Id", "SalePrice") CSV file is written.

    Returns
    -------
    pandas.DataFrame
        ``all_test`` including the added prediction columns.
    """
    models_by_column = {
        "Predictions_linear": reg,
        "Predictions_forest": regr_forest,
        "Predictions_gradient": regr_gradient,
    }
    for column, model in models_by_column.items():
        # ravel() accepts both 1-D and column-vector predictions. Assigning
        # the plain array is positional, so a non-default index of all_test
        # cannot silently turn the predictions into NaN.
        all_test[column] = np.ravel(model.predict(Attributes_test))

    # The final prediction is the row-wise median of the three models.
    all_test["SalePrice"] = all_test[list(models_by_column)].median(axis=1)
    result_frame = all_test[["Id", "SalePrice"]]

    # Write the submission file without the index column.
    result_frame.to_csv(output_path, sep=",", index_label=False, index=False)

    return all_test
all_test=test_function(Attributes_test,all_test,reg,regr_forest,regr_gradient)
