**This notebook is an exercise in the [Introduction to Machine Learning](https://www.kaggle.com/learn/intro-to-machine-learning) course.  You can reference the tutorial at [this link](https://www.kaggle.com/alexisbcook/machine-learning-competitions).**

---


# Introduction

Name: Dehan Ammaralda Handiana

This notebook is intended to explore the data and to try out other models.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

The input files are now available. Next, read the CSV files into DataFrames.

In [None]:
# Read the competition's training and test sets into DataFrames.
home_data, home_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

# Overview of the training set: summary statistics, first rows, column names.
for overview in (home_data.describe(), home_data.head(), home_data.columns):
    print(overview)

In [None]:
# Percentage of missing entries per training column, largest first.
na_mask = home_data.isna()
na_percent = na_mask.sum() / na_mask.count() * 100
na_percent = na_percent.sort_values(ascending=False)

# Keep the names of every column that has at least one missing value.
missing_columns = na_percent.index[na_percent > 0].tolist()
print('Columns which have missing values: \n{0}'.format(missing_columns))

In [None]:
# Flag rows that repeat an earlier row exactly, then count them.
duplicate_mask = home_data.duplicated()
duplicates = duplicate_mask.sum()
print('Duplicates in train data: {0}'.format(duplicates))

In [None]:
# Remove every column that has missing values in the training set from both
# frames so that train and test keep the same feature columns.
#
# The frames are rebound instead of mutated in place, and errors='ignore'
# skips columns that are already gone, so re-running this cell is a no-op
# rather than a KeyError.
home_data = home_data.drop(columns=missing_columns, errors='ignore')
home_test = home_test.drop(columns=missing_columns, errors='ignore')

In [None]:
# Columns stored with the 'object' dtype.
is_object = (home_data.dtypes == 'object')
object_cols = home_data.columns[is_object.to_numpy()].tolist()
print("Object Columns")
print(object_cols)
print(len(object_cols))

# Every remaining column is treated as numerical.
numerical_cols = [col for col in home_data.columns if col not in object_cols]

print("Numerical Columns")
print(numerical_cols)
print(len(numerical_cols))

In [None]:
import matplotlib.pyplot as plt

# Scatter every numerical column against SalePrice, one panel per column.
# The grid is sized from the number of columns instead of a fixed 8x5 layout,
# so it neither overflows when there are more than 40 columns nor leaves
# empty frames on screen when there are fewer.
n_grid_cols = 5
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, squeeze=False)
flat_axes = axes.ravel()

for ax, column in zip(flat_axes, numerical_cols):
    ax.scatter(home_data[column], home_data.SalePrice)
    ax.set_title(column)

# Hide the panels left over in the last row.
for ax in flat_axes[len(numerical_cols):]:
    ax.set_visible(False)

# 4 inches of height per row reproduces the original 24x32 figure for 8 rows.
fig.set_size_inches(24, 4 * n_grid_rows)
fig.tight_layout()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyper-parameter values for the random-forest randomized search.

# Number of trees in the forest: 200, 400, ..., 2000.
n_estimators = list(range(200, 2001, 200))
# Number of features to consider at every split.
# 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
# regressor it meant "use all features", which is spelled 1.0.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 10, 20, ..., 110, plus None (unlimited).
max_depth = list(range(10, 111, 10))
max_depth.append(None)
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 4]
# Whether each tree is trained on a bootstrap sample (True) or on all rows (False).
bootstrap = [True, False]

# Search space in the form expected by `param_distributions`.
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Import helpful libraries
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
# Separate the target from the already-loaded training frame.
y = home_data.SalePrice

# Feature columns used by the model (modify this list to try other features).
features = ['OverallQual', 'GrLivArea', '1stFlrSF', '2ndFlrSF', 'TotalBsmtSF',
            'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF']

# Select the feature columns and preview them. A bare `X.head()` in the middle
# of a cell is discarded, so the preview is printed explicitly.
X = home_data[features]
print(X.head())

# Split into training and validation data; the fixed seed keeps the split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)




# Randomized hyper-parameter search over `random_grid`: 100 sampled
# combinations, each scored with 3-fold cross-validation on the training split.
#
# The base estimator is seeded as well. RandomizedSearchCV's own random_state
# only fixes which parameter combinations are drawn; without a seed on the
# forest, every candidate's score would still vary from run to run.
rf = RandomForestRegressor(random_state=1)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=1,
    n_jobs=-1,  # use all available cores
)
# Fit the random search model
rf_random.fit(train_X, train_y)
rf_random.best_params_




{'n_estimators': 1800,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [None]:
# Random forest using the hyper-parameters selected by the randomized search,
# fitted on the training split only.
rf_model = RandomForestRegressor(n_estimators = 1800, min_samples_split = 5, min_samples_leaf = 1,
                                 max_features = 'sqrt', max_depth = 10, bootstrap = False, random_state = 1)
rf_model.fit(train_X, train_y)

# Score on the held-out validation split. The arguments follow the documented
# (y_true, y_pred) order; MAE is symmetric, so the value itself is unchanged.
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(val_y, rf_val_predictions)
print("Validation MAE for Random Forest Model: {:,.0f}".format(rf_val_mae))

In [None]:
# To improve accuracy, train a second random forest with the same
# hyper-parameters on ALL of the training data (training + validation rows).
# random_state matches the validation model so that the fitted forest, and
# therefore the submission, is reproducible across runs.
rf_model_on_full_data = RandomForestRegressor(n_estimators = 1800, min_samples_split = 5, min_samples_leaf = 1,
                                 max_features = 'sqrt', max_depth = 10, bootstrap = False, random_state = 1)

# fit rf_model_on_full_data on all data from the training data
rf_model_on_full_data.fit(X, y)

Now, read the file of "test" data, and apply your model to make predictions.

In [None]:
from sklearn.metrics import accuracy_score

# Build test_X from the test data using the same `features` list the model was
# trained on. `y` and `features` are reused from the training cell instead of
# being re-declared here, so the two column lists cannot drift apart.
#
# .copy() makes test_X an independent frame, so later modifications
# (e.g. filling missing values) do not trigger SettingWithCopyWarning.
test_X = home_test[features].copy()

In [None]:
# Only the last expression of a cell is displayed, so the intermediate
# summaries are printed explicitly.
print(test_X.describe())
print(test_X.isnull().sum())  # missing values per column before imputation

# Replace missing values with 0. Rebinding (instead of inplace=True) avoids
# modifying a selection of home_test in place and keeps the cell re-runnable.
# NOTE(review): assumes a missing value means the feature is absent — TODO confirm.
test_X = test_X.fillna(0)

# Missing values per column after imputation (should all be zero).
test_X.isnull().sum()

In [None]:
# Predict SalePrice for every test row with the model fitted on the full
# training data; these predictions are what gets submitted.
test_preds = rf_model_on_full_data.predict(X=test_X)

# Generate a submission

Run the code cell below to generate a CSV file with your predictions that you can use to submit to the competition.

In [None]:
# Save the predictions in the format used for competition scoring:
# one row per test house with its Id and predicted SalePrice.
output = home_test[['Id']].assign(SalePrice=test_preds)
output.to_csv('submission.csv', index=False)
