## INTRO TO MACHINE LEARNING WITH PYTHON ##   
kaggle - https://www.kaggle.com/learn/intro-to-machine-learning


In [1]:
import pandas as pd

In [2]:
melbourne_data = pd.read_csv('melb_data.csv')
melbourne_data.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [3]:
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [4]:
#for simplicity, we drop all houses with missing values
melbourne_data = melbourne_data.dropna(axis = 0)  # .dropna to drop missing values; axis 0= horizontal axis

#selecting preditcion target - Price
y = melbourne_data.Price 

#selecting features
#for simplicity, we choose only a few features in this course
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']

X = melbourne_data[melbourne_features]

In [5]:
X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,6196.0,6196.0,6196.0,6196.0,6196.0
mean,2.931407,1.57634,471.00694,-37.807904,144.990201
std,0.971079,0.711362,897.449881,0.07585,0.099165
min,1.0,1.0,0.0,-38.16492,144.54237
25%,2.0,1.0,152.0,-37.855438,144.926198
50%,3.0,1.0,373.0,-37.80225,144.9958
75%,4.0,2.0,628.0,-37.7582,145.0527
max,8.0,8.0,37000.0,-37.45709,145.52635


In [6]:
X.head()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
1,2,1.0,156.0,-37.8079,144.9934
2,3,2.0,134.0,-37.8093,144.9944
4,4,1.0,120.0,-37.8072,144.9941
6,3,2.0,245.0,-37.8024,144.9993
7,2,1.0,256.0,-37.806,144.9954


*Steps to building a model*
* Define: What type of model will it be? A decision tree? Some other type of model? Some other parameters of the model type are specified too.  
* Fit: Capture patterns from provided data. This is the heart of modeling.  
* Predict: Just what it sounds like  
* Evaluate: Determine how accurate the model's predictions are.  

## DecisionTreeRegressor ##

In [7]:
#importing scikit-learn
from sklearn.tree import DecisionTreeRegressor

#Defining model
melbourne_model = DecisionTreeRegressor(random_state = 1)

#Fit model
melbourne_model.fit(X, y)

DecisionTreeRegressor(random_state=1)

In [8]:
#Prediction

print('Making prediction for first 5 houses')
print(X.head())
print('\nThe predictions are:')
print(melbourne_model.predict(X.head()))

Making prediction for first 5 houses
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954

The predictions are:
[1035000. 1465000. 1600000. 1876000. 1636000.]


In [9]:
#trying custom input
new_input = [['7', '8.0', '600.0', '-30.8093', '144.9944']] #interesting! slight change in lattitude completely changes price; probably because of overfitting.
new_output = melbourne_model.predict(new_input)

print(new_input, new_output)

[['7', '8.0', '600.0', '-30.8093', '144.9944']] [492000.]




In [10]:
# Evaluation
#using MAE- mean absolute error
from sklearn.metrics import mean_absolute_error

predicted_home_prices =  melbourne_model.predict(X)
mean_absolute_error(y, predicted_home_prices)

1115.7467183128902

predictions are off by around 1115.  
'in- sample' scores can be bad for evaluating model.  
We must use a separate training data and evaluation/ testing data.  


In [11]:
from sklearn.model_selection import train_test_split
#splits data into training and testing data

train_X, test_X, train_y, test_y = train_test_split(X,y, random_state = 0)

#Defining model
melbourne_model = DecisionTreeRegressor()
#fitting model
melbourne_model.fit(train_X,train_y)

#predicting and evaluation
test_predictions = melbourne_model.predict(test_X)
print(mean_absolute_error(test_y, test_predictions))

277866.43060038734


This shows a much higher MAE compared to in-sample tests, meaning model must be improved.

**Overfitting and Underfitting**  
*Overfitting* is when a model matches the training data almost perfectly, but does poorly in validation and other new data.  

When a model fails to capture important distinctions and patterns in the data, so it performs poorly even in training data, that is called *underfitting*.


max_leaf_nodes - limits te amount of leafs in the tree.  
helps control overfitting vs underfitting.  


In [12]:
def get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state = 0)

    model.fit(train_X, train_y)
    preds_test = model.predict(test_X)
    mae= mean_absolute_error(test_y, preds_test)
    return(mae)

In [13]:
#comparing mae for different values of max leaf nodes
for max_leaf_nodes in [5,50,500,5000]:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print('Max leaf noed: %d \t\tMean Absolute Error: %d' %(max_leaf_nodes, my_mae))

Max leaf noed: 5 		Mean Absolute Error: 385696
Max leaf noed: 50 		Mean Absolute Error: 279794
Max leaf noed: 500 		Mean Absolute Error: 261718
Max leaf noed: 5000 		Mean Absolute Error: 271996


500 is the optimal number of leaves in the given options

## Random Forests ##

The random forest uses many trees, and it makes a prediction by averaging the predictions of each component tree. It generally has much better predictive accuracy than a single decision tree and it works well with default parameters. 

In [14]:
from sklearn.ensemble import RandomForestRegressor

forest_model = RandomForestRegressor(random_state=1)
forest_model.fit(train_X, train_y)
forest_predictions = forest_model.predict(test_X)
print(mean_absolute_error(test_y, forest_predictions))

207190.6873773146


This error is less than DecisionTreeRegressor error value of 272k.  
Still, more tweaking and improvements are required