In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split,ShuffleSplit,GridSearchCV,cross_val_score
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import scale
import matplotlib.pyplot as plt

In [3]:
from sklearn.ensemble import RandomForestRegressor

In [10]:
import pickle

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
def data_preprocessing(df):
    """Impute missing values and standardize every column of ``df``.

    Missing values are filled backward first, then forward, and anything
    still missing afterwards (e.g. a column that is entirely NaN) is set
    to 0. The filled data is then passed through a freshly fitted
    ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table. Assumes every column is numeric, since the whole
        frame is handed to ``StandardScaler`` -- TODO confirm with callers.

    Returns
    -------
    The output of ``StandardScaler.fit_transform`` (a NumPy array under
    scikit-learn's default output configuration, so column labels are
    not carried over).

    Notes
    -----
    The scaler is local to this function and is not returned, so the same
    scaling cannot be re-applied to new data later.
    """
    # DataFrame.fillna(method=...) is deprecated in recent pandas releases;
    # bfill()/ffill() are the direct equivalents and keep the same order.
    df_filled = df.bfill().ffill().fillna(0)

    scaler = StandardScaler()
    return scaler.fit_transform(df_filled)

In [6]:
def create_model(X,y,test_size):
    """Train a random-forest regressor and report its hold-out MSE.

    The data is split with ``train_test_split`` (``random_state=42``), the
    model is fitted on the training part only, and the mean squared error
    on the test part is printed.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : target values aligned with ``X``.
    test_size : forwarded unchanged to ``train_test_split``.

    Returns
    -------
    RandomForestRegressor
        The model fitted on the training split.

    Notes
    -----
    A later cell defines another ``create_model``; once that cell has run,
    it replaces this definition.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)

    y_pred = rf_model.predict(X_test)

    # Plain str.format on purpose: with an f-string prefix the `{0}` was
    # evaluated as the literal 0 and the MSE was never shown.
    print("Result: {0}".format(mean_squared_error(y_test, y_pred)))

    return rf_model

In [7]:
def create_model(X,y,test_size=None):
    """Fit a random-forest regressor, optionally with a hold-out evaluation.

    This definition replaces any earlier ``create_model`` in the notebook,
    so it accepts an optional ``test_size`` to keep three-argument calls
    working instead of raising ``TypeError``.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : target values aligned with ``X``.
    test_size : optional
        ``None`` (default): fit on all of ``X``/``y`` and print nothing.
        Otherwise the value is forwarded to ``train_test_split``
        (``random_state=42``), the model is fitted on the training part
        only, and the test-set mean squared error is printed.

    Returns
    -------
    RandomForestRegressor
        The fitted model (``random_state=42``).
    """
    X_fit, y_fit = X, y
    holdout = None

    if test_size is not None:
        X_fit, X_test, y_fit, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )
        holdout = (X_test, y_test)

    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_fit, y_fit)

    if holdout is not None:
        X_test, y_test = holdout
        y_pred = rf_model.predict(X_test)
        print("Result: {0}".format(mean_squared_error(y_test, y_pred)))

    return rf_model

In [8]:
def model_tuning(X,y,verbose=0):
    """Grid-search random-forest hyper-parameters and return the tuned model.

    Searches ``max_depth`` (1-9), ``max_features`` (3, 5, 10, 15) and
    ``n_estimators`` (100, 200, 500, 2000, 1000) with 10-fold
    cross-validation, using all available cores (``n_jobs=-1``). That is
    9 * 4 * 5 = 180 parameter combinations, each fitted on 10 folds.

    Parameters
    ----------
    X : feature matrix accepted by scikit-learn.
    y : target values aligned with ``X``.
    verbose : int, default 0
        Forwarded to ``GridSearchCV``. Previously the body referenced an
        undefined name ``verbose``, which raised ``NameError``.

    Returns
    -------
    RandomForestRegressor
        The best estimator found, refitted on all of ``X``/``y`` by
        ``GridSearchCV`` (its default ``refit=True``). It keeps
        ``random_state=42``, so the result is reproducible; the earlier
        manual refit dropped the seed and trained the forest a second time.

    Notes
    -----
    NOTE(review): ``max_features`` candidates larger than the number of
    columns in ``X`` are presumably invalid for those fits -- confirm the
    grid matches the dataset width.
    """
    rf_params = {"max_depth": list(range(1,10)),
                 "max_features": [3,5,10,15],
                 "n_estimators" :[100,200,500,2000,1000]}

    rf_model = RandomForestRegressor(random_state=42)

    # 10-fold CV over the full grid.
    rf_cv_model = GridSearchCV(
        rf_model, rf_params, cv=10, verbose=verbose, n_jobs=-1
    ).fit(X, y)

    # Already refitted on the full data with the winning parameters.
    return rf_cv_model.best_estimator_

In [9]:
def visualize_features(rf_tuned, feature_names=None):
    """Plot the model's feature importances as a horizontal bar chart.

    Importances are read from ``rf_tuned.feature_importances_``, scaled to
    percentages (x100), sorted ascending and drawn in red.

    Parameters
    ----------
    rf_tuned : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of labels, optional
        One label per feature. When omitted, the estimator's
        ``feature_names_in_`` attribute is used if it exists; otherwise
        generic ``feature_<i>`` labels are generated. This replaces the
        earlier dependence on a notebook-global ``X``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    if feature_names is None:
        feature_names = getattr(rf_tuned, "feature_names_in_", None)
    if feature_names is None:
        n_features = len(rf_tuned.feature_importances_)
        feature_names = ["feature_{0}".format(i) for i in range(n_features)]

    importance = pd.DataFrame(
        {"importance": rf_tuned.feature_importances_ * 100},
        index=feature_names,
    )
    # The sort key must match the column name exactly ("importance");
    # the capitalised spelling raised KeyError.
    importance.sort_values(by="importance", axis=0, ascending=True).plot(kind="barh", color="r")
    plt.xlabel("Feature Importances")
    plt.show()

In [14]:
def save_model(model,filename):
    """Pickle ``model`` to ``<filename>.pkl``.

    Parameters
    ----------
    model : object to serialize with ``pickle.dump``.
    filename : target path *without* extension; ``.pkl`` is appended.
    """
    target_path = '{0}.pkl'.format(filename)
    # Binary mode is required for pickle; the context manager closes the file.
    with open(target_path, 'wb') as handle:
        pickle.dump(model, handle)

In [16]:
def load_data(filename):
    """Load and return a pickled object (e.g. a saved model) from disk.

    Parameters
    ----------
    filename : path to the pickle file, opened exactly as given (no
        extension is appended here).

    Returns
    -------
    The unpickled object. Previously the object was loaded but never
    returned, so callers always received ``None``.

    Notes
    -----
    SECURITY: ``pickle.load`` can execute arbitrary code while
    deserializing; only use this on files from a trusted source.
    """
    with open(filename, 'rb') as file:  # load the model from disk
        pickle_model = pickle.load(file)
    return pickle_model