In [1]:
import pandas as pd 
import numpy as np                     # For mathematical calculations 
import seaborn as sns                  # For data visualization 
import matplotlib.pyplot as plt        # For plotting graphs 
%matplotlib inline 
import warnings   # To ignore any warnings 
warnings.filterwarnings("ignore")

In [2]:
train = pd.read_excel('consumptionstatemon.xls')

In [3]:
train

Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE (UNITS),CONSUMPTION
0,2001,1,AK,Total Electric Power Industry,Coal (Short Tons),47615.00
1,2001,1,AK,Total Electric Power Industry,Petroleum (Barrels),124998.00
2,2001,1,AK,Total Electric Power Industry,Natural Gas (Mcf),3941267.00
3,2001,1,AK,"Electric Generators, Electric Utilities",Coal (Short Tons),16535.00
4,2001,1,AK,"Electric Generators, Electric Utilities",Petroleum (Barrels),114198.00
...,...,...,...,...,...,...
64991,2007,12,US-TOTAL,"Combined Heat and Power, Commercial Power",Other Gases (Billion BTU),0.00
64992,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Coal (Short Tons),395118.00
64993,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Petroleum (Barrels),400250.00
64994,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Natural Gas (Mcf),46540109.00


In [4]:
x = train.drop(['CONSUMPTION'],axis = 1)
y = train['CONSUMPTION']

In [5]:
x

Unnamed: 0,YEAR,MONTH,STATE,TYPE OF PRODUCER,ENERGY SOURCE (UNITS)
0,2001,1,AK,Total Electric Power Industry,Coal (Short Tons)
1,2001,1,AK,Total Electric Power Industry,Petroleum (Barrels)
2,2001,1,AK,Total Electric Power Industry,Natural Gas (Mcf)
3,2001,1,AK,"Electric Generators, Electric Utilities",Coal (Short Tons)
4,2001,1,AK,"Electric Generators, Electric Utilities",Petroleum (Barrels)
...,...,...,...,...,...
64991,2007,12,US-TOTAL,"Combined Heat and Power, Commercial Power",Other Gases (Billion BTU)
64992,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Coal (Short Tons)
64993,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Petroleum (Barrels)
64994,2007,12,US-TOTAL,"Combined Heat and Power, Industrial Power",Natural Gas (Mcf)


In [6]:
train.isnull().sum() 
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64996 entries, 0 to 64995
Data columns (total 6 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   YEAR                                64996 non-null  int64  
 1   MONTH                               64996 non-null  int64  
 2   STATE                               64996 non-null  object 
 3   TYPE OF PRODUCER                    64996 non-null  object 
 4   ENERGY SOURCE              (UNITS)  64996 non-null  object 
 5   CONSUMPTION                         64996 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 3.0+ MB


In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
class MultiColumnLabelEncoder(LabelEncoder):
    """
    Wraps sklearn LabelEncoder functionality for use on multiple columns of a
    pandas dataframe.

    """
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, dframe):
        """
        Fit label encoder to pandas columns.

        Access individual column classes via indexig `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`
        """
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            for idx, column in enumerate(self.columns):
                # fit LabelEncoder to get `classes_` for the column
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                # append this column's encoder
                self.all_encoders_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.iloc[:, :].columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                le.fit(dframe.loc[:, column].values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
        return self

    def fit_transform(self, dframe):
        """
        Fit label encoder and return encoded labels.

        Access individual column classes via indexing
        `self.all_classes_`

        Access individual column encoders via indexing
        `self.all_encoders_`

        Access individual column encoded labels via indexing
        `self.all_labels_`
        """
        # if columns are provided, iterate through and get `classes_`
        if self.columns is not None:
            # ndarray to hold LabelEncoder().classes_ for each
            # column; should match the shape of specified `columns`
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            self.all_encoders_ = np.ndarray(shape=self.columns.shape,
                                            dtype=object)
            self.all_labels_ = np.ndarray(shape=self.columns.shape,
                                          dtype=object)
            for idx, column in enumerate(self.columns):
                # instantiate LabelEncoder
                le = LabelEncoder()
                # fit and transform labels in the column
                dframe.loc[:, column] =\
                    le.fit_transform(dframe.loc[:, column].values)
                # append the `classes_` to our ndarray container
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
                self.all_labels_[idx] = le
        else:
            # no columns specified; assume all are to be encoded
            self.columns = dframe.iloc[:, :].columns
            self.all_classes_ = np.ndarray(shape=self.columns.shape,
                                           dtype=object)
            for idx, column in enumerate(self.columns):
                le = LabelEncoder()
                dframe.loc[:, column] = le.fit_transform(
                        dframe.loc[:, column].values)
                self.all_classes_[idx] = (column,
                                          np.array(le.classes_.tolist(),
                                                  dtype=object))
                self.all_encoders_[idx] = le
        return dframe.loc[:, self.columns].values

    def transform(self, dframe):
        """
        Transform labels to normalized encoding.
        """
        if self.columns is not None:
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[
                    idx].transform(dframe.loc[:, column].values)
        else:
            self.columns = dframe.iloc[:, :].columns
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .transform(dframe.loc[:, column].values)
        return dframe.loc[:, self.columns].values

    def inverse_transform(self, dframe):
        """
        Transform labels back to original encoding.
        """
        if self.columns is not None:
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .inverse_transform(dframe.loc[:, column].values)
        else:
            self.columns = dframe.iloc[:, :].columns
            for idx, column in enumerate(self.columns):
                dframe.loc[:, column] = self.all_encoders_[idx]\
                    .inverse_transform(dframe.loc[:, column].values)
        return dframe.loc[:, self.columns].values

In [9]:
x1 = x.iloc[:, :].select_dtypes(include=['object']).columns
mcle = MultiColumnLabelEncoder(columns=x1)
mcle

MultiColumnLabelEncoder(columns=Index(['STATE', 'TYPE OF PRODUCER', 'ENERGY SOURCE              (UNITS)'], dtype='object'))

In [10]:
mcle.fit(x)

MultiColumnLabelEncoder(columns=Index(['STATE', 'TYPE OF PRODUCER', 'ENERGY SOURCE              (UNITS)'], dtype='object'))

In [11]:
x1 = mcle.transform(x)

In [12]:
import pickle 

file = open('label_encodings2', 'wb')

# dump information to that file
pickle.dump(mcle, file)

# close the file
file.close()

In [13]:
encoded_features = np.concatenate((x.select_dtypes(['int64']).values,x1),axis=1)
encoded_features

array([[2001,    1,    0,    5,    0],
       [2001,    1,    0,    5,    3],
       [2001,    1,    0,    5,    1],
       ...,
       [2007,   12,   44,    2,    3],
       [2007,   12,   44,    2,    1],
       [2007,   12,   44,    2,    2]], dtype=int64)

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(encoded_features,y,test_size=.30)

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
clf = RandomForestRegressor() 

In [16]:
param = {'bootstrap': [True, False],
'max_depth': [5,10, None],
'min_samples_leaf': [1, 2, 4],
'min_samples_split': [2, 5, 10],
'n_estimators': [50,100]} 

In [17]:
grid_search = GridSearchCV(clf,param_grid = param,cv = 5) 

In [18]:
random_search = RandomizedSearchCV(clf, param_distributions=param,
                                  n_iter=10, cv=5) 

In [19]:
regr = RandomForestRegressor(bootstrap= True,
 max_depth= None,
 min_samples_leaf= 1,
 min_samples_split= 2,
 n_estimators= 20)

In [20]:
regr.fit(x_train,y_train) 

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=20, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [21]:
file = open('random_forest_model2', 'wb')
pickle.dump(regr, file)
file.close()
loaded_model1 = pickle.load(open('random_forest_model2', 'rb'))
result = loaded_model1.predict(x_test)
print(result) 

[  54449.256        0.     2606412.4    ...   57459.1485   73090.703
       0.    ]


In [22]:
regr.score(x_train,y_train) ,regr.score(x_test,y_test) 

(0.9981528797059139, 0.9822697209230321)

In [23]:
pred = regr.predict(x_test)

In [24]:
pred[5:10] 

array([ 37991.966 ,  63368.7615, 308291.6755, 199328.75  ,  12272.324 ])

In [25]:
pred1 = regr.predict(np.array([[ 425.6   ,   8070.5828, 320005.5162,   4557.7218,   9870.62]])) 
pred1

array([5545.807])