In [28]:
#Importing Libraries

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import joblib

print('Libraries Imported')

Libraries Imported


In [29]:
#creating the data set without any header
dataset = pd.read_csv('asdrp_factors_aj.csv', header = None)
print (dataset)

      0   1       2         3     4       5       6         7       8       9
0   5.9  40  10.523  1761.768  86.0  107000  25.629  3188.474  629516    High
1   4.5  58  23.643   820.312  86.1   30225  21.044  8002.078  890194  Medium
2   4.8  58  40.786  1901.096  84.0   33924  30.033  9373.749  234951    High
3   3.8  20  78.671  1401.421  75.4   30438  15.278  5244.755  344900  Medium
4   2.7  35  32.000  2358.754  81.0   31821   9.846   332.923  450000  Medium
..  ...  ..     ...       ...   ...     ...     ...       ...     ...     ...
75  3.9   1  14.178  1360.890  76.7   28172   9.452  3076.559  502192     Low
76  3.3  45   4.745  1240.800  98.0   30477  14.234  3999.791  589428  Medium
77  3.3  29  15.191  2275.900  89.4   26564  33.446   627.896  145324    High
78  2.9  17  85.468  1954.260  68.2   23797  52.230   180.431  150100     Low
79  4.8  32  14.369  1806.120  84.0   26985   4.790  1230.973  186204  Medium

[80 rows x 10 columns]


In [30]:
#re-adding the columns but with spaces
dataset.columns = ['Unemployment Rate', 'Air Quality','Number of Hospitals','Number of Schools','Hs Grad Rate','Individual Income', 'Number of libraries', 'Number of Restauarants', 'House Cost' , 'Rating']
print('Shape of the dataset: ' + str(dataset.shape))
print(dataset.head())

Shape of the dataset: (80, 10)
   Unemployment Rate  Air Quality  Number of Hospitals  Number of Schools  \
0                5.9           40               10.523           1761.768   
1                4.5           58               23.643            820.312   
2                4.8           58               40.786           1901.096   
3                3.8           20               78.671           1401.421   
4                2.7           35               32.000           2358.754   

   Hs Grad Rate  Individual Income  Number of libraries  \
0          86.0             107000               25.629   
1          86.1              30225               21.044   
2          84.0              33924               30.033   
3          75.4              30438               15.278   
4          81.0              31821                9.846   

   Number of Restauarants  House Cost  Rating  
0                3188.474      629516    High  
1                8002.078      890194  Medium  
2      

In [31]:
#Creating the dependent variable class
factor = pd.factorize(dataset['Rating'])
dataset.Rating = factor[0]
definitions = factor[1]
print(dataset.Rating.head())
print(definitions)

0    0
1    1
2    0
3    1
4    1
Name: Rating, dtype: int64
Index(['High', 'Medium', 'Low'], dtype='object')


In [32]:
X = dataset.iloc[:,0:9].values
y = dataset.iloc[:,9].values
print('The independent features set: ')
print(X[:10,:])
print('The dependent variable: ')
print(y[:10])

The independent features set: 
[[5.900000e+00 4.000000e+01 1.052300e+01 1.761768e+03 8.600000e+01
  1.070000e+05 2.562900e+01 3.188474e+03 6.295160e+05]
 [4.500000e+00 5.800000e+01 2.364300e+01 8.203120e+02 8.610000e+01
  3.022500e+04 2.104400e+01 8.002078e+03 8.901940e+05]
 [4.800000e+00 5.800000e+01 4.078600e+01 1.901096e+03 8.400000e+01
  3.392400e+04 3.003300e+01 9.373749e+03 2.349510e+05]
 [3.800000e+00 2.000000e+01 7.867100e+01 1.401421e+03 7.540000e+01
  3.043800e+04 1.527800e+01 5.244755e+03 3.449000e+05]
 [2.700000e+00 3.500000e+01 3.200000e+01 2.358754e+03 8.100000e+01
  3.182100e+04 9.846000e+00 3.329230e+02 4.500000e+05]
 [4.200000e+00 7.500000e+01 3.870600e+01 1.748435e+03 7.500000e+01
  2.733100e+04 3.172600e+01 8.008249e+03 2.500000e+05]
 [3.200000e+00 6.800000e+01 1.033100e+01 2.161478e+03 8.070000e+01
  2.693400e+04 1.997200e+01 1.377410e+02 2.577750e+05]
 [2.700000e+00 4.000000e+00 1.302500e+01 4.697550e+02 8.700000e+01
  3.850300e+04 2.532700e+01 5.065123e+03 7.97000

In [33]:
# Creating the Training and Test set from data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

In [34]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

In [35]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
#Reverse factorize (converting y_pred from 0s,1s and 2s to high, medium, low
reversefactor = dict(zip(range(3),definitions))
y_test = np.vectorize(reversefactor.get)(y_test)
y_pred = np.vectorize(reversefactor.get)(y_pred)
# Making the Confusion Matrix
print(pd.crosstab(y_test, y_pred, rownames=['Actual Quality of Life'], colnames=['Predicted Quality of Life']))

Predicted Quality of Life  High  Low  Medium
Actual Quality of Life                      
High                         11    1       0
Medium                        5    0       3


In [36]:
print(list(zip(dataset.columns[0:4], classifier.feature_importances_)))
joblib.dump(classifier, 'randomforestmodel.pkl') 

[('Unemployment Rate', 0.09571463426371997), ('Air Quality', 0.07538945541293877), ('Number of Hospitals', 0.08403512357612113), ('Number of Schools', 0.15741429310520275)]


['randomforestmodel.pkl']