In [13]:
import numpy as np
from pandas import read_csv
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
np.set_printoptions(precision=3, suppress=True) 
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [9]:
attributes = read_csv('attributes.csv')
#print(attributes['attributes'])
data = read_csv('communities.data', names = attributes['attributes'])
# removing First five columns as they are not predictive data
data = data.values[:,5:]
data

array([[0.19, 0.33, 0.02, ..., 0.32, '0.14', 0.2],
       [0.0, 0.16, 0.12, ..., 0.0, '?', 0.67],
       [0.0, 0.42, 0.49, ..., 0.0, '?', 0.43],
       ...,
       [0.16, 0.37, 0.25, ..., 0.91, '0.28', 0.23],
       [0.08, 0.51, 0.06, ..., 0.22, '0.18', 0.19],
       [0.2, 0.78, 0.14, ..., 1.0, '0.13', 0.48]], dtype=object)

In [44]:
# Replacing the missing data with the column mean
data[data=='?'] = np.nan
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
imputer.fit(data)
data = imputer.transform(data)

  


In [63]:
X = data[:,:-1]
y = data[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Dimensionality Reduction using PCA
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)


In [15]:
class Communities_Crimes:
    
    def __init__(self):
        return
    
    def read_data(self):
        # reading data 
        attributes = read_csv('attributes.csv')
        data = read_csv('communities.data', names = attributes['attributes'])
        # removing First five columns as they are not predictive data
        data = data.values[:,5:]
        return data
    
    def preprocessing(self, data):
        # Replacing the missing data with the column mean
        data[data=='?'] = np.nan
        imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        imputer.fit(data)
        data = imputer.transform(data)
        
        # splitting data
        X = data[:,:-1]
        y = data[:,-1]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        
        #preprocessing using standard scaler
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, y_train, X_test, y_test
    
    def dim_Reduction(self, X_train, X_test):
        # Dimensionality Reduction using PCA from 123 dims to 20 dims
        pca = PCA(n_components = 20)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
        return X_train, X_test
        
    def cv_SVR(self, X, y):
        #scorer = make_scorer(neg_mean_squared_error)
        C_grid = [0.1, 1, 10]
        gamma_grid = np.logspace(-2, 1, 4)[0:3]
        svm = sklearn.svm.SVR(kernel='rbf')
        param_grid = { 'C' : C_grid, 'gamma' : gamma_grid, 'kernel' : ['rbf', 'sigmoid',  'linear']}
        gridcv = sklearn.model_selection.GridSearchCV(svm, param_grid, n_jobs=-1, verbose=1, cv=3, scoring = 'neg_mean_squared_error')
        gridcv.fit(X_train, y_train)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% neg mean squared error on validation sets (average)" % (gridcv.best_score_*100))
        return gridcv.best_params_
    
    def cv_DTR(self, X, y):
        dt = DecisionTreeRegressor()
        param_grid = {
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20)
        }
        return Communities_Crimes.randomCV(dt, X, y, param_grid, 400, 6)
        
    def cv_RandomForest(self, X, y):
        rf = RandomForestRegressor()
        param_grid = {
            "n_estimators" : [10*x for x in np.arange(1,50)],
            "min_samples_split" : np.random.random_sample((100,)),
            "min_samples_leaf" : np.arange(1,6),
            'max_depth': range(1, 20),
        }
        return Communities_Crimes.randomCV(rf, X, y, param_grid, 40, 6)
        
    def cv_adaBoost(self, X, y):
        #scorer = make_scorer(precision_score)
        ada_boost = AdaBoostRegressor(n_estimators=50, learning_rate=1)
        param_grid = {'n_estimators': range(1, 50), 'learning_rate': [0.1, 0.5, 1]}
        gridcv = sklearn.model_selection.GridSearchCV(ada_boost, param_grid, verbose=1, cv=3, n_jobs=-1, scoring='explained_variance')
        gridcv.fit(X, y)
        print("best parameters:", gridcv.best_params_)
        print("%.1f%% Variance on validation sets (average)" % (gridcv.best_score_))
        return gridcv.best_params_
    
    def cv_linReg(self, X, y):
        lr = LinearRegression()
        param_grid = {
            "fit_intercept" : [True, False],
        }
        return Communities_Crimes.randomCV(lr, X, y, param_grid, 40, 6)
        
    def cv_GP(self, X, y):
        clf = GaussianProcessRegressor()
        param_grid = {
            
        "normalize_y" : [True, False],
        "copy_X_train" : [True, False],
        "alpha" : np.linspace(0, 5, 100),
        }
        return Communities_Crimes.randomCV(clf, X, y, param_grid, 25, 6)
    
    def cv_NNRegressor(self, X, y):
        nn = sklearn.neural_network.MLPRegressor(hidden_layer_sizes=(50,),
                                           solver='sgd', batch_size=100, max_iter=10,
                                           learning_rate_init=.01, momentum=0.9, alpha=0.05,
                                           verbose=False, random_state=0)

        param_grid ={
                    'hidden_layer_sizes' : range(2,100),
                    "activation" : ['identity', 'logistic', 'tanh', 'relu']
                    }
        return Communities_Crimes.randomCV(nn, X, y, param_grid, 200, 6)
        
    def randomCV(clf, X, y, param_grid, n_iter, cv):
        #scorer = make_scorer(precision_score)
        random_search = RandomizedSearchCV(clf, param_distributions = param_grid, n_iter = n_iter, cv = cv, iid = False)
        #scoring = "explained_variance"
        random_search.fit(X, y)
        #print(random_search.cv_results_)
        Communities_Crimes.report(random_search.cv_results_)
        return random_search.best_params_
    
    def report(results, n_top=1):
        for i in range(1, n_top + 1):
            candidates = np.flatnonzero(results['rank_test_score'] == i)
            k = 0
            for candidate in candidates:                
                print("Model with rank: {0}".format(i))
                print("Variance on validation data: {0:.3f} (std: {1:.3f})".format(
                      results['mean_test_score'][candidate],
                      results['std_test_score'][candidate]))
                print("Parameters: {0}".format(results['params'][candidate]))
                print("")
                k += 1
                if k == 3:
                    break
                
    def predict(self, model, X_test, y_test):
        predict = model.predict(X_test)
        predict[predict<0] =0
        rmse = mean_squared_error(y_test, predict)
        print("RMSE on test data : ", rmse)

In [None]:
if __name__ == "__main__":
    obj = Communities_Crimes()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    X_train, X_test = obj.dim_Reduction(X_train, X_test)
    print('---------SVR--------')
    model = obj.cv_SVR(X_train, y_train)
    reg = sklearn.svm.SVR().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------DTR--------')
    model = obj.cv_DTR(X_train, y_train)
    reg = sklearn.tree.DecisionTreeRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Random Forrest Regressor--------')
    # taking more than 3 mins 
    model = obj.cv_RandomForest(X_train, y_train)
    reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Adaboost Regressor--------')
    model = obj.cv_adaBoost(X_train, y_train)
    reg = sklearn.ensemble.AdaBoostRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Gaussian Process Regressor--------')
    model = obj.cv_GP(X_train, y_train)
    reg = sklearn.gaussian_process.GaussianProcessRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------Linear Regressor--------')
    model = obj.cv_linReg(X_train, y_train)
    reg = LinearRegression().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)
    

---------SVR--------
Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  51 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done  66 out of  81 | elapsed:   22.2s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  1.6min finished


best parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'linear'}
-1.9% neg mean squared error on validation sets (average)
RMSE on test data :  0.022667618988030748
---------DTR--------
Model with rank: 1
Variance on validation data: 0.488 (std: 0.039)
Parameters: {'min_samples_split': 0.07826048262931806, 'min_samples_leaf': 2, 'max_depth': 13}

RMSE on test data :  0.029204101055802303
---------Random Forrest Regressor--------
Model with rank: 1
Variance on validation data: 0.578 (std: 0.014)
Parameters: {'n_estimators': 480, 'min_samples_split': 0.06602368760607102, 'min_samples_leaf': 5, 'max_depth': 12}

RMSE on test data :  0.02415377575208042
---------Adaboost Regressor--------
Fitting 3 folds for each of 147 candidates, totalling 441 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 441 out of 441 | elapsed:   27.7s finished


best parameters: {'learning_rate': 0.5, 'n_estimators': 31}
0.6% Variance on validation sets (average)
RMSE on test data :  0.027489934168301894
---------Gaussian Process Regressor--------


In [153]:
if __name__ == "__main__":
    obj = Communities_Crimes()
    data = obj.read_data()
    X_train, y_train, X_test, y_test = obj.preprocessing(data)
    X_train, X_test = obj.dim_Reduction(X_train, X_test)
    print('---------NN Regressor--------')
    model = obj.cv_NNRegressor(X_train, y_train)
    reg = MLPRegressor().set_params(**model).fit(X_train, y_train)
    obj.predict(reg, X_test, y_test)

---------NN Regressor--------
Model with rank: 1
Variance on validation data: 0.630 (std: 0.017)
Parameters: {'hidden_layer_sizes': 24, 'activation': 'identity'}

MSE on test data :  0.022278340738049872


In [130]:
reg = sklearn.ensemble.RandomForestRegressor().set_params(**model).fit(X_train, y_train)
obj.predict(reg, X_test, y_test)

MSE on test data :  0.022937713000135338
