In [55]:
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from io import StringIO
import scipy
import scipy.stats               # For reciprocal distribution
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn import svm
import sklearn.tree        # For DecisionTreeClassifier class
import sklearn.ensemble    # For RandomForestClassifier class
import sklearn.linear_model # For Logistic Classifier
#from sklearn.neighbors import LSHForest
import sklearn.naive_bayes #For Naive Bayes
import sklearn.neural_network #For MLP classifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
np.set_printoptions(precision=20, suppress=True)

class Adult:
    #Removing Race and Sex Variable form data for fairness purpose
    def preprocess_data_OH(self,X,X_num,y):
        X= X.tolist()
        #print(X[0])
        workclass = ['Private','Self-emp-not-inc','Self-emp-inc','Federal-gov','Local-gov','State-gov','Without-pay','Never-worked']
        education = ['Bachelors','Some-college','11th','HS-grad','Prof-school','Assoc-acdm','Assoc-voc','9th','7th-8th','12th','Masters','1st-4th','10th','Doctorate','5th-6th','Preschool']
        marital_status = ['Married-civ-spouse','Divorced','Never-married','Separated','Widowed','Married-spouse-absent','Married-AF-spouse']
        occupation = ['Tech-support','Craft-repair','Other-service','Sales','Exec-managerial','Prof-specialty','Handlers-cleaners','Machine-op-inspct','Adm-clerical','Farming-fishing','Transport-moving','Priv-house-serv','Protective-serv','Armed-Forces']
        relationship = ['Wife','Own-child','Husband','Not-in-family','Other-relative','Unmarried']
        native_country = ['United-States','Cambodia','England','Puerto-Rico','Canada','Germany','Outlying-US(Guam-USVI-etc)','India','Japan','Greece','South','China','Cuba','Iran','Honduras','Philippines','Italy','Poland','Jamaica','Vietnam','Mexico','Portugal','Ireland','France','Dominican-Republic','Laos','Ecuador','Taiwan','Haiti','Columbia','Hungary','Guatemala','Nicaragua','Scotland','Thailand','Yugoslavia','El-Salvador','Trinadad&Tobago','Peru','Hong','Holand-Netherlands']
        
        encoder = preprocessing.OneHotEncoder(categories=[workclass, education, marital_status,occupation,relationship,native_country],
                                              handle_unknown='ignore',sparse=False)
        print(encoder.fit(X))
        X = encoder.transform(X)
        X = np.column_stack((X_num,X)) #Combine numeric and encoded string input data
        y[y=='<=50K']= 0
        y[y=='>50K'] = 1
        y[y=='<=50K.']= 0
        y[y=='>50K.'] = 1
        y = y.astype(int)
        return (X,y)
    
    def scale_data(self,X):
        scaler = preprocessing.StandardScaler(with_mean = False).fit(X)
        X = scaler.transform(X)
        return(X,scaler)
    
    def preprocess_data_OE(self,X,X_num,y):
        X = X.tolist()
        encoder = preprocessing.OrdinalEncoder()
        encoder.fit(X)
        X = encoder.transform(X)
        X = np.column_stack((X_num,X))
        y[y=='<=50K.']= 0
        y[y=='<=50K.']= 0
        y[y=='>50K.'] = 1
        y[y=='<=50K.']= 0
        y = y.astype(int)
        return(X,y)
    
    def __init__(self):
        #write your actual run code
        pass
    def random_CV(self,clf,X,y,param_grid,n_iter,cv):
        scorer_AUROC = make_scorer(sklearn.metrics.roc_auc_score) #using ROC for scoring criteria
        print("Starting search")
        print("Score mechanism implemented by RandomizedCV - AUROC score")
        random_search = model_selection.RandomizedSearchCV(clf, param_distributions = param_grid,n_iter = n_iter, cv = cv,
                                           iid = False,verbose=1,scoring = scorer_AUROC,n_jobs = 4)
        random_search.fit(X, y)
        print("best parameters:", random_search.best_params_)
        print("%.1f%% accuracy on validation sets (average)" % (random_search.best_score_*100))
        return random_search.best_params_
    
    def KNN(self,X,y):
        print("Starting KNN classification- Expected to take about 5 mins as its a hude data set")
       
       # X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
        knn_clf = KNeighborsClassifier()
        param_dist = {'n_neighbors': range(1,100),
                     "algorithm" : ['ball_tree', 'kd_tree'],
                    "weights" : ['uniform', 'distance'],
                    "leaf_size" : range(1,100)}
        print("Calling random_cv")
        return self.random_CV(knn_clf,X,y,param_dist,4,3)
        
        
    def SVM_clf(self,X,y):
        print("Starting SVM classification")
        #svm_clf = svm.SVC()
        svm_clf = svm.LinearSVC()
        param_dist = {
            'C'     : scipy.stats.reciprocal(1.0, 1000.),
            #'penalty': ['l1','l2'],
            'dual': [True,False],
            'max_iter' : np.arange(2000,10000,200)
            }
        return self.random_CV(svm_clf,X,y,param_dist,10,3)
        
    def DT_clf(self,X,y):
        tree_clf = sklearn.tree.DecisionTreeClassifier()
        param_dist = {
            "criterion" : ['gini', 'entropy'],
            "splitter" : ['best', 'random'],
            "max_depth" :[None,500,750,1000,1500,2000],
            "max_features": ["sqrt","log2",None]
        }
        return self.random_CV(tree_clf,X,y,param_dist,15,5)
    
    def RF_clf(self,X,y):
        print("Random Forest Classifier Called")
        rf_clf = sklearn.ensemble.RandomForestClassifier()
        param_dist = {
            "n_estimators" : [10,25,50,75,100,125,150,175,200],
            "max_depth" :[None,500,750,1000,1500,2000],
            "criterion" : ['gini', 'entropy'],
            "max_features": ["sqrt","log2",None],
            "n_jobs": [4],
            "warm_start" : [True]
           # "bootstrap": [True,False]
        }
        return self.random_CV(rf_clf,X,y,param_dist,12,3)
        
        
    def ADB_clf(self,X,y):
        print("AdaBoost Classifier Called")
        adb_clf = sklearn.ensemble.AdaBoostClassifier()
        param_dist = {
            "n_estimators" : [50,100,150,200,300,500,750,1000],
            "algorithm" : ['SAMME', 'SAMME.R'],
            "random_state" : [0]
        }
        return self.random_CV(adb_clf,X,y,param_dist,5,5)
        
    
    def LR_clf(self,X,y):
        print("Logistic Reg Classifier Called")
        lr_clf = sklearn.linear_model.LogisticRegression()
        param_dist = {
            "fit_intercept" : [True, False],
            'C'     : scipy.stats.reciprocal(1.0, 1000.),
            "solver" : ['lbfgs','sag','saga','newton-cg'],
            "penalty" : ['l2'],
            'max_iter' : np.arange(500,10000,500)
        }
        return self.random_CV(lr_clf,X,y,param_dist,15,3)
        
    
    def NB_clf(self,X,y):
        print("Naive Bayes Classifier called")
        nb_clf = sklearn.naive_bayes.GaussianNB()
        param_dist = {
            "priors": [None,[0.5,0.5],[0.6,0.4],[0.4,0.6],[0.3,0.7],[0.7,0.3]]
        }
        return self.random_CV(nb_clf,X,y,param_dist,5,5)

    
    def MLP_clf(self,X,y):
        print("Neural Network / MLP Classifier called")
        mlp_clf = sklearn.neural_network.MLPClassifier()
        param_dist = {
            "hidden_layer_sizes" : [(100,), (200,),(100,50),(200,50),(200,100),(100,100,50)],
            "solver" : ['sgd'],
            "learning_rate" : ['constant','invscaling'],
            "max_iter" : [200,300,500],
            "warm_start" : [True],
            "activation" :['tanh', 'relu']
        }
        return self.random_CV(mlp_clf,X,y,param_dist,6,3)
    
    def train_clf(self,clf,params,X,y):
        clf.set_params(**params)
        clf.fit(X,y)
        print("Complete Training Accuracy")
        print(clf.score(X,y))
        return clf
        
        
    def start(self):
        print("******Classification of Adult Data Set Begins ******")
        file = 'adult.data'
        f = open(file,"r")
        c = StringIO(f.read())
        print("READING TRAIN DATA")
        # Ignoring Race and Sex Attributes
        
        #X_inp = np.loadtxt(c, delimiter = ",",usecols =(0,1,2,3,4,5,6,7,10,11,12,13,14),dtype = {'names':('age','workclass','fnlwgt',
        #                                                                                                    'education','education-num',
        #                                                                                                    'marital-status','occupation',
        #                                                                                                   'relationship','capital-gain',
        #                                                                                                    'capital-loss','hours-per-week',
        #                                                                                                    'native-country'),
        #                                                                                           'formats':(np.float,'|S25',np.float,
         #                                                                                                    '|S25', np.float, '|S25',
         #                                                                                                    '|S25', '|S25', np.float,
          #                                                                                                   np.float,np.float,'|S25')})
        X_string  = np.char.strip(np.genfromtxt(c,dtype='str',delimiter = ',',usecols = (1,3,5,6,7,13,14)))
        X_float = np.loadtxt(file,delimiter = ",",usecols = (0,2,4,10,11,12), dtype = np.float).astype(int)
        
        print("Pre Processing Train data with One Hot Encoding")
        (X,y) = self.preprocess_data_OH(X_string[:,:-1],X_float,X_string[:,-1])
        
        
        print("Normalizing data with Standard Scaler")
        (X,scaler) = self.scale_data(X)
        
        #Reducing dimensions to consider first 50 Principle Components based on explained_variance_ratio scores
        print("Reducing dimensionality to improve classifier run time")
        pca = PCA(n_components = 50)
        X_rd = pca.fit_transform(X)
        
        #print(X_rd[0:5])
        #print(pca.explained_variance_ratio_)
        
        print("-- Training KNN --")
        knn_clf = self.train_clf(KNeighborsClassifier(),self.KNN(X_rd,y),X_rd,y) #used PCA reduced data
        
        #Calling SVM Classifier using unreduced data dimensions
        print("--Training SVM ")
        svm_clf = self.train_clf(svm.LinearSVC(),self.SVM_clf(X,y),X,y)
        
        print(" Preprocessing with Ordinal Encoding for Decison Tree Algorithms")
        
        (X_oe,y_oe) = self.preprocess_data_OE(X_string[:,:-1],X_float,X_string[:,-1]) # usine Ordinal Encoding for Decision tree algorithms
        
        print("--Training Decision Trees--")
        dt_clf = self.train_clf(sklearn.tree.DecisionTreeClassifier(),self.DT_clf(X_oe,y_oe),X_oe,y_oe)
        
        print("--Training Random Forests--")
        rf_clf = self.train_clf(sklearn.ensemble.RandomForestClassifier(),self.RF_clf(X_oe,y_oe),X_oe,y_oe)
        
        print("--Training AdaBoost-- ")
        adb_clf = self.train_clf(sklearn.ensemble.AdaBoostClassifier(),self.ADB_clf(X_oe,y_oe),X_oe,y_oe)
        
        print("--Training Logistic Reg--")
        lr_clf = self.train_clf(sklearn.linear_model.LogisticRegression(),self.LR_clf(X_rd,y),X_rd,y)
        
        print("--Training Naive Bayes-- ")
        nb_clf = self.train_clf(sklearn.naive_bayes.GaussianNB(),self.NB_clf(X_rd,y),X_rd,y)
        
      #  print("--Training Neural Networks/MLP--")
       # mlp_clf = self.train_clf(sklearn.neural_network.MLPClassifier(),self.MLP_clf(X_rd,y),X_rd,y)
        
        print("--READING TEST DATA")
        
        file = 'adult.test'
        f = open(file,"r")
        c = StringIO(f.read())
        
        
        X_string  = np.char.strip(np.genfromtxt(c,dtype='str',delimiter = ',',usecols = (1,3,5,6,7,13,14),skip_header=1))
        X_float = np.loadtxt(file,delimiter = ",",usecols = (0,2,4,10,11,12), dtype = np.float,skiprows=1).astype(int)
        
        print("One Hot Encoding of Data")
        (X_test,y_test) = self.preprocess_data_OH(X_string[:,:-1],X_float,X_string[:,-1])
        
        X_test = scaler.transform(X_test)
        
        (X_test_oe,y_test_oe) = self.preprocess_data_OE(X_string[:,:-1],X_float,X_string[:,-1])
        
        print("Reducing dimensionality to improve classifier run time")
        pca = PCA(n_components = 50)
        X_test_rd = pca.fit_transform(X_test)
        
        print("**Test Data Prediction Begins*")
        
        print("Testing KNN Classifier")
        print(knn_clf.score(X_test_rd,y_test))
        
        print("Testing SVM Classifier")
        print(svm_clf.score(X_test,y_test))
        
        print("Testing Decision Trees")
        print(dt_clf.score(X_test_oe,y_test_oe)) #using ordinal encoding
        
        print("Testing Random Forests")
        print(rf_clf.score(X_test_oe,y_test_oe))
        
        print("Testing Adaboost")
        print(adb_clf.score(X_test_oe,y_test_oe))
        
        print("Testing Logistic Regression")
        print(lr_clf.score(X_test_rd,y_test))
        
        print("Testing Naive Bayes")
        print(nb_clf.score(X_test_rd,y_test))
        
       # print("Testing Neural Network/MLP")
        #print(mlp_clf.score(X_test_rd,y_test))
        
        return
    
adult = Adult()
adult.start()
        


******Classification of Adult Data Set Begins ******
READING TRAIN DATA
Pre Processing Train data with One Hot Encoding
OneHotEncoder(categorical_features=None,
              categories=[['Private', 'Self-emp-not-inc', 'Self-emp-inc',
                           'Federal-gov', 'Local-gov', 'State-gov',
                           'Without-pay', 'Never-worked'],
                          ['Bachelors', 'Some-college', '11th', 'HS-grad',
                           'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
                           '7th-8th', '12th', 'Masters', '1st-4th', '10th',
                           'Doctorate', '5th-6th', 'Preschool'],
                          ['Married-civ-sp...
                           'Puerto-Rico', 'Canada', 'Germany',
                           'Outlying-US(Guam-USVI-etc)', 'India', 'Japan',
                           'Greece', 'South', 'China', 'Cuba', 'Iran',
                           'Honduras', 'Philippines', 'Italy', 'Poland',
                  

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  12 out of  12 | elapsed:  3.6min finished


best parameters: {'weights': 'distance', 'n_neighbors': 29, 'leaf_size': 65, 'algorithm': 'kd_tree'}
73.6% accuracy on validation sets (average)
Complete Training Accuracy
0.9999692884125181
--Training SVM 
Starting SVM classification
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  1.5min finished


best parameters: {'C': 3.887126993208802, 'dual': True, 'max_iter': 3200}
76.5% accuracy on validation sets (average)




Complete Training Accuracy
0.8521237062743773
 Preprocessing with Ordinal Encoding for Decison Tree Algorithms
--Training Decision Trees--
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    2.6s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    4.6s finished


best parameters: {'splitter': 'best', 'max_features': None, 'max_depth': 500, 'criterion': 'entropy'}
74.9% accuracy on validation sets (average)
Complete Training Accuracy
0.9999692884125181
--Training Random Forests--
Random Forest Classifier Called
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed:  1.1min finished


best parameters: {'warm_start': True, 'n_jobs': 4, 'n_estimators': 175, 'max_features': 'log2', 'max_depth': 500, 'criterion': 'entropy'}
77.9% accuracy on validation sets (average)
Complete Training Accuracy
0.9999692884125181
--Training AdaBoost-- 
AdaBoost Classifier Called
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.2min
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  4.0min finished


best parameters: {'random_state': 0, 'n_estimators': 750, 'algorithm': 'SAMME.R'}
79.0% accuracy on validation sets (average)
Complete Training Accuracy
0.8727311814747705
--Training Logistic Reg--
Logistic Reg Classifier Called
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 3 folds for each of 15 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:   12.0s finished


best parameters: {'C': 15.308517378672184, 'fit_intercept': False, 'max_iter': 9000, 'penalty': 'l2', 'solver': 'newton-cg'}
78.6% accuracy on validation sets (average)
Complete Training Accuracy
0.7192346672399497
--Training Naive Bayes-- 
Naive Bayes Classifier called
Starting search
Score mechanism implemented by RandomizedCV - AUROC score
Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    1.3s finished


best parameters: {'priors': None}
74.8% accuracy on validation sets (average)
Complete Training Accuracy
0.6838856300482172
--READING TEST DATA
One Hot Encoding of Data
OneHotEncoder(categorical_features=None,
              categories=[['Private', 'Self-emp-not-inc', 'Self-emp-inc',
                           'Federal-gov', 'Local-gov', 'State-gov',
                           'Without-pay', 'Never-worked'],
                          ['Bachelors', 'Some-college', '11th', 'HS-grad',
                           'Prof-school', 'Assoc-acdm', 'Assoc-voc', '9th',
                           '7th-8th', '12th', 'Masters', '1st-4th', '10th',
                           'Doctorate', '5th-6th', 'Preschool'],
                          ['Married-civ-sp...
                           'Puerto-Rico', 'Canada', 'Germany',
                           'Outlying-US(Guam-USVI-etc)', 'India', 'Japan',
                           'Greece', 'South', 'China', 'Cuba', 'Iran',
                           'Honduras', 'Ph

In [6]:
import numpy as np
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from io import StringIO
import scipy
import scipy.stats               # For reciprocal distribution
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn import svm
import sklearn.tree        # For DecisionTreeClassifier class
import sklearn.ensemble    # For RandomForestClassifier class
import sklearn.linear_model # For Logistic Classifier
#from sklearn.neighbors import LSHForest
import sklearn.naive_bayes #For Naive Bayes
import sklearn.neural_network #For MLP classifier
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)  # Ignore sklearn deprecation warnings
warnings.filterwarnings("ignore", category=FutureWarning)       # Ignore sklearn deprecation warnings
np.set_printoptions(precision=20, suppress=True)

class credit_card_defaults:
    
    def __init__(self):
        pass
    def start(self):
        print("******Classification of Credit card Default Data Set Begins ******")
        file = 'default of credit card clients.xls'
        f = open(file,"r")
        c = StringIO(f.read())
        print("READING TRAIN DATA")
        
        X_string  = np.char.strip(np.gen fromtxt(file,dtype='str'))#,delimiter = ',',usecols = (1,3,5,6,7,13,14)))
            
        print(X_string[0])
        return
    
obj = credit_card_defaults()
obj.start()
    
                                


******Classification of Credit card Default Data Set Begins ******


UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 750: character maps to <undefined>