# In [2]:
import os, sys, copy, time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from loguru import logger
from sklearn import preprocessing
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, auc, log_loss, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier

class Analysis():
    """Exploration, preprocessing and baseline binary classifiers for one table.

    The working table is kept in ``self.data`` (a pandas DataFrame). The
    preprocessing methods (``drop_features``, ``encode_category``,
    ``normalize``) replace or mutate ``self.data``; the split methods only
    read it. Every classifier method fits a model and delegates evaluation to
    ``plot``, which prints metrics and returns ``(accuracy, f1)``.
    """

    def __init__(self, data=None):
        """Create an analysis, optionally seeded with an in-memory DataFrame.

        Args:
            data: DataFrame to work on, or None when the table will be loaded
                later through ``read_display``.
        """
        self.data = data

    @staticmethod
    def _as_list(names):
        """Return ``names`` as a list, treating a bare string as one column."""
        if isinstance(names, str):
            return [names]
        return list(names)

    def read_display(self, csvfilepath):
        """Load a CSV into ``self.data`` and return its first ten rows.

        Args:
            csvfilepath: Anything ``pandas.read_csv`` accepts as a source.

        Returns:
            The first ten rows of the loaded table.

        Raises:
            FileNotFoundError: Propagated from ``pandas.read_csv`` after the
                problem has been logged; ``self.data`` is left untouched.
        """
        # Let pandas decide whether the source is readable instead of probing
        # with os.path.exists, which reports False for remote sources that
        # read_csv can still open.
        try:
            data = pd.read_csv(csvfilepath)
        except FileNotFoundError:
            logger.error('CSV path not found')
            raise
        self.data = data
        return data.head(10)

    def describe(self):
        """Return ``self.data.describe()``."""
        return self.data.describe()

    def nullval(self):
        """Return the number of null values in each column of ``self.data``."""
        return self.data.isnull().sum()

    def unique(self, count = True, column = None):
        """Show a bar chart of value frequencies for one column.

        Args:
            count: When true, also return the column's ``value_counts()``.
            column: Name of the column to inspect.

        Returns:
            The ``value_counts()`` Series when ``count`` is true, else None.
        """
        values, counts = np.unique(self.data[column].values, return_counts=True)

        plt.bar(values, counts)
        plt.title('Class Frequency')
        plt.xlabel('Class')
        plt.ylabel('Frequency')
        plt.show()

        if count:
            return self.data[column].value_counts()
        return None

    def correlation(self):
        """Return a colour-graded Styler of the pairwise correlation matrix.

        Only numeric and boolean columns take part, so a table that still
        holds text columns can be inspected before encoding.
        """
        numeric = self.data.select_dtypes(include=['number', 'bool'])
        styler = numeric.corr().style.background_gradient(cmap='coolwarm')
        # Styler.set_precision was removed in pandas 2.0; format(precision=...)
        # is its replacement. Fall back for pandas versions that predate it.
        try:
            return styler.format(precision=3)
        except TypeError:
            return styler.set_precision(3)

    def drop_features(self, features_to_drop):
        """Drop the named columns from ``self.data``.

        Args:
            features_to_drop: A column name or an iterable of column names.

        Returns:
            The updated ``self.data``.

        Raises:
            KeyError: If a named column does not exist.
        """
        # `columns=` replaces the positional axis argument, which pandas 2.0
        # no longer accepts.
        self.data = self.data.drop(columns=self._as_list(features_to_drop))
        return self.data

    def encode_category(self, features_to_encode):
        """Replace object-dtype columns with integer category codes.

        Columns whose dtype is not ``object`` are left unchanged and reported
        at debug level.

        Args:
            features_to_encode: A column name or an iterable of column names.

        Returns:
            The updated ``self.data``.
        """
        for f in self._as_list(features_to_encode):
            if self.data.dtypes[f] == 'object':
                self.data[f] = self.data[f].astype('category').cat.codes
            else:
                logger.debug('{} is not a categorical variable'.format(f))
        return self.data

    def normalize(self, features_to_normalize):
        """Min-max scale the named columns of ``self.data`` in place.

        Args:
            features_to_normalize: A column name or an iterable of column
                names. An empty selection leaves the table unchanged.

        Returns:
            The first ten rows of the updated table.
        """
        features = self._as_list(features_to_normalize)
        if not features:
            # The scaler rejects a zero-column input; nothing to do anyway.
            return self.data.head(10)

        scaled = preprocessing.MinMaxScaler().fit_transform(self.data[features].values)
        self.data[features] = pd.DataFrame(scaled, columns=features, index=self.data.index)
        return self.data.head(10)

    def balancesplit(self,Y_column = None,test_ratio = 0.2):
        """Split ``self.data`` into train and test sets without resampling.

        Args:
            Y_column: Name of the target column.
            test_ratio: Fraction of rows assigned to the test set.

        Returns:
            ``(X_train, X_test, Y_train, Y_test)``.
        """
        Y = self.data[Y_column]
        X = self.data.drop(Y_column, axis = 1)
        # Fixed seed so repeated calls yield the same partition.
        return train_test_split(X, Y, test_size=test_ratio, random_state=20)

    def imbalancesplit(self, Y_column = None, task = None, thresh = 0.6, random = 50, test_ratio = 0.2):
        """Split ``self.data`` and rebalance the training part if it is skewed.

        The target is expected to be coded 0/1; training rows carrying any
        other label are not carried over into a resampled training set. The
        test set is never resampled.

        Args:
            Y_column: Name of the target column.
            task: ``'up'`` to oversample the minority class with replacement,
                ``'down'`` to undersample the majority class. Only consulted
                when the training set is considered imbalanced.
            thresh: Minority/majority ratio above which the training set is
                returned as is.
            random: Seed passed to ``resample``.
            test_ratio: Fraction of rows assigned to the test set.

        Returns:
            ``(X_train, X_test, Y_train, Y_test)``.

        Raises:
            ValueError: If the training split lacks one of the two classes, or
                if resampling is required and ``task`` is not 'up' or 'down'.
        """
        Y = self.data[Y_column]
        X = self.data.drop(Y_column, axis = 1)
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_ratio, random_state=20)

        train_data = pd.concat([X_train, Y_train], axis=1)
        zeros = train_data[train_data[Y_column] == 0]
        ones = train_data[train_data[Y_column] == 1]

        if len(zeros) > len(ones):
            majority, minority = zeros, ones
        else:
            majority, minority = ones, zeros

        if minority.empty:
            # Covers the "no 0/1 labels at all" case too, which would
            # otherwise surface as a ZeroDivisionError below.
            raise ValueError(
                'Training split must contain both classes 0 and 1 in {!r}'.format(Y_column))

        ratio = len(minority) / len(majority)
        if ratio > thresh:
            logger.debug('Ratio of minority to majority class is {}'.format(ratio))
            return X_train, X_test, Y_train, Y_test

        if task == 'up':
            minority_upsampled = resample(minority,
                                          replace = True,
                                          n_samples = len(majority),
                                          random_state = random)
            df = pd.concat([majority, minority_upsampled])
        elif task == 'down':
            majority_downsampled = resample(majority,
                                            replace = False,
                                            n_samples = len(minority),
                                            random_state = random)
            df = pd.concat([majority_downsampled, minority])
        else:
            raise ValueError(
                "task must be 'up' or 'down' when resampling is needed, got {!r}".format(task))

        Y_train = df[Y_column]
        X_train = df.drop(Y_column, axis = 1)

        print('Train distribution:',Y_train.value_counts())
        print('Test distribution:',Y_test.value_counts())

        return X_train, X_test, Y_train, Y_test

    def logistic(self,x_train, x_test, y_train, y_test):
        """Fit an L2-penalised logistic regression; return ``(accuracy, f1)``."""
        clf = LogisticRegression(penalty='l2',solver='lbfgs')
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def svmclass(self,x_train, x_test, y_train, y_test, kernel = 'linear'):
        """Fit an SVC with the given kernel; return ``(accuracy, f1)``."""
        # probability=True is needed because plot() calls predict_proba.
        clf = svm.SVC(kernel=kernel,probability=True)
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def decisiontree(self,x_train, x_test, y_train, y_test, criterion = "gini", depth = 8):
        """Fit a depth-limited decision tree; return ``(accuracy, f1)``."""
        clf = DecisionTreeClassifier(criterion=criterion,max_features="log2",max_depth=depth,random_state=6)
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def gradientboost(self,x_train, x_test, y_train, y_test, depth = 4):
        """Fit a gradient boosting classifier; return ``(accuracy, f1)``."""
        clf = GradientBoostingClassifier(learning_rate=0.2,max_depth=depth)
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def randomforest(self,x_train, x_test, y_train, y_test, depth = 8):
        """Fit a depth-limited random forest; return ``(accuracy, f1)``."""
        clf = RandomForestClassifier(max_depth=depth,max_features="log2",random_state=4)
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def XGboost(self, x_train, x_test, y_train, y_test):
        """Fit an XGBoost classifier; return ``(accuracy, f1)``."""
        clf = XGBClassifier(random_state=6)
        return self.plot(clf, x_train, x_test, y_train, y_test)

    def plot(self,clf, x_train, x_test, y_train, y_test):
        """Fit ``clf``, print test-set metrics and return accuracy and F1.

        Args:
            clf: An unfitted estimator exposing ``fit``, ``predict`` and
                ``predict_proba``.
            x_train, y_train: Training features and labels.
            x_test, y_test: Held-out features and labels.

        Returns:
            ``(accuracy, f1)`` measured on the test set.
        """
        clf = clf.fit(x_train, y_train)

        y_pred = clf.predict(x_test)
        # Column 1 of predict_proba is used as the positive-class score.
        y_pred_proba = clf.predict_proba(x_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        print('Train/Test split results:')
        print(" accuracy is %2.3f" % accuracy)
        print(" precision is %2.3f" % precision_score(y_test, y_pred))
        print(" recall is %2.3f" % recall_score(y_test, y_pred))
        print(" auc is %2.3f" % auc(fpr, tpr))
        print(" f1 score is %2.3f" % f1)
        return accuracy, f1

    def confusion(self,y_test, y_pred):
        """Show the confusion matrix of ``y_pred`` against ``y_test``.

        Rows are true labels and columns are predicted labels, both ordered
        ``[1, 0]``. Drawn with matplotlib only, so no plotting library beyond
        the one this module already imports is required.
        """
        labels = [1, 0]
        cm = confusion_matrix(y_test, y_pred, labels=labels)

        ax = plt.subplot()
        image = ax.imshow(cm, cmap='Blues')
        ax.figure.colorbar(image, ax=ax)

        # Annotate every cell; switch text colour on dark cells for contrast.
        midpoint = cm.max() / 2
        for (row, col), value in np.ndenumerate(cm):
            ax.text(col, row, '{:.0f}'.format(value),
                    ha='center', va='center',
                    color='white' if value > midpoint else 'black')

        ticks = range(len(labels))
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        ax.set_title('Confusion Matrix')
        plt.show()