# Functions

## Import stuff

In [1]:
# Uncomment and run this cell to install `ipynb`, which other Jupyter notebooks need in order to import the functions defined here
# !pip install ipynb

In [2]:
# Version 2.0.0.0

In [3]:
# import os
import datetime

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import f1_score, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score
from sklearn.metrics import roc_auc_score, average_precision_score, precision_score

from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

In [4]:
def add_binary_and_drop(df_in, drop='subreddit', repl_w_zero = 'AMA'):
    """
    Append a 0/1 encoding of one column and return the frame without the source column.

    ARGUMENTS
    df_in: the DataFrame to work on; the new '<drop>_binary' column is written onto it in place
    drop: name of the column that is encoded and then left out of the returned frame
    repl_w_zero: the value that maps to 0; every other value maps to 1

    RETURN
    A new DataFrame holding every column of df_in (including the new binary one) except 'drop'
    """
    binary_name = drop + '_binary'
    is_zero_class = df_in[drop] == repl_w_zero
    # Assignment happens on the caller's frame; only the final drop produces a new object.
    df_in[binary_name] = np.where(is_zero_class, 0, 1)
    without_source = df_in.drop(columns=drop)
    return without_source

In [5]:
def replace_with_binary(df_in, replace_column_name='subreddit', repl_w_zero = False):
    """
    Overwrite a column with its 0/1 (binarized) encoding.

    ARGUMENTS
    df_in: the DataFrame to modify (the column is overwritten in place)
    replace_column_name: the column to binarize; if it is not in df_in nothing is changed
    repl_w_zero: the value that becomes 0; every other value becomes 1

    RETURN
    The same DataFrame object that was passed in
    """
    # Guard clause: an absent column leaves the frame untouched.
    if replace_column_name not in df_in:
        return df_in

    matches_zero_value = df_in[replace_column_name] == repl_w_zero
    df_in[replace_column_name] = np.where(matches_zero_value, 0, 1)
    return df_in

In [6]:
def remove_deleted_comments(df_in, col_to_modify='body', repl_w_nan = '[deleted]'):
    """
    Drop duplicate rows and rows whose text column holds the "deleted" marker.

    ARGUMENTS
    df_in: the DataFrame to modify; duplicates are removed from it in place and
        marker values in 'col_to_modify' are overwritten with NaN in place
    col_to_modify: the column that is compared against 'repl_w_nan'
    repl_w_nan: the marker value; rows carrying it are turned into NaN rows and removed

    RETURN
    A new DataFrame without any row that contains a NaN (in any column)
    """
    # Step 1: de-duplicate on the caller's frame.
    df_in.drop_duplicates(inplace=True)

    # Step 2: blank out the marker so that dropna() can discard those rows.
    current_values = df_in[col_to_modify]
    is_marker = current_values == repl_w_nan
    df_in[col_to_modify] = np.where(is_marker, np.nan, current_values)

    # Step 3: dropna() also removes rows that had missing values in other columns.
    cleaned = df_in.dropna()
    return cleaned

In [7]:
def remove_keywords(df_in, col_to_modify='body', remove_from = 'AMA', regex=False):
    """
    Remove every occurrence of a keyword from a text column.

    ARGUMENTS
    df_in: the DataFrame to modify (the column is overwritten in place)
    col_to_modify: the text column that is searched
    remove_from: the keyword to strip out of 'col_to_modify'
    regex (default: False): when False the keyword is matched literally, so characters
        such as '+', '(' or '[' need no escaping; set to True to treat 'remove_from'
        as a regular expression

    RETURN
    The same DataFrame object that was passed in
    """
    # `regex` is passed explicitly: the pandas default for this flag differs between
    # major versions, which would otherwise make the result depend on the installed pandas.
    df_in[col_to_modify] = df_in[col_to_modify].str.replace(remove_from, '', regex=regex)
    return df_in

In [8]:
def drop_cols_cleaning(df_in, drop_cols=None):
    """
    Return the DataFrame without columns that are typically unusable for building a model.

    ARGUMENTS
    df_in: the DataFrame to clean (it is not modified; a new frame is returned)
    drop_cols: columns to drop; if left empty (None or []), the following are used:
        [ 'author', 'author_fullname', 'id', 'link_id', 'parent_id',
        'all_awardings', 'associated_award', 'author_flair_background_color', 'author_flair_css_class',
        'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color',
        'author_flair_type', 'author_patreon_flair', 'awarders', 'collapsed_because_crowd_control', 'comment_type',
        'created_utc', 'gildings', 'locked', 'permalink', 'retrieved_on', 'stickied', 'subreddit_id',
        'top_awarded_type', 'treatment_tags', 'distinguished', 'author_cakeday']

    RETURN
    A new DataFrame without the requested columns; requested columns that do not
    exist in df_in are silently skipped
    """
    # Only fall back to the built-in list when the caller did not supply one.
    if not drop_cols:
        drop_cols = [ 'author', 'author_fullname', 'id', 'link_id', 'parent_id',
            'all_awardings', 'associated_award', 'author_flair_background_color', 'author_flair_css_class',
            'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color',
            'author_flair_type', 'author_patreon_flair', 'awarders', 'collapsed_because_crowd_control', 'comment_type',
            'created_utc', 'gildings', 'locked', 'permalink', 'retrieved_on', 'stickied', 'subreddit_id',
            'top_awarded_type', 'treatment_tags', 'distinguished', 'author_cakeday']

    # Drop everything in one call instead of building a new frame per column.
    present = [c for c in drop_cols if c in df_in]
    return df_in.drop(columns=present)

In [9]:
def add_post_length(df_in, delete_outliers=False, outliers=8000):
    """
    Adds 'post_length' feature (character count of 'body') to DataFrame

    ARGUMENTS
    df_in: DataFrame to be modified; the 'post_length' column is added to it in place
    delete_outliers (default: False): when True, only rows whose 'post_length' is
        strictly less than 'outliers' are kept in the returned frame
    outliers (default: 8000): length cut-off used when 'delete_outliers' is True;
        a value of 4000 is recommended

    RETURN
    The modified DataFrame; when 'delete_outliers' is True this is a filtered
    selection of df_in rather than df_in itself
    """
    df_in['post_length'] = df_in['body'].str.len()
    if delete_outliers:
        # Filter the frame that was actually passed in and hand the result back.
        return df_in[df_in['post_length'] < outliers]
    return df_in

In [10]:
def pipe_search(pipe, params = {}, X_train=None, y_train=None, X_test=None, y_test=None):
    """
    Simple pipeline in a GridSearch.

    ARGUMENTS
    pipe: insert a pipeline object (can use the make_pipeline function here, directly)
    params: params for pipeline, default is an empty dict object
    X_train, y_train: data the grid search is fitted on
    X_test, y_test: data used for the printed TEST score
        Any of the four data arguments left as None is looked up as a variable of
        the same name in this module's global namespace (the original behaviour).

    RETURN
    Returns the fitted GridSearchCV object

    RAISES
    NameError: if a data argument is omitted and no global of that name exists
    """
    supplied = {'X_train': X_train, 'y_train': y_train, 'X_test': X_test, 'y_test': y_test}
    scope = globals()

    # Resolve each data set: explicit argument first, module-level variable second.
    missing = [name for name, value in supplied.items() if value is None and name not in scope]
    if missing:
        raise NameError(f"pipe_search needs {missing}: pass them as arguments or define them globally")
    data = {name: (scope[name] if value is None else value) for name, value in supplied.items()}

    gs = GridSearchCV(pipe, params, n_jobs=-1, verbose=2)
    gs.fit(data['X_train'], data['y_train'])
    train_score = gs.score(data['X_train'], data['y_train'])
    test_score = gs.score(data['X_test'], data['y_test'])
    best_parm = gs.best_params_
    print(f'TRAIN: {train_score}')
    print(f'TEST:  {test_score}')
    print(f'BEST:  {best_parm}')
    return gs

In [11]:
def make_a_model(pipe, X_train, X_test = 0, y_train = 0, y_test = 0, params={}, is_classification=True, verbose=2):
    """
    Early prototype of the Classification and Regression Model classes.
    Probably should not be used; kept in case some old models were built with it.

    Fits a GridSearchCV over 'pipe' / 'params', prints the relevant scores and
    returns the fitted search.

    ARGUMENTS
    pipe: pipeline (or estimator) handed to GridSearchCV
    X_train, y_train: data the search is fitted on; 'y_train' is required
    X_test, y_test: optional hold-out data. The integer default 0 means "not supplied";
        if either is left at its default, all scores are computed on the training data
    params: parameter grid for GridSearchCV
    is_classification: True prints accuracy / F1 / confusion matrix,
        False prints R2 / RMSE / MSE / MAE
    verbose: verbosity passed to GridSearchCV

    RETURN
    The fitted GridSearchCV object, or None (after printing a notice) when no
    'y_train' was given
    """
    # An untouched integer default is the "argument not supplied" sentinel.
    if type(y_train) is int:
        print(" *** NEED A 'y' *** ")
        return

    model = GridSearchCV(pipe, params, n_jobs=-1, verbose=verbose)
    model.fit(X_train, y_train)

    has_test_split = not (type(X_test) is int or type(y_test) is int)
    if has_test_split:
        X_eval, y_eval = X_test, y_test
    else:
        # treat as X and y only: evaluate on the data the model was fitted on
        X_eval, y_eval = X_train, y_train
    y_eval_pred = model.predict(X_eval)

    if is_classification:
        if has_test_split:
            scores = {
                'Accuracy, Train:': model.score(X_train, y_train),
                'Accuracy, Test:': model.score(X_test, y_test),
                'F1 Train:': f1_score(y_train, model.predict(X_train)),
                'F1 Test:': f1_score(y_test, y_eval_pred),
                'Confusion Matrix': confusion_matrix(y_test, y_eval_pred),
            }
        else:
            scores = {
                # score() needs the true labels as well as X
                'Accuracy:': model.score(X_train, y_train),
                'F1 Score:': f1_score(y_train, y_eval_pred),
                'Confusion Matrix': confusion_matrix(y_train, y_eval_pred),
            }
    else:
        mse = mean_squared_error(y_eval, y_eval_pred)
        error_scores = {
            # RMSE derived from MSE so no `squared=` keyword is needed
            'RMSE': np.sqrt(mse),
            'MSE': mse,
            'MAE': mean_absolute_error(y_eval, y_eval_pred),
        }
        if has_test_split:
            scores = {
                'R2 Train': model.score(X_train, y_train),
                'R2 Test': model.score(X_test, y_test),
                **error_scores,
            }
        else:
            scores = {'R2 Score': model.score(X_train, y_train), **error_scores}

    for s, v in scores.items():
        print(f'{s}: {v}')

    return model   # { 'Model': model, 'Scores': scores }


In [12]:
class ClassificationModelX():
    """
    Model (of your choice) for Classification
    This Class takes one X and one y, fits a GridSearchCV on them, and computes
    its metrics on that same data; use `score` for outside (unseen) data.

    ARGUMENTS (init)
    pipe: input a pipeline for your GridSearchCV Model to use
    X_train: your X (should be your full set of X) that trains your model
    y_train: your y (should be your full set of y) that trains your model;
        must offer `value_counts()` when verbose > 1
    params: Parameters for the pipeline
    verbose: Sets verbosity of GridSearchCV (values greater than 1 also print scores)

    The probability-based metrics use column 1 of `predict_proba`, and the label
    metrics use scikit-learn's default positive label.
    """
    # Class-level defaults; every one is replaced on the instance during __init__.
    scores = {}
    y_pred = None
    y_pred_proba = None
    accuracy_score = None
    X_f1_score = None
    confusion_matrix = None
    recall_score = None
    balanced_accuracy_score = None
    precision_score = None
    average_precision_score = None
    predicted_score = None
    roc_auc_score = None

    def __init__(self, pipe, X_train, y_train, params={}, verbose=2):
        self.pipe = pipe
        self.params = params
        self.model = GridSearchCV(pipe, params, n_jobs=-1, verbose=verbose)
        self.model.fit(X_train, y_train)
        self.best_params_ = self.model.best_params_

        # Predictions must exist before any metric setter runs.
        self.set_y_pred(X_train)
        self.set_y_pred_proba(X_train)
        self.set_accuracy_score(y_train)
        self.set_f1_score(y_train)
        self.set_confusion_matrix(y_train)
        self.set_recall_score(y_train)
        self.set_balanced_accuracy_score(y_train)
        self.set_precision_score(y_train)
        self.set_average_precision_score(y_train)
        self.set_roc_auc_score(y_train)
        self.set_all_score()

        if(verbose > 1):
            print(f'\nBaseline:')
            print(f'  - count:\n{y_train.value_counts()}')
            print(f'  - percent:\n{y_train.value_counts(normalize=True)}\n')
            self.print_stats()

    def score(self, X_new, y_new):
        """Score the fitted search on new data; stores, prints and returns the score."""
        self.predicted_score = self.model.score(X_new, y_new)
        print(self.predicted_score)
        return self.predicted_score

    def print_stats(self):
        """Print the pipeline, every entry of `scores`, and the best parameters."""
        print(f'PIPE: {self.pipe}\n')
        for s,v in self.scores.items():
            print(f'  - {s}:\n        {v}')
        print(f'\nbest params: {self.best_params_}')

    def plot_something(self):
        """Placeholder; only prints a notice."""
        print('this still needs to be built... ')

    def set_y_pred(self, X):
        """Store the model's predicted labels for X in `y_pred`."""
        self.y_pred = self.model.predict(X)

    def set_y_pred_proba(self, X):
        """Store the model's predicted class probabilities for X in `y_pred_proba`."""
        self.y_pred_proba = self.model.predict_proba(X)

    # Each setter below compares y_true with the stored predictions.
    def set_accuracy_score(self, y_true):
        self.accuracy_score = accuracy_score(y_true, self.y_pred)

    def set_f1_score(self, y_true):
        self.X_f1_score = f1_score(y_true, self.y_pred)

    def set_confusion_matrix(self, y_true):
        self.confusion_matrix = confusion_matrix(y_true, self.y_pred)

    def set_recall_score(self, y_true):
        self.recall_score = recall_score(y_true, self.y_pred)

    def set_balanced_accuracy_score(self, y_true):
        self.balanced_accuracy_score = balanced_accuracy_score(y_true, self.y_pred)

    def set_precision_score(self, y_true):
        self.precision_score = precision_score(y_true, self.y_pred)

    def set_average_precision_score(self, y_true):
        self.average_precision_score = average_precision_score(y_true, self.y_pred_proba[:, 1])

    def set_roc_auc_score(self, y_true):
        self.roc_auc_score = roc_auc_score(y_true, self.y_pred_proba[:, 1])

    def set_all_score(self):
        """Collect the individual metric attributes into the `scores` dict."""
        self.scores = {
            'F1 Score': self.X_f1_score,
            'Recall Score': self.recall_score,
            'Accuracy': self.accuracy_score,
            'Balanced Accuracy': self.balanced_accuracy_score,
            'Precision Score': self.precision_score,
            'Average Precision Score': self.average_precision_score,
            'ROC AUC Score': self.roc_auc_score,
            'Confusion Matrix': self.confusion_matrix
        }
    

In [13]:
class ClassificationModel():
    """
    Model (of your choice) for binary Classification
    This Class takes X_train, X_test, y_train, and y_test, fits a GridSearchCV on
    the training data, and collects metrics for both the train and test sets.

    ARGUMENTS (init)
    pipe: input a pipeline for your GridSearchCV Model to use
    X_train: your X that trains your model
    X_test: your X to test your model with
    y_train: your y that trains your model
    y_test: your y to test your model with
    params: Parameters for the pipeline
    mod_name: Model Name shown in DataFrame; when empty, the class name of the
        last pipeline step is used
    verbose: Sets verbosity of GridSearchCV (1, 2, or 3)
    print_results: set to True to print baselines and scores after __init__ runs

    NOTES
    The confusion-matrix based values assume a two-class problem: the matrix is
    unpacked as (tn, fp, fn, tp), which is scikit-learn's layout for binary labels.
    Probability based metrics use column 1 of `predict_proba`.
    """
    # Class-level defaults; every one is replaced on the instance during __init__.
    scores = {}
    train_scores = {}
    y_pred = None
    y_pred_proba = None
    y_train_pred = None
    y_train_pred_proba = None
    train_accuracy_score = None
    test_accuracy_score = None
    train_f1_score = None
    test_f1_score = None
    train_confusion_matrix = None
    test_confusion_matrix = None
    train_true_positive = None
    train_false_negative = None
    train_false_positive = None
    train_true_negative = None
    test_true_positive = None
    test_false_negative = None
    test_false_positive = None
    test_true_negative = None
    train_recall_score = None
    test_recall_score = None
    train_balanced_accuracy_score = None
    test_balanced_accuracy_score = None
    train_precision_score = None
    test_precision_score = None
    train_average_precision_score = None
    test_average_precision_score = None

    train_calc_accuracy_score = None
    test_calc_accuracy_score = None
    train_calc_sensitivty_score = None
    test_calc_sensitivty_score = None
    train_calc_specficity_score = None
    test_calc_specficity_score = None
    train_calc_precision_score = None
    test_calc_precision_score = None

    train_roc_auc_score = None
    test_roc_auc_score = None
    df = None

    def __init__(self, pipe, X_train, X_test, y_train, y_test, params={}, mod_name='', verbose=2, print_results=False):
        if(print_results):
            print(datetime.datetime.now().strftime('START:  %Y-%m-%d, %H:%M:%S\n'))

        self.model_name = mod_name

        self.pipe = pipe
        self.params = params
        self.model = GridSearchCV(pipe, params, n_jobs=-1, verbose=verbose)
        self.model.fit(X_train, y_train)
        self.best_params_ = self.model.best_params_
        if(len(mod_name) == 0):
            # Default display name: class name of the final pipeline step.
            self.model_name = type(self.pipe[-1]).__name__

        # --- predictions ---
        self.y_pred = self.set_y_pred(X_test)
        self.y_pred_proba = self.set_y_pred_proba(X_test)
        self.y_train_pred = self.set_y_pred(X_train)
        self.y_train_pred_proba = self.set_y_pred_proba(X_train)

        # --- scikit-learn metrics ---
        self.train_accuracy_score = self.set_accuracy_score(y_train, self.y_train_pred)
        self.test_accuracy_score = self.set_accuracy_score(y_test, self.y_pred)
        self.train_f1_score = self.set_f1_score(y_train, self.y_train_pred)
        self.test_f1_score = self.set_f1_score(y_test, self.y_pred)

        self.train_confusion_matrix = self.set_confusion_matrix(y_train, self.y_train_pred)
        self.test_confusion_matrix = self.set_confusion_matrix(y_test, self.y_pred)
        self.set_train_cm_features(self.train_confusion_matrix)
        self.set_test_cm_features(self.test_confusion_matrix)

        self.train_recall_score = self.set_recall_score(y_train, self.y_train_pred)
        self.test_recall_score = self.set_recall_score(y_test, self.y_pred)
        self.train_balanced_accuracy_score = self.set_balanced_accuracy_score(y_train, self.y_train_pred)
        self.test_balanced_accuracy_score = self.set_balanced_accuracy_score(y_test, self.y_pred)
        self.train_precision_score = self.set_precision_score(y_train, self.y_train_pred)
        self.test_precision_score = self.set_precision_score(y_test, self.y_pred)
        self.train_average_precision_score = self.set_average_precision_score(y_train, self.y_train_pred_proba)
        self.test_average_precision_score = self.set_average_precision_score(y_test, self.y_pred_proba)
        self.train_roc_auc_score = self.set_roc_auc_score(y_train, self.y_train_pred_proba)
        self.test_roc_auc_score = self.set_roc_auc_score(y_test, self.y_pred_proba)

        # --- metrics recomputed by hand from the confusion-matrix cells ---
        self.train_calc_accuracy_score = self.set_calc_accuracy_score(self.train_true_positive,
                                                                      self.train_false_positive,
                                                                      self.train_false_negative,
                                                                      self.train_true_negative)
        self.test_calc_accuracy_score = self.set_calc_accuracy_score(self.test_true_positive,
                                                                     self.test_false_positive,
                                                                     self.test_false_negative,
                                                                     self.test_true_negative)
        self.train_calc_sensitivty_score = self.set_calc_sensitivty_score(self.train_true_positive,
                                                                          self.train_false_negative)
        self.test_calc_sensitivty_score = self.set_calc_sensitivty_score(self.test_true_positive,
                                                                         self.test_false_negative)
        self.train_calc_specficity_score = self.set_calc_specficity_score(self.train_true_negative,
                                                                          self.train_false_positive)
        self.test_calc_specficity_score = self.set_calc_specficity_score(self.test_true_negative,
                                                                         self.test_false_positive)
        self.train_calc_precision_score = self.set_calc_precision_score(self.train_true_positive,
                                                                        self.train_false_positive)
        self.test_calc_precision_score = self.set_calc_precision_score(self.test_true_positive,
                                                                       self.test_false_positive)

        self.set_all_score()
        self.set_DataFrame()

        if(print_results):
            print(f'\nBaseline (y_train):')
            print(f'  - count:\n{y_train.value_counts()}')
            print(f'  - percent:\n{y_train.value_counts(normalize=True)}\n')
            print(f'\nBaseline (y_test):')
            print(f'  - count:\n{y_test.value_counts()}')
            print(f'  - percent:\n{y_test.value_counts(normalize=True)}\n')
            print(f'\nSTATS (y_train):')
            self.print_train_stats()
            print(f'\nSTATS (y_test):')
            self.print_stats()
            print(f'  - Model, BEST SCORE: {self.model.best_score_}')
            print(datetime.datetime.now().strftime('\nFINISH: %Y-%m-%d, %H:%M:%S'))

    def DataFrame(self):
        """Return the one-column score table built by `set_DataFrame`."""
        return self.df

    def set_DataFrame(self):
        """Build `df`: test scores followed by train scores, one column named after the model."""
        train_df = pd.DataFrame(np.array(list(self.train_scores.values())),
                                index=list(self.train_scores.keys()),
                                columns=[self.model_name])
        test_df = pd.DataFrame(np.array(list(self.scores.values())),
                               index=list(self.scores.keys()),
                               columns=[self.model_name])
        self.df = pd.concat([test_df, train_df])

    def set_all_score(self):
        """Collect the individual metric attributes into `scores` (test) and `train_scores`."""
        self.scores = {
            'F1 Score': self.test_f1_score,
            'Recall Score': self.test_recall_score,
            'Accuracy': self.test_accuracy_score,
            'Balanced Accuracy': self.test_balanced_accuracy_score,
            'Precision Score': self.test_precision_score,
            'Average Precision Score': self.test_average_precision_score,
            'ROC AUC Score': self.test_roc_auc_score,
            'True Positive': self.test_true_positive,
            'False Negative': self.test_false_negative,
            'False Positive': self.test_false_positive,
            'True Negative': self.test_true_negative,
            'Calculated Accuracy': self.test_calc_accuracy_score,
            'Calculated Precision': self.test_calc_precision_score,
            'Calculated Sensitivity': self.test_calc_sensitivty_score,
            'Calculated Specificity': self.test_calc_specficity_score
        }
        self.train_scores = {
            'Train F1 Score': self.train_f1_score,
            'Train Recall Score': self.train_recall_score,
            'Train Accuracy': self.train_accuracy_score,
            'Train Balanced Accuracy': self.train_balanced_accuracy_score,
            'Train Precision Score': self.train_precision_score,
            'Train Average Precision Score': self.train_average_precision_score,
            'Train ROC AUC Score': self.train_roc_auc_score,
            'Train True Positive': self.train_true_positive,
            'Train False Negative': self.train_false_negative,
            'Train False Positive': self.train_false_positive,
            'Train True Negative': self.train_true_negative,
            'Train Calculated Accuracy': self.train_calc_accuracy_score,
            'Train Calculated Precision': self.train_calc_precision_score,
            'Train Calculated Sensitivity': self.train_calc_sensitivty_score,
            'Train Calculated Specificity': self.train_calc_specficity_score
        }

    def print_stats(self):
        """Print every test score, then the pipeline and the best parameters."""
        for s,v in self.scores.items():
            print(f'  - {s}:\n        {v}')
        print(f'\nPIPE: {self.pipe}')
        print(f'\nbest params: {self.best_params_}')

    def print_train_stats(self):
        """Print every train score."""
        for s,v in self.train_scores.items():
            print(f'  - {s}:\n        {v}')

    # The set_* helpers below return their value; __init__ stores it on the instance.
    def set_y_pred(self, X):
        return self.model.predict(X)

    def set_y_pred_proba(self, X):
        return self.model.predict_proba(X)

    def set_accuracy_score(self, y_true, y_predict):
        return accuracy_score(y_true, y_predict)

    def set_f1_score(self, y_true, y_predict):
        return f1_score(y_true, y_predict)

    def set_confusion_matrix(self, y_true, y_predict):
        return confusion_matrix(y_true, y_predict)

    def set_train_cm_features(self, confusion_matrix):
        """Store the four cells of a binary (2x2) train confusion matrix on the instance."""
        # Flattened binary layout: [0][0]=tn, [0][1]=fp, [1][0]=fn, [1][1]=tp.
        tn, fp, fn, tp = confusion_matrix.ravel()
        self.train_true_positive = tp
        self.train_false_negative = fn
        self.train_false_positive = fp
        self.train_true_negative = tn

    def set_test_cm_features(self, confusion_matrix):
        """Store the four cells of a binary (2x2) test confusion matrix on the instance."""
        # Flattened binary layout: [0][0]=tn, [0][1]=fp, [1][0]=fn, [1][1]=tp.
        tn, fp, fn, tp = confusion_matrix.ravel()
        self.test_true_positive = tp
        self.test_false_negative = fn
        self.test_false_positive = fp
        self.test_true_negative = tn

    def set_recall_score(self, y_true, y_predict):
        return recall_score(y_true, y_predict)

    def set_balanced_accuracy_score(self, y_true, y_predict):
        return balanced_accuracy_score(y_true, y_predict)

    def set_precision_score(self, y_true, y_predict):
        return precision_score(y_true, y_predict)

    def set_average_precision_score(self, y_true, y_predict_proba):
        return average_precision_score(y_true, y_predict_proba[:, 1])

    def set_roc_auc_score(self, y_true, y_predict_proba):
        return roc_auc_score(y_true, y_predict_proba[:, 1])

    def set_calc_accuracy_score(self, tp, fp, fn, tn):
        """(tp + tn) / all predictions."""
        return (tp+tn)/(tp + fp + tn + fn)

    def set_calc_sensitivty_score(self, tp, fn):
        """tp / (tp + fn): share of actual positives that were found."""
        return tp/(tp+fn)

    def set_calc_specficity_score(self, tn, fp):
        """tn / (tn + fp): share of actual negatives that were found."""
        return tn/(tn+fp)

    def set_calc_precision_score(self, tp, fp):
        """tp / (tp + fp): share of predicted positives that were correct."""
        return tp/(tp+fp)

    def plot_confusion_matrix(self, X_test, y_test):
        """Plot the confusion matrix of the fitted model for the given data."""
        # NOTE(review): relies on sklearn.metrics.plot_confusion_matrix, which newer
        # scikit-learn releases no longer provide - confirm against the installed version.
        plot_confusion_matrix(self.model, X_test, y_test);

    def check_y(self, y_true, y_pred):
        """Print how many predictions differ from (0) and match (1) the true labels."""
        y_compare = np.where(y_true == y_pred, 1, 0)
        print(np.asarray((np.unique(y_compare, return_counts=True))))
    

In [14]:
class RegressionModel():
    """
    Model (of your choice) for Regression
    This Class takes X_train, X_test, y_train, and y_test, fits a GridSearchCV on
    the training data, and collects R2 / RMSE / MSE / MAE for both sets.

    ARGUMENTS (init)
    pipe: input a pipeline for your GridSearchCV Model to use
    X_train: your X that trains your model
    X_test: your X to test your model with
    y_train: your y that trains your model
    y_test: your y to test your model with
    params: Parameters for the pipeline
    mod_name: Model Name shown in DataFrame; when empty, the class name of the
        last pipeline step is used
    round_y_threshold: predictions below this value become 0, all others 1, before
        scoring; pass -1 to score the raw predictions (ignored when invert_y is True)
    invert_y: when True, predictions are thresholded and then flipped (0 <-> 1)
    verbose: Sets verbosity of GridSearchCV (1, 2, or 3)
    print_results: set to True to print baselines and scores after __init__ runs
    """
    # Class-level defaults; every one is replaced on the instance during __init__.
    scores = {}
    train_scores = {}
    y_pred = None
    y_train_pred = None
    train_r2_score = None
    test_r2_score = None
    train_rmse = None
    test_rmse = None
    train_mse = None
    test_mse = None
    train_mae = None
    test_mae = None
    df = None
    y_thresh = None

    def __init__(self, pipe, X_train, X_test, y_train, y_test, params={}, mod_name='',
                 round_y_threshold = 0.5, invert_y=False, verbose=2, print_results=False):
        if(print_results):
            print(datetime.datetime.now().strftime('START:  %Y-%m-%d, %H:%M:%S\n'))

        self.model_name = mod_name
        self.y_thresh = round_y_threshold

        self.pipe = pipe
        self.params = params
        self.model = GridSearchCV(pipe, params, n_jobs=-1, verbose=verbose)
        self.model.fit(X_train, y_train)
        self.best_params_ = self.model.best_params_
        if(len(mod_name) == 0):
            # Default display name: class name of the final pipeline step.
            self.model_name = type(self.pipe[-1]).__name__

        predict = self.set_y_pred_inv if invert_y else self.set_y_pred
        self.y_pred = predict(X_test)
        self.y_train_pred = predict(X_train)

        self.train_r2_score = self.set_r2_score(y_train, self.y_train_pred)
        self.test_r2_score = self.set_r2_score(y_test, self.y_pred)
        self.train_rmse = self.set_rmse(y_train, self.y_train_pred)
        self.test_rmse = self.set_rmse(y_test, self.y_pred)
        self.train_mse = self.set_mse(y_train, self.y_train_pred)
        self.test_mse = self.set_mse(y_test, self.y_pred)
        self.train_mae = self.set_mae(y_train, self.y_train_pred)
        self.test_mae = self.set_mae(y_test, self.y_pred)

        self.set_all_score()
        self.set_DataFrame()

        if(print_results):
            # The report needs the true targets for its baseline counts.
            self.print_results(y_train, y_test)

    def DataFrame(self):
        """Return the one-column score table built by `set_DataFrame`."""
        return self.df

    def set_DataFrame(self):
        """Build `df`: test scores followed by train scores, one column named after the model."""
        train_df = pd.DataFrame(np.array(list(self.train_scores.values())),
                                index=list(self.train_scores.keys()),
                                columns=[self.model_name])
        test_df = pd.DataFrame(np.array(list(self.scores.values())),
                               index=list(self.scores.keys()),
                               columns=[self.model_name])
        self.df = pd.concat([test_df, train_df])

    def print_results(self, y_train, y_test):
        """Print value-count baselines for both targets, all scores, and the best CV score."""
        print(f'\nBaseline (y_train):')
        print(f'  - count:\n{y_train.value_counts()}')
        print(f'  - percent:\n{y_train.value_counts(normalize=True)}\n')
        print(f'\nBaseline (y_test):')
        print(f'  - count:\n{y_test.value_counts()}')
        print(f'  - percent:\n{y_test.value_counts(normalize=True)}\n')
        print(f'\nSTATS (y_train):')
        self.print_train_stats()
        print(f'\nSTATS (y_test):')
        self.print_stats()
        print(f'  - Model, BEST SCORE: {self.model.best_score_}')

    def set_all_score(self):
        """Collect the individual metric attributes into `scores` (test) and `train_scores`."""
        self.scores = {
            'R2 Score': self.test_r2_score,
            'RMSE': self.test_rmse,
            'MSE': self.test_mse,
            'MAE': self.test_mae,
        }
        self.train_scores = {
            'Train R2 Score': self.train_r2_score,
            'Train RMSE': self.train_rmse,
            'Train MSE': self.train_mse,
            'Train MAE': self.train_mae,
        }

    def print_stats(self):
        """Print every test score, then the pipeline and the best parameters."""
        for s,v in self.scores.items():
            print(f'  - {s}:\n        {v}')
        print(f'\nPIPE: {self.pipe}')
        print(f'\nbest params: {self.best_params_}')

    def print_train_stats(self):
        """Print every train score."""
        for s,v in self.train_scores.items():
            print(f'  - {s}:\n        {v}')

    def set_y_pred(self, X):
        """Predict X; unless the threshold is -1, round predictions to 0/1 at `y_thresh`."""
        if(self.y_thresh == -1):
            return self.model.predict(X)
        else:
            return np.abs(np.where(self.model.predict(X) < self.y_thresh, 0, 1))

    def set_y_pred_inv(self, X):
        """Predict X, round to 0/1 at `y_thresh`, then flip the labels (0 <-> 1)."""
        return np.abs(np.where(self.model.predict(X) < self.y_thresh, 0, 1) - 1)

    def set_r2_score(self, y_true, y_predict):
        return r2_score(y_true, y_predict)

    def set_rmse(self, y_true, y_predict):
        # Square root of the MSE; avoids the `squared=` keyword of mean_squared_error.
        return np.sqrt(mean_squared_error(y_true, y_predict))

    def set_mse(self, y_true, y_predict):
        return mean_squared_error(y_true, y_predict)

    def set_mae(self, y_true, y_predict):
        return mean_absolute_error(y_true, y_predict)

    def check_y(self, y_true, y_pred):
        """Print how many predictions differ from (0) and match (1) the true values."""
        y_compare = np.where(y_true == y_pred, 1, 0)
        print(np.asarray((np.unique(y_compare, return_counts=True))))
    

In [15]:
class CleanUp:
    """
    Class takes in a DataFrame, cleans a copy of it upon initialization, and keeps
    the cleaned version as part of itself (``self.df``). The caller's frame is
    left untouched.

    Cleaning steps, in order:
      1. drop every column listed in ``drop_cols`` that is present
      2. binarize every column listed in ``binarize`` that is present
      3. if a 'body' column exists: drop '[deleted]' rows (and rows holding NaN),
         strip the keywords 'AMA' and 'AskReddit' from the text, add 'post_length'
      4. if a 'subreddit' column exists: replace it by 'subreddit_binary'

    Every cleaning method works on ``self.df`` and returns a data frame; the
    constructor stores that return value back into ``self.df``.
    """
    # The cleaned frame; set per instance in __init__.
    df = None
    # Columns removed by drop_cols_cleaning() whenever they are present.
    drop_cols = [ 'author', 'author_fullname', 'id', 'link_id', 'parent_id',
        'all_awardings', 'associated_award', 'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id', 'author_flair_text', 'author_flair_text_color', 
       'author_flair_type', 'author_patreon_flair', 'awarders', 'collapsed_because_crowd_control', 'comment_type',
       'created_utc', 'gildings', 'locked', 'permalink', 'retrieved_on', 'stickied', 'subreddit_id', 
       'top_awarded_type', 'treatment_tags', 'distinguished', 'author_cakeday']
    # Columns converted to 0/1 by replace_with_binary() whenever they are present.
    binarize = [ 'author_premium', 'is_submitter', 'no_follow', 'send_replies' ]
    
    def __init__(self, df_in):
        """
        Copy df_in and run the full cleaning sequence on the copy.

        ARGUMENTS
        df_in: The data frame to clean (not modified)
        """
        self.df = df_in.copy()

        self.df = self.drop_cols_cleaning()
        for b in self.binarize:
            if b in self.df.columns:
                self.df = self.replace_with_binary(b)
        if 'body' in self.df.columns:
            self.df = self.remove_deleted_comments()
            self.df = self.remove_keywords(remove_from='AMA')
            self.df = self.remove_keywords(remove_from='AskReddit')
            self.df = self.add_post_length()
        if 'subreddit' in self.df.columns:
            self.df = self.add_binary_and_drop()
        # Duplicate rows are not removed here (the drop_duplicates step is disabled).
    
    def replace_with_binary(self, replace_column_name, repl_w_zero = False):
        """
        Change a column of self.df to a Binarized version

        ARGUMENTS
        replace_column_name: the column which will be Binarized
        repl_w_zero: The value to be replaced with a 0; everything else gets a 1

        RETURN
        Returns the data frame it has modified (self.df)
        """
        self.df[replace_column_name] = np.where(self.df[replace_column_name] == repl_w_zero, 0, 1)
        return self.df
    
    def remove_deleted_comments(self, col_to_modify='body', repl_w_nan = '[deleted]'):
        """
        Find rows where col_to_modify equals repl_w_nan and remove them from the set.
        The matching values are first turned into NaN and then dropna() is applied,
        so rows that hold a NaN in ANY column are removed as well.
        Duplicate entries are NOT removed.

        ARGUMENTS
        col_to_modify: the column which will be read in and checked for 'repl_w_nan'
        repl_w_nan: The value to be replaced with a NaN, and later be deleted (the whole row)

        RETURN
        Returns a new data frame without the removed rows; self.df itself keeps
        the rows (with NaN in col_to_modify) until the caller re-assigns it
        """
        self.df[col_to_modify] = np.where(self.df[col_to_modify] == repl_w_nan, np.nan, self.df[col_to_modify])
        return self.df.dropna()
    
    def remove_keywords(self, remove_from = 'AMA', col_to_modify='body'):
        """
        Find a keyword in a text column of self.df and remove it from there

        ARGUMENTS
        remove_from: The keyword to be removed from 'col_to_modify'; it is matched
            literally (case-sensitive), never as a regular expression
        col_to_modify: the column which will be read in and inspected

        RETURN
        Returns the data frame it has modified (self.df)
        """
        # regex=False pins literal matching: without it the keyword is treated as a
        # regular expression on older pandas releases and as a literal on newer ones.
        self.df[col_to_modify] = self.df[col_to_modify].str.replace(remove_from, '', regex=False)
        return self.df
    
    def drop_cols_cleaning(self):
        """
        Remove the columns that are typically unusable for building a model.
        Only the entries of the class attribute 'drop_cols' that actually exist
        in self.df are dropped; missing ones are ignored.

        RETURN
        Returns the data frame without those columns (also stored in self.df)
        """
        # Single drop call instead of one frame copy per column.
        present = [c for c in self.drop_cols if c in self.df]
        if present:
            self.df = self.df.drop(columns=present)

        return self.df
    
    def add_post_length(self):
        """
        Add a 'post_length' column holding the character count of 'body'

        RETURN
        Returns the data frame it has modified (self.df)
        """
        self.df['post_length'] = self.df['body'].str.len()
        return self.df
    
    def add_binary_and_drop(self, drop_label='subreddit', repl_w_zero = 'AMA'):
        """
        Add a Binarized version of a column (named '<drop_label>_binary'), and then drop that column

        ARGUMENTS
        drop_label: the column which will be read in, and then later dropped
        repl_w_zero: The value to be replaced with a 0; everything else gets a 1

        RETURN
        Returns a new data frame with the binary column added and 'drop_label'
        removed; self.df itself still holds 'drop_label' until the caller re-assigns it
        """
        bin_label = drop_label + '_binary'
        self.df[bin_label] = np.where(self.df[drop_label] == repl_w_zero, 0, 1)
        return self.df.drop(columns=drop_label)
    

In [16]:
def clean_the_columns(df_in, alter_rows=False):
    """
    Function that Cleans a DataFrame as outlined by the EDA/Cleaning process

    ARGUMENTS
    df_in: The data frame to clean
    alter_rows (default: False): when True, duplicate rows are dropped as well

    RETURN
    Returns the cleaned data frame
    """
    binarize = [ 'author_premium', 'is_submitter', 'no_follow', 'send_replies' ]
    
    df_in = drop_cols_cleaning(df_in)
    for b in binarize:
        # Only binarize columns that exist, mirroring CleanUp.__init__.
        if b in df_in.columns:
            # The data frame was previously missing from this call, so the column
            # name was handed over in its place.
            # NOTE(review): assumes the module-level helper is
            # replace_with_binary(df_in, replace_column_name, ...) -- confirm.
            df_in = replace_with_binary(df_in, b)
    if 'body' in df_in.columns:
        df_in = remove_deleted_comments(df_in)
        df_in = remove_keywords(df_in, remove_from='AMA')
        df_in = remove_keywords(df_in, remove_from='AskReddit')
        # Extra arguments are passed through unchanged; their meaning is defined
        # by the module-level add_post_length helper.
        df_in = add_post_length(df_in, True, 4000)
    if alter_rows:
        df_in = df_in.drop_duplicates()
    if 'subreddit' in df_in.columns:
        df_in = add_binary_and_drop(df_in)
    return df_in

In [17]:
def print_classification_stats(y_true, y_predict, print_values=False):
    """
    Collect Classification metrics for a set of predictions.

    The confusion matrix is unpacked into exactly four cells (tn, fp, fn, tp),
    so the labels are expected to form a binary problem.

    ARGUMENTS
    y_true: true/actual value of y to use for metrics
    y_predict: predicted value of y to use for metrics
    print_values (default: False): prints values to terminal

    RETURN
    Returns Dictionary of the Classification Metric Scores: the sklearn scores,
    the four confusion-matrix counts, and the accuracy / precision /
    sensitivity / specificity calculated by hand from those counts
    """
    sklearn_metrics = (
        ('F1 Score', f1_score),
        ('Recall Score', recall_score),
        ('Accuracy Score', accuracy_score),
        ('Balanced Accuracy', balanced_accuracy_score),
        ('Precision Score', precision_score),
    )
    tn, fp, fn, tp = confusion_matrix(y_true, y_predict).ravel()

    scores = {}
    for label, metric in sklearn_metrics:
        scores[label] = metric(y_true, y_predict)

    # Raw confusion-matrix cells.
    scores['True Positive'] = tp
    scores['False Negative'] = fn
    scores['False Positive'] = fp
    scores['True Negative'] = tn

    # The same ratios derived by hand from the cells, as a cross-check.
    scores['Calculated Accuracy'] = (tp+tn)/(tp + fp + tn + fn)
    scores['Calculated Precision'] = tp/(tp+fp)
    scores['Calculated Sensitivity'] = tp/(tp+fn)
    scores['Calculated Specificity'] = tn/(tn+fp)

    if print_values:
        for s in scores:
            v = scores[s]
            print(f'  - {s}:\n        {v}')
    return scores
    