In [342]:
'''
*****************************************************************************************************************************************
*****************************************************************************************************************************************
* Author             : Anjana Tiha
* Author Details     : Masters of Science, Computer Science, University of Memphis, Memphis, Tennessee, USA (May 2018)
*****************************************************************************************************************************************
*****************************************************************************************************************************************
* Project Name       : Toxic Comment Classification Challenge - Kaggle Classification and Regression Module
* Description        : Machine Learning (Supervised Learning/ Classification/ Predictive Algorithm) for identifying toxic comments.
* Solution           : Using TFIDF Vector and sentence, word and charcacter level analysis following a recent 2018 AAAI conference paper.
* Input              : Collection of 159000 comments
* Output             : classifying toxic/hateful comment
* Start Date         : 07.04.2018
* Last Update        : 
* Tools Requirement  : Anaconda/PyCharm, Python
* Comments           : Please use Anaconda editor for visualization and convenience.
* Version History    : 1.0.0.0
* Current Version    : 1.0.0.0
*****************************************************************************************************************************************
*****************************************************************************************************************************************

'''


'\n*****************************************************************************************************************************************\n*****************************************************************************************************************************************\n* Author             : Anjana Tiha\n* Author Details     : Masters of Science, Computer Science, University of Memphis, Memphis, Tennessee, USA (May 2018)\n*****************************************************************************************************************************************\n*****************************************************************************************************************************************\n* Project Name       : Toxic Comment Classification Challenge - Kaggle Classification and Regression Module\n* Description        : Machine Learning (Supervised Learning/ Classification/ Predictive Algorithm) for identifying toxic comments.\n* Solution           : Using TFIDF Vector a

In [360]:
'''
Followed the following paper:
Paper Summray:
Title:    Anatomy of Online Hate: Developing a Taxonomy and Machine Learning 
          Models for Identifying and Classifying Hate in Online News Media
Authors:  Joni Salminen,*†§ Hind Almerekhi,*Milica Milenković, Soon-gyo Jung,*Jisun An,*Haewoon Kwak,*,Bernard J. Jansen*
          Qatar Computing Research Institute, Hamad Bin Khalifa University †
          Turku School of Economics at the University of Turku
          Independent Researcher
'''

'\nFollowed the following paper:\nPaper Summray:\nTitle:    Anatomy of Online Hate: Developing a Taxonomy and Machine Learning \n          Models for Identifying and Classifying Hate in Online News Media\nAuthors:  Joni Salminen,*†§ Hind Almerekhi,*Milica Milenković, Soon-gyo Jung,*Jisun An,*Haewoon Kwak,*,Bernard J. Jansen*\n          Qatar Computing Research Institute, Hamad Bin Khalifa University †\n          Turku School of Economics at the University of Turku\n          Independent Researcher\n'

In [361]:
import numpy as np
import pandas as pd
from collections import defaultdict
import string
from string import punctuation
import re
from tokenize import tokenize
import nltk, re, time
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf



In [362]:
class File:
    def __init__(self, train_X_file, train_Y_file, test_X_file, test_Y_file):
        if train_X_file:
            self.train_X = pd.read_csv(train_X_file)
        if train_Y_file:
            self.train_Y = pd.read_csv(train_Y_file)
        else:
            self.train_Y = None
        if test_X_file:
            self.test_X = pd.read_csv(test_X_file)
        if test_Y_file:
            self.test_Y = pd.read_csv(test_Y_file)
            
    def get_content(self):
        return self.train_X, self.train_Y, self.test_X, self.test_Y

In [373]:
class Data:
    def __init__(self, train_X, train_Y, test_X, test_Y, train_test, train_XY, test_XY, targets):
        self.train_X = train_X
        self.test_X = test_X
     
        self.train_Y = pd.DataFrame()
        self.test_Y = pd.DataFrame()
        
        self.feature_cols = None
        
        if train_XY:
            for col in targets:
                if self.train_Y.empty:
                    self.train_Y = pd.DataFrame(data=train_X[col], columns=[col])
                else:
                    self.train_Y = pd.concat([self.train_Y, train_X[[col]]], axis=1)
                    
                self.train_X.drop(columns=col, inplace=True)
        else:
            self.train_Y = train_Y
            
        if test_XY:
            for col in targets:
                if self.test_Y.empty:
                    self.test_Y = pd.DataFrame(data=test_X[col], columns=[col])
                else:
                    self.test_Y = pd.concat([self.test_Y, test_X[[col]]], axis=1)
                    
                    
                self.test_X.drop(columns=col, inplace=True)
        else:
            self.test_Y = test_Y

    def set_feature_cols(self, feature_cols):
        if self.feature_cols:
            self.feature_cols.append(feature_cols)
        else: self.feature_cols = feature_cols
    def remove_feature_cols(self, feature_cols):
        if self.feature_cols:
            self.feature_cols.remove(feature_cols)
        else: self.feature_cols = None
    
    def fill_columns_selected(self, columns_names, columns_val, inplace=True):
        for cols in columns_names:
            self.train_X[cols].fillna(columns_val, inplace=inpl)
            self.test_X[cols].fillna(columns_val, inplace=inpl)
            
    def fill_columns(self, val, inpl=True):
        self.train_X.fillna(val, inplace=inpl)
        self.test_X.fillna(val, inplace=inpl)
    
    def get_data(self):
        return self.train_X, self.train_Y, self.test_X, self.test_Y
            
            

In [374]:
class Split:
    def __init__(self, splitter_name, n_splits, test_size, random_state):
        if splitter_name == 'KFold':
            self.splitter = KFold(n_splits, random_state, shuffle)
        elif splitter_name == 'StratifiedShuffleSplit':
            self.splitter = StratifiedShuffleSplit(n_splits, test_size, random_state)



In [375]:
class Model:
    def __init__(self, model_type):
        if model_type == 'Classification':
            self.models = {
                "AdaBoostClassifier": AdaBoostClassifier(),
                "BernoulliNB": BernoulliNB(),
            #     "BernoulliRBM": BernoulliRBM(),
                "DecisionTreeClassifier": DecisionTreeClassifier(),
                "ExtraTreesClassifier": ExtraTreesClassifier(),
            #     "GaussianMixture": GaussianMixture(),
            #     "GaussianNB": GaussianNB(),
            #     "GaussianProcessClassifier": GaussianProcessClassifier(),
                "GradientBoostingClassifier": GradientBoostingClassifier(),
            #     "KDTree": KDTree(),
            #     "KNeighborsClassifier": KNeighborsClassifier(3),
                "LogisticRegression": LogisticRegression(),
                "LinearSVC": LinearSVC(),
                "MLPClassifier": MLPClassifier(),
                "MultinomialNB": MultinomialNB(),
            #     "NearestNeighbors": NearestNeighbors(),
            #     "NuSVC": NuSVC(),
                "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
                "RandomForestClassifier": RandomForestClassifier(),
                "SVC Linear": SVC(kernel="linear", C=0.025),
                "SVC": SVC(),
                "SVC Gamma": SVC(gamma=2, C=1)
            #     VotingClassifier: VotingClassifier(),
            }
        elif model_type == 'Regression':
            self.models  = {
                "AdaBoostRegressor": AdaBoostRegressor(),
            #     "ARDRegression": ARDRegression(),
                "BaggingRegressor": BaggingRegressor(),
            #     "BernoulliRBM": BernoulliRBM(),
                "DecisionTreeRegressor": DecisionTreeRegressor(),
                "ExtraTreesRegressor": ExtraTreesRegressor(),
                "ExtraTreeRegressor": ExtraTreeRegressor(),
            #     "GaussianMixture": GaussianMixture(),
            #     "GaussianNB": GaussianNB(),
                "GaussianProcessRegressor": GaussianProcessRegressor(),
                "GradientBoostingRegressor": GradientBoostingRegressor(),
                "HuberRegressor": HuberRegressor(),
            #     "IsotonicRegression": IsotonicRegression(),
                "KernelRidge": KernelRidge(),
            #     "KDTree": KDTree(),
            #     "KNeighborsRegressor": KNeighborsRegressor(),
            #     "LinearRegression": LinearRegression(), 
                "LogisticRegression": LogisticRegression(),
                "LogisticRegressionCV": LogisticRegressionCV(),
            #     "logistic_regression_path": logistic_regression_path(),
                "LinearSVR": LinearSVR(),
                "MLPRegressor": MLPRegressor(),
            #     "MultinomialNB": MultinomialNB(),
                "NuSVR": NuSVR(),
                "PassiveAggressiveRegressor": PassiveAggressiveRegressor(),
            #     "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
                "RadiusNeighborsRegressor": RadiusNeighborsRegressor(),
                "RandomForestRegressor": RandomForestRegressor(),
                "RandomizedLogisticRegression": RandomizedLogisticRegression(),
                "RANSACRegressor": RANSACRegressor(),
                "SGDRegressor": SGDRegressor(),
                "SVR": SVR(),
                "TheilSenRegressor": TheilSenRegressor(),
            }
            


In [376]:
train_X_file =r'C:\Users\Anjana Tiha\Drive D\Programming\Projects\Toxic Comment Classification Challenge\Toxic Comment Classification Challenge\all\train.csv'
test_X_file = r'C:\Users\Anjana Tiha\Drive D\Programming\Projects\Toxic Comment Classification Challenge\Toxic Comment Classification Challenge\all\test.csv'
test_Y_file = r'C:\Users\Anjana Tiha\Drive D\Programming\Projects\Toxic Comment Classification Challenge\Toxic Comment Classification Challenge\all\test_labels.csv'

file_obj = File(train_X_file, None, test_X_file, test_Y_file)

train_X, train_Y, test_X, test_Y = file_obj.get_content()


targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

data_obj = Data(train_X, None, test_X, test_Y, False, True, False, targets)

data_obj.train_Y['none'] = 1-data_obj.train_Y[targets].max(axis=1)
data_obj.test_Y['none'] = 1-data_obj.test_Y[targets].max(axis=1)
data_obj.train_Y['any'] = data_obj.train_Y[targets].max(axis=1)
data_obj.test_Y['any'] = data_obj.test_Y[targets].max(axis=1)
data_obj.fill_columns("unknown", True)

train_X, train_Y, test_X, test_Y = data_obj.get_data()

targets = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'none', 'any']

In [384]:
class Preprocess:
    def clean_text(text, alpha=True, punc=False, case_active=False, remove_stopwords=True):
        if alpha:
            text  = re.sub(r"[^a-z]", " ", text)
        if case_active==False:
            text  = text .lower()
        if punc==False:
            text  = ''.join([c for c in text if c not in punctuation])
        if remove_stopwords:
            cached_stopwords = stopwords.words("english")
            self.text = ' '.join([word for word in text.split() if word not in cached_stopwords])
        text = re.sub(r" +", " ", text)
        text = text.strip()
        text = text .split()
        return text   

    def tokenize(text, alpha=True, punc=False, case_active=False, remove_stopwords=True):
        if alpha:
            text = re.sub(r"[^a-z]", " ", text)
        if case_active==False:
            text = text.lower()
        if punc==False:
            text = ''.join([c for c in text if c not in punctuation])
        if remove_stopwords:
            cached_stopwords = stopwords.words("english")
            text = ' '.join([word for word in text.split() if word not in cached_stopwords])
        text = re.sub(r" +", " ", text)
        text = text.strip()
        text = text.split()
        return text 


    def single_char_cnt(text, alpha=False, punc=False, remove_stopwords=True):
        text = text
        if alpha:
            text = re.sub(r"[^a-z]", " ", text)
        if punc==False:
            text = ''.join([c for c in text if c not in punctuation])
        if remove_stopwords:
            cached_stopwords = stopwords.words("english")
            text = ' '.join([word for word in text.split() if word not in cached_stopwords])

        text = re.sub(r" +", " ", text)
        text = text.strip()
        text = text.split()

        c=0
        for tok in text:
            if len(tok.strip())==1: c+=1
        return  c

    def find_urls(text):
        return re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', text)


    def count_modals(text):
        modals = ['can', 'could', 'may', 'might', 'must', 'will', 'would', 'should']

        toks = text.split(' ')

        c=0
        for tok in toks:
            if tok in modals: c+=1
        return c

    def non_alpha_mid(text, alpha=False, punc=False, remove_stopwords=True):
        text = re.sub(r" +", " ", text)
        if punc==False:
            text = ''.join([c for c in text if c not in punctuation])
        if remove_stopwords:
            cached_stopwords = stopwords.words("english")
            text = ''.join([word for word in text if word not in cached_stopwords])

        text = text.split()

        c=0
        for tok in text:
            m=0
            for ch in tok:
                if ch.isalpha()==0: m+=1
            if (m>1 and len(tok)>1) or (m>=1): c+=1
        return c

In [402]:
class Features:    
    def __init__(self, train_X, test_X, columns_text):
        self.train_X = train_X
        self.test_X = test_X
        self.columns_text = columns_text
        
        self.train_X_features = []
        self.test_X_features = []
        
    def get_features_X(self, X, features):
        count = lambda l1, l2: len(list(filter(lambda c: c in l2, l1)))
        prep_obj = Preprocess()
        print(X)
        try:
            for col in self.columns_text:
                length=X[col].size
                
                i=0
                features_col = []
                for line in range(0, length-1):
                    counts = []
                    text = X[col][line]
                    
                    counts.append(text.count("!"))
                    
                    counts.append(text.count("?"))
                    counts.append(text.count("."))
                    counts.append(count(text,set(string.punctuation)))
                    counts.append(len(re.findall('[''""]', text)))
                    
                    counts.append(single_char_cnt(text, alpha=False, punc=False, remove_stopwords=False))
                    counts.append(len(find_urls(text)))
                    counts.append(len(text))
                    counts.append(len(text.split()))
                    counts.append(sum(1 for c in text if c.isupper()))
                    counts.append(count_modals(text))
                    counts.append(len(re.findall(r'[\U0001f600-\U0001f650]', text)))
                    counts.append(non_alpha_mid(text, alpha=False, punc=False, remove_stopwords=False))
                    print(count)
                    features_col.append(counts)
                features.append(features_col)
        except:
            print("Error:", line)
        
        features = np.array(features)
                
        return features
            
    def get_features(self):
        self.train_X_features = get_features_X(self.train_X, self.columns_text)
        self.test_X_features = get_features_X(self.test_X, self.columns_text)
        
        return self.train_X_features, self.test_X_features 
    

In [403]:
text_columns = ['comment_text']
features_obj = Features(train_X, test_X, text_columns)
train_X_features, test_X_features = features_obj.get_features()

NameError: name 'get_features_X' is not defined

In [355]:
len(features)

0

In [295]:
# tfidf = TfidfVectorizer(tokenizer=clean_text, min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1, smooth_idf=1 )

# # tfidf = TfidfVectorizer(tokenizer=clean_text, min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1, smooth_idf=1 )
# # tfidf = TfidfVectorizer(decode_error='strict', strip_accents='unicode', lowercase=True, preprocessor=preprocess, tokenizer=tokenize, analyzer='word', stop_words='english', ngram_range=(1, 1), max_df=1.0, min_df=1, norm='l2', use_idf=True, smooth_idf=True)

# # tfidf = TfidfVectorizer(decode_error='strict', strip_accents='unicode', lowercase=True, preprocessor=preprocess, tokenizer=tokenize, analyzer='word', stop_words='english', max_df=0.9, min_df=3, norm='l2', use_idf=True, smooth_idf=True)

# x = tfidf.fit_transform(train['comment_text'])
# test_x = tfidf.transform(test['comment_text'])



In [9]:
# # # print(data.isnull().sum(axis=1))
# # X = features.values
# # Y = target
# # X = x+test_x
# X = x
# # Y = train['comment_text'] + test['comment_text']
# # Y = train['comment_text']

# # # print(features.info())

In [10]:
from sklearn import metrics

def model_evaluation(X, Y, X_test, y_test, splitter, model, report, details):
    accuracy = 0
    f1 = 0
    precision = 0
    recall = 0
    i=0
    if report:
        print("*"*50, " START ", "*"*50)
        print("Model Description:")
        print(model)
        print("-"*100,"\n")
      
    if splitter:
        for train_index, test_index in splitter.split(X, Y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = Y[train_index], Y[test_index]

            # model fitting
            model.fit(X_train, y_train)

            # prediction
            predict = model.predict(X_test)

            accuracy_temp = metrics.accuracy_score(y_test, predict)
            precision_temp = metrics.precision_score(y_test, predict, average="micro")
            recall_temp = metrics.recall_score(y_test, predict, average="micro")
            f1_temp = metrics.f1_score(y_test, predict, average="micro")
            hamming_loss = metrics.hamming_loss(y_test, predict)

            accuracy = accuracy + accuracy_temp
            precision = precision + precision_temp
            recall = recall+ recall_temp
            f1= f1 + f1_temp


    #         # evaluation scores
    #         explained_variance_score_temp = explained_variance_score(y_test, predict)
    #         mean_absolute_error_temp = mean_absolute_error(y_test, predict)
    #         mean_squared_error_temp = mean_squared_error(y_test, predict)
    #         mean_squared_log_error_temp = mean_squared_log_error(y_test, predict)
    #         median_absolute_error_temp = median_absolute_error(y_test, predict)
    #         r2_score_temp = r2_score(y_test, predict)

    #         explained_variance_score_val = explained_variance_score_val + explained_variance_score_temp
    #         mean_absolute_error_val = mean_absolute_error_val + mean_absolute_error_temp
    #         mean_squared_error_val = mean_squared_error_val + mean_squared_error_temp
    #         mean_squared_log_error_val = mean_squared_log_error_val + mean_squared_log_error_temp
    #         median_absolute_error_val = median_absolute_error_val + median_absolute_error_temp
    #         r2_score_val = r2_score_val + r2_score_temp

            if details:
                print("*"*25,  " ITERATION - ", i+1, "*"*25)
                #print("TRAIN:", train_index, "TEST:", test_index)
                print("accuracy_score", metrics.accuracy_score(y_test, predict))
                print("precision_score", metrics.precision_score(y_test, predict, average="micro"))
                print("recall_score", metrics.recall_score(y_test, predict, average="micro"))
                print("f1_score", metrics.f1_score(y_test, predict, average="micro"))
                print("hamming_loss", metrics.hamming_loss(y_test, predict))
            #     precision, recall, thresholds = metrics.precision_recall_curve(y_test, predict)
            #     print("average_precision_score", metrics.average_precision_score(y_test, predict, average="micro"))
            #     print("fbeta_score", metrics.fbeta_score(y_test, predict))
            #     print("roc_auc_score", metrics.roc_auc_score(y_test, predict, average="micro"))

    #             print("-"*35)
    #             print(metrics.classification_report(y_test, predict))
    #             print("-"*35)
    #             print("confusion Matrix:\n\n", metrics.confusion_matrix(y_test, predict))
    #             print("-"*35)
                print("\n")

            i+=1


        split_num = splitter.get_n_splits()
        accuracy = accuracy/split_num
        precision = precision/split_num
        recall = recall/split_num
        f1 = f1/split_num

    else:
        model.fit(X, Y)

        # prediction
        predict = model.predict(X_test)

        accuracy = metrics.accuracy_score(y_test, predict)
        precision = metrics.precision_score(y_test, predict, average="micro")
        recall = metrics.recall_score(y_test, predict, average="micro")
        f1 = metrics.f1_score(y_test, predict, average="micro")
        hamming_loss = metrics.hamming_loss(y_test, predict)
    if report:
        if splitter:
            print("*"*50, " Average For", i+1, " Folds", "*"*50)
        print("\n")
        print("Average Accuracy Score: ", accuracy)
        print("Average pPrecision Score: ", precision)
        print("Average Recall Score: ", recall)
        print("Average F1 Score:", f1)

#         print('%50s%s' % ("Average explained_variance_score: ", explained_variance_score_val))
#         print('%50s%s' % ("Average mean_absolute_error: ", mean_absolute_error_val))
#         print('%50s%s' % ("Average mean_squared_error: ", mean_squared_error_val))
#         print('%50s%s' % ("Average mean_squared_log_error: ", mean_squared_log_error_val))
#         print('%50s%s' % ("Average median_absolute_error: ", median_absolute_error_val))
#         print('%50s%s' % ("Average r2_score: ", r2_score_val))
#         print("\n")
#         print("*"*100)
# #         print("*"*50, " END ", "*"*50)
    
#     return explained_variance_score_val, mean_absolute_error_val, mean_squared_error_val, mean_squared_log_error_val, median_absolute_error_val, r2_score_val
    return accuracy, precision, recall, f1

In [11]:
print(len(train))

features = feature_word_string(train)

159571
Error: 159571


In [12]:
print(len(features))
features = np.array(features, dtype='int')

159571


In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split


# classifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomTreesEmbedding, RandomForestClassifier, VotingClassifier)
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.linear_model import LogisticRegression
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB 
from sklearn.neighbors import KDTree, KNeighborsClassifier, NearestNeighbors
from sklearn.neural_network import BernoulliRBM, MLPClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# regressor
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomTreesEmbedding, RandomForestRegressor, VotingClassifier)
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.isotonic import IsotonicRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ARDRegression, LinearRegression, LogisticRegression, LogisticRegressionCV, logistic_regression_path, HuberRegressor, PassiveAggressiveRegressor, RandomizedLogisticRegression, RANSACRegressor, SGDRegressor, TheilSenRegressor
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB 
from sklearn.neighbors import KDTree, KNeighborsRegressor, NearestNeighbors, RadiusNeighborsRegressor
from sklearn.neural_network import BernoulliRBM, MLPRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor


import gc


sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)


# kf = KFold(n_splits = 5, random_state=None, shuffle =True)

classifiers = {
    "AdaBoostClassifier": AdaBoostClassifier(),
    "BernoulliNB": BernoulliNB(),
#     "BernoulliRBM": BernoulliRBM(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "ExtraTreesClassifier": ExtraTreesClassifier(),
#     "GaussianMixture": GaussianMixture(),
#     "GaussianNB": GaussianNB(),
#     "GaussianProcessClassifier": GaussianProcessClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
#     "KDTree": KDTree(),
#     "KNeighborsClassifier": KNeighborsClassifier(3),
    "LogisticRegression": LogisticRegression(),
    "LinearSVC": LinearSVC(),
    "MLPClassifier": MLPClassifier(),
    "MultinomialNB": MultinomialNB(),
#     "NearestNeighbors": NearestNeighbors(),
#     "NuSVC": NuSVC(),
    "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC Linear": SVC(kernel="linear", C=0.025),
    "SVC": SVC(),
    "SVC Gamma": SVC(gamma=2, C=1)
#     VotingClassifier: VotingClassifier(),
}

classifiers2 = {
#     "AdaBoostClassifier": AdaBoostClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "GaussianNB": GaussianNB(),
#     "GradientBoostingClassifier": GradientBoostingClassifier(),
#     "KNeighborsClassifier": KNeighborsClassifier(3),
    "LogisticRegression": LogisticRegression(),
    "LinearSVC": LinearSVC(),
    "MultinomialNB": MultinomialNB(),
#     "NuSVC": NuSVC(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC Linear": SVC(kernel="linear", C=0.025)
#     "SVC": SVC(),
#     "SVC Gamma": SVC(gamma=2, C=1)
}

classifiers3 = {
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "GaussianNB": GaussianNB(),
    "LogisticRegression": LogisticRegression(),
    "LinearSVC": LinearSVC(),
    "MultinomialNB": MultinomialNB(),
    "RandomForestClassifier": RandomForestClassifier(),
    "SVC Linear": SVC(kernel="linear", C=0.025)
}


regressors = {
    "AdaBoostRegressor": AdaBoostRegressor(),
#     "ARDRegression": ARDRegression(),
    "BaggingRegressor": BaggingRegressor(),
#     "BernoulliRBM": BernoulliRBM(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "ExtraTreesRegressor": ExtraTreesRegressor(),
    "ExtraTreeRegressor": ExtraTreeRegressor(),
#     "GaussianMixture": GaussianMixture(),
#     "GaussianNB": GaussianNB(),
    "GaussianProcessRegressor": GaussianProcessRegressor(),
    "GradientBoostingRegressor": GradientBoostingRegressor(),
    "HuberRegressor": HuberRegressor(),
#     "IsotonicRegression": IsotonicRegression(),
    "KernelRidge": KernelRidge(),
#     "KDTree": KDTree(),
#     "KNeighborsRegressor": KNeighborsRegressor(),
#     "LinearRegression": LinearRegression(), 
    "LogisticRegression": LogisticRegression(),
    "LogisticRegressionCV": LogisticRegressionCV(),
#     "logistic_regression_path": logistic_regression_path(),
    "LinearSVR": LinearSVR(),
    "MLPRegressor": MLPRegressor(),
#     "MultinomialNB": MultinomialNB(),
    "NuSVR": NuSVR(),
    "PassiveAggressiveRegressor": PassiveAggressiveRegressor(),
#     "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
    "RadiusNeighborsRegressor": RadiusNeighborsRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "RandomizedLogisticRegression": RandomizedLogisticRegression(),
    "RANSACRegressor": RANSACRegressor(),
    "SGDRegressor": SGDRegressor(),
    "SVR": SVR(),
    "TheilSenRegressor": TheilSenRegressor(),
}


    
# splitter = kf 
splitter = sss
# splitter = None
report = 1
details = 1
evaluation = {}

print("Spliter Description:")
print(splitter)
     
columns2 =['any'] 

print(len(train))

X_features = features

for comment_type in columns2:
    print("comment Type: ", comment_type, "\n\n")
    for name in classifiers:
        evaluation_temp = []
        accuracy, precision, recall, f1 = model_evaluation(X_features, train[comment_type], None, None, splitter, classifiers[name], report, details=None)
#         accuracy, precision, recall, f1 = model_evaluation(X, train[comment_type], test_x, test_labels[comment_type], splitter, classifiers[name], report, details=None)
        evaluation_temp.append(accuracy)
        evaluation_temp.append(precision)
        evaluation_temp.append(recall)
        evaluation_temp.append(f1)
        evaluation[name] = evaluation_temp
        gc.collect()
    rows_list = []
    for name in evaluation:
        rows_list.append([name]+evaluation[name])
                           
    evaluation_pd = pd.DataFrame(rows_list, columns=['model', 'accuracy', 'precision', 'recall', 'f1']) 
#     print(evaluation_pd)


            
# for name in regressors:
#     evaluation_temp = []
    
#     explained_variance_score_val, mean_absolute_error_val, mean_squared_error_val, mean_squared_log_error_val, median_absolute_error_val, r2_score_val = model_evaluation(X, Y, splitter, regressors[name], report, details=None)
#     evaluation_temp.append(explained_variance_score_val)
#     evaluation_temp.append(mean_absolute_error_val)
#     evaluation_temp.append(mean_squared_error_val)
#     evaluation_temp.append(mean_squared_log_error_val)
#     evaluation_temp.append(median_absolute_error_val)
#     evaluation_temp.append(r2_score_val)
#     evaluation[name] = evaluation_temp
    

# rows_list = []
# for name in evaluation:
#     rows_list.append([name]+evaluation[name])
                           
# evaluation_pd = pd.DataFrame(rows_list, columns=['explained_variance_score',  'mean_absolute_error', 'mean_squared_error', 'mean_squared_log_error', 'median_absolute_error', 'r2_score']) 
# evaluation_pd = pd.DataFrame(rows_list, columns=['model', 'accuracy', 'precision', 'recall', 'f1']) 
# evaluation_pd


Spliter Description:
StratifiedShuffleSplit(n_splits=5, random_state=0, test_size=0.5,
            train_size=None)
159571
comment Type:  any 


**************************************************  START  **************************************************
Model Description:
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)
---------------------------------------------------------------------------------------------------- 





**************************************************  Average For 6  Folds **************************************************


Average Accuracy Score:  0.9012759130674555
Average pPrecision Score:  0.9012759130674555
Average Recall Score:  0.9012759130674555
Average F1 Score: 0.9012759130674555
**************************************************  START  **************************************************
Model Description:
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
---------------------------------------------------------------------------------------------------- 

**************************************************  Average For 6  Folds **************************************************


Average Accuracy Score:  0.8851201965257063
Average pPrecision Score:  0.8851201965257063
Average Recall Score:  0.8851201965257063
Average F1 Score: 0.8851201965257063
**************************************************  START  ************************************************



**************************************************  Average For 6  Folds **************************************************


Average Accuracy Score:  0.8883789135938637
Average pPrecision Score:  0.8883789135938637
Average Recall Score:  0.8883789135938637
Average F1 Score: 0.8883789135938637
**************************************************  START  **************************************************
Model Description:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
---------------------------------------------------------------------------------------------------- 

**************************************************  

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import figure


figure(num=None, figsize=(14, 6), dpi=250)

labels= ['accuracy', 'precision', 'recall', 'f1']
ax = plt.subplot(111)

for n in range(0,4):
    plt.plot([name for name in evaluation],[evaluation[name][n] for name in evaluation], label = labels[n])

leg = plt.legend(loc='best', ncol=2, mode="expand", shadow=True, fancybox=True)
plt.xticks(rotation=45)
# leg.get_frame().set_alpha(0.5)
plt.legend()
ax.tick_params(labelsize='large', width=5)
ax.grid(True, linestyle='-.')

plt.tight_layout()
plt.xlabel('x label')
plt.ylabel('y label')

plt.title("TITLE")
plt.show()


In [None]:
len(train)