In [1]:
import collections
from collections import Counter

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
from imblearn.over_sampling import SMOTE
import json
import math
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import nltk.stem as ns
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

import numpy as np
import os
import pickle
import pandas as pd
from pprint import pprint
import re
import seaborn as sns
# !pip install spacy
import spacy
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm
from textblob import TextBlob
from wordcloud import WordCloud, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# !pip list


### Read data


In [None]:
def remove_punc(text):
    """Return ``text`` with ASCII and common Chinese punctuation characters removed."""
    punctuation_zh = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~“”？，！【】（）、。：；’‘……￥·"""
    # The three-argument form of maketrans marks every listed character for deletion.
    deletion_table = str.maketrans('', '', punctuation + punctuation_zh)
    return text.translate(deletion_table)

In [None]:

# Parse the tab-separated SMS Spam Collection file into one record per message.
lst_dics = []
with open ("SMSSpamCollection.SMSSpamCollection", mode='r', errors='ignore', encoding='utf8') as f:
    # Iterate over the file lazily instead of materialising every line with readlines().
    for line in f:
        lint_split = line.strip().split('\t')
        if len(lint_split) < 2:
            # Blank or malformed line (no tab-separated text field): skip it
            # instead of crashing with an IndexError.
            continue
        label = lint_split[0]
        # Punctuation is stripped here, so "text(Original)" never contains '.', ',' etc.
        text = remove_punc(lint_split[1])
        lst_dics.append( {"label": label, "text(Original)": text} )

# One row per message with the columns "label" and "text(Original)".
dtf = pd.DataFrame(lst_dics)

dtf

In [None]:
# Horizontal bar chart of the number of messages per label, with the exact
# count written next to each bar.
fig, ax = plt.subplots() 
fig.suptitle("The Counts of Both Kinds (total: %s)" % dtf["label"].count(), fontsize=12) 
dtf["label"].reset_index().groupby("label").count().sort_values(by= 
       "index").plot(kind= "barh", legend=False, ax=ax).grid(axis='x') 

# Per-label totals in the fixed order [ham, spam].
count = []
count.append( dtf[ dtf["label"].isin(['ham']) ]["label"].count() )
count.append( dtf[ dtf["label"].isin(['spam']) ]["label"].count() )

# Write each total 200 units left of the bar end; the y coordinate is derived
# from the count itself.
# NOTE(review): the y formula looks tuned to this dataset's counts - confirm
# the labels still land on the right bars if the data changes.
for i in range(2):
    plt.text(count[i]-200,count[i]/4000 + 0.11,"%s"% count[i],va='center')
    
ax.set(ylabel="Label", xlabel="The Number of The Messages")
plt.show()

### Preprocessing

In [None]:
# English stop words from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Map POS tags produced by pos_tag() to WordNet POS constants
def get_wordnet_pos(tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Parameters
    ----------
    tag : str
        POS tag; only its first letter is inspected.

    Returns
    -------
    The WordNet constant for adjectives ('J...'), verbs ('V...'), nouns
    ('N...') or adverbs ('R...'), or ``None`` for any other tag.
    """
    if tag.startswith('J'):
        # Adjective tags must map to ADJ; they were previously mapped to NOUN,
        # so adjectives were lemmatized as if they were nouns.
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None
    
# Shared WordNet lemmatizer instance used by lemma().
lemmatizer = ns.WordNetLemmatizer()

def lemma(sentences):
    """Lemmatize every sentence and return the results as space-joined strings.

    Each sentence is tokenized, tokens containing a digit are discarded, the
    remaining tokens are POS-tagged, and every token is lemmatized with the
    WordNet lemmatizer (falling back to the noun form when the tag has no
    WordNet equivalent).

    Parameters
    ----------
    sentences : iterable of str

    Returns
    -------
    list of str
        One lemmatized, space-joined string per input sentence.
    """
    new_sentences = []
    for sentence in sentences:
        # Build a filtered list instead of calling tokens.remove() while
        # iterating over tokens: removing during iteration skipped the token
        # that followed each removed one, so consecutive numbers survived.
        tokens = [tok for tok in word_tokenize(sentence) if not re.search(r'\d', tok)]
        lemmas_sent = []
        for word, tag in pos_tag(tokens):  # (token, POS tag) pairs
            wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
            lemmas_sent.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
        new_sentences.append(' '.join(lemmas_sent))
    return new_sentences

# Drop English stop words from every message, lower-case what is left,
# then lemmatize the result into the "text(Preprocessing)" column.
text = [
    ' '.join([tok for tok in word_tokenize(message) if tok.lower() not in stop_words]).lower()
    for message in dtf["text(Original)"]
]

textLemma = lemma(text)

dtf["text(Preprocessing)"] = textLemma
dtf.head()

### Simple text features

In [None]:
def norm(dtf_input):
    """Min-max scale a pandas Series into the range [0, 1].

    Parameters
    ----------
    dtf_input : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(x - min) / (max - min)`` for every element, on the same index.
        A constant series (``max == min``) maps to all zeros instead of the
        NaN values a 0/0 division would produce.
    """
    # Local names deliberately avoid shadowing the builtins min() and max().
    lowest = np.min(dtf_input)
    highest = np.max(dtf_input)
    span = highest - lowest
    if span == 0:
        # Every value equals the minimum: the scaled value is 0 everywhere.
        return (dtf_input - lowest).astype(float)
    # Vectorised arithmetic replaces the per-element apply(lambda ...).
    return (dtf_input - lowest) / span

# Hand-crafted surface features of the original text, each min-max scaled by norm().
original_text = dtf["text(Original)"]

# `df` keeps the raw (unscaled) counts; they are needed for the average word length.
df = pd.DataFrame(lst_dics)
df['word_count'] = original_text.apply(lambda x: len(str(x).split(" ")))
df['char_count'] = original_text.apply(lambda x: sum(len(word) for word in str(x).split(" ")))

dtf['word_count'] = norm(df['word_count'])
dtf['char_count'] = norm(df['char_count'])
dtf['avg_word_length'] = norm(df['char_count'] / df['word_count'])

# Remaining features: column name -> per-message extractor. Insertion order
# fixes the order in which the columns are appended to dtf.
# Note: remove_punc() already stripped '.', so the ".uk" and ".com" tests in
# the url extractor cannot match on this column.
feature_extractors = {
    'pound_count': lambda x: len(str(x).split("£"))-1,
    'capital_count': lambda x: sum(char.isupper() for char in str(x)),
    'digit_count': lambda x: sum(char.isdigit() for char in str(x)),
    'phone_count': lambda x: len(re.compile(r"(?:^|[^\d])(0\d{10})(?:$|[^\d])").findall(x)),
    'url': lambda x: 1 if "www" in x or "http" in x or ".uk" in x or ".com" in x else 0,
}
for column_name, extractor in feature_extractors.items():
    dtf[column_name] = norm(original_text.apply(extractor))

dtf

### Histogram & KDE

In [None]:
# Feature to plot (x) and the grouping column (y); swap x to inspect another feature.
x, y = "word_count", "label"
# x, y = "tags_ORDINAL", "label"
# x, y = "tags_ORDINAL", "label"
# ["call later", "please call", "gon na", "try contact", "get", "go", "im", "come", "free", "txt", "mobile"]

fig, ax = plt.subplots(nrows=1, ncols=2) 
fig.suptitle(x, fontsize=15) 
for i in dtf[y].unique(): 
    # Histogram (left axis) and kernel density estimate (right axis) for this label
    # NOTE(review): seaborn has deprecated distplot in favour of histplot/kdeplot -
    # confirm against the installed seaborn version.
    sns.distplot(dtf[dtf[y]==i][x], hist=True, kde=False, 
                 hist_kws={"alpha":0.8}, 
                 axlabel="Histogram", ax=ax[0])
    sns.distplot(dtf[dtf[y]==i][x], hist=False, kde=True, 
                 kde_kws={"shade":True}, axlabel="Kernel Density Estimation",   
                 ax=ax[1])

    
ax[0].grid(True) 
ax [0].legend(dtf[y].unique()) 
ax[0].set(ylabel="SMS Message Counts", xlabel="SMS Message "+ x +"'s Value\n\nHistogram")
ax[1].grid(True) 
ax [1].legend(dtf[y].unique()) 
ax[1].set(ylabel="Density", xlabel="SMS Message "+ x +"'s Value\n\nKDE")

plt.tight_layout() 
plt.subplots_adjust(top=0.85)
plt.show()

# The horizontal axis of both plots is the (normalised) value of feature x.
# The histogram's vertical axis is the number of messages whose x falls in each bin.
# The KDE's vertical axis is a density, not a probability: probabilities are the
# area under the curve (in a histogram, bar width times bar height).

### Sentiment feature

In [None]:
# Sentiment polarity of each preprocessed message from TextBlob, min-max scaled by norm().
dtf["sentiment"] = norm(dtf["text(Preprocessing)"].apply(lambda x: 
                   TextBlob(x).sentiment.polarity))
dtf
# print(dtf["text"].iloc[0], " --> ", dtf["sentiment"].iloc[0])

In [None]:
# Side-by-side view of the label, both text versions and the sentiment score.
dtf[[ "label", "text(Original)", "text(Preprocessing)", "sentiment"]]

### Named-Entity Recognition

In [None]:
## Load the spaCy pipeline used for named-entity recognition
ner = spacy.load("en_core_web_lg")
## Tag the first message
txt = dtf["text(Original)"].iloc[0]
doc = ner(txt)
## Render the recognised entities
spacy.displacy.render(doc, style="ent")

In [None]:
## Run NER on every message and store the (entity text, entity label) pairs as a list
dtf["tags"] = dtf["text(Original)"].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in ner(x).ents] )
## Utility: count the occurrences of each element of a list
def utils_lst_count(lst):
    """Count the elements of ``lst`` and return ``[{element: count}, ...]``.

    The result is ordered by descending count; elements with equal counts
    keep the order in which they first appear in ``lst``.
    """
    # most_common() applies the same stable, descending sort on the counts.
    ranked = collections.Counter(lst).most_common()
    return [{element: occurrences} for element, occurrences in ranked]

## Collapse each message's entity list into [{(text, label): count}, ...]
dtf["tags"] = dtf["tags"].apply(lambda x: utils_lst_count(x))

## Utility: number of entities of one label in a message (used to build one column per label)
def utils_ner_features(lst_dics_tuples, tag):
    """Return how many recognised entities carry the label ``tag``.

    Parameters
    ----------
    lst_dics_tuples : list of dict
        Each dict maps an ``(entity_text, entity_label)`` tuple to the number
        of times that entity occurs (the format built by ``utils_lst_count``).
    tag : str
        Entity label to count.

    Returns
    -------
    int
        Sum of the counts of all entities whose label equals ``tag``; 0 when
        the list is empty or contains no such entity.
    """
    # Summing the matching counts directly replaces rebuilding a Counter inside
    # the innermost loop, avoids shadowing the builtins `tuple` and `type`, and
    # no longer fails when the list holds only empty dicts.
    return sum(
        n
        for dic_tuples in lst_dics_tuples
        for entity, n in dic_tuples.items()
        if entity[1] == tag
    )

## Extract features: one normalised count column per entity label found in the corpus
# The labels are sorted so that the new columns are created in the same order on
# every run; the iteration order of a plain set of strings is not stable
# between interpreter sessions.
tags_set = sorted({
    key[1]
    for lst in dtf["tags"].tolist()
    for dic in lst
    for key in dic.keys()
})
for feature in tags_set:
     dtf["tags_"+feature] = norm(dtf["tags"].apply(lambda x: 
                             utils_ner_features(x, feature)))

## Result
dtf.head()

### N-gram frequency 

In [None]:
def frequent(label):
    """Plot the 15 most frequent uni-, bi- and trigrams of one message class.

    Parameters
    ----------
    label : str
        Value of the "label" column selecting the messages to analyse.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(dtf_uni, dtf_bi, dtf_tri)``: the complete frequency tables with the
        columns "Words" and "Freq", most frequent first.
    """
    corpus = dtf[dtf["label"]==label]["text(Preprocessing)"]
    lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))
    fig, ax = plt.subplots(nrows=1, ncols=3,figsize=(10,4))
    fig.suptitle("Most frequent words (%s)\n\n" % label, fontsize=15)

    def freq_table(items):
        # Frequency table of the given items, most common first.
        return pd.DataFrame(nltk.FreqDist(items).most_common(),
                            columns=["Words","Freq"])

    def join_words(table):
        # Turn each n-gram tuple into a single space-separated string.
        table["Words"] = table["Words"].apply(lambda x: " ".join(
                           string for string in x) )
        return table

    def plot_top(table, title, axis):
        # Top 15 rows, reversed so the most frequent bar ends up on top.
        top_rows = table.set_index("Words").iloc[0:15,:].iloc[::-1]
        top_rows.plot(kind="barh", title=title, ax=axis,
                      legend=False).grid(axis='x')
        axis.set(xlabel="Frequency")

    ## unigrams
    dtf_uni = freq_table(lst_tokens)
    plot_top(dtf_uni, "\nUnigrams", ax[0])

    ## bigrams
    dtf_bi = join_words(freq_table(nltk.ngrams(lst_tokens, 2)))
    plot_top(dtf_bi, "Bigrams", ax[1])

    ## trigrams
    dtf_tri = join_words(freq_table(nltk.ngrams(lst_tokens, 3)))
    plot_top(dtf_tri, "Trigrams", ax[2])

    plt.tight_layout()
    plt.show()

    return dtf_uni, dtf_bi, dtf_tri
    
    
    
# Plot and keep the n-gram frequency tables of each class.
dtf_uni_spam, dtf_bi_spam, dtf_tri_spam = frequent("spam")
dtf_uni_ham, dtf_bi_ham, dtf_tri_ham = frequent("ham")

In [None]:
# Hand-picked uni- and bigrams whose per-message counts become features.
lst_words = ["call later", "please call", "gon na", "try contact", "get", "go", "im", "come", "free", "txt", "mobile"]
# "call later", "please call", "gon na", "try contact", "prize", "reply", "ok", "come", "send", "free", "txt", "mobile"
## Count: the n-gram range spans the shortest to the longest entry of lst_words
lst_grams = [len(word.split(" ")) for word in lst_words]
vectorizer = CountVectorizer(
                 vocabulary=lst_words, 
                 ngram_range=(min(lst_grams),max(lst_grams)))
dtf_X = pd.DataFrame(vectorizer.fit_transform(dtf["text(Preprocessing)"]).todense(), columns=lst_words)

# Scale every count column. DataFrame.iteritems() was removed in pandas 2.0;
# items() yields the same (column name, Series) pairs on every pandas version.
for index, row in dtf_X.items():
    dtf_X[index] = norm(row)

## add new features
# NOTE: re-running this cell appends the columns again, because dtf is reassigned.
dtf = pd.concat([dtf, dtf_X.set_index(dtf.index)], axis=1)
dtf.head()

### Topic Modeling with LDA

In [None]:
def sent_to_words(sentences):
    """Lazily yield, for each sentence, the token list from gensim's simple_preprocess."""
    for sentence in sentences:
        as_text = str(sentence)
        # NOTE(review): the original comment said deacc=True removes punctuation;
        # gensim describes deacc as removing accent marks - confirm.
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# Tokenize every preprocessed message; data_words is a list of token lists.
data_words = list(sent_to_words(dtf["text(Preprocessing)"]))

# Show the tokens of the first message as a sanity check.
print(data_words[:1])

In [None]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-merged documents.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

In [None]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenize every document with simple_preprocess and drop the tokens found in ``stop_words``."""
    cleaned = []
    for doc in texts:
        kept = [word for word in simple_preprocess(str(doc)) if word not in stop_words]
        cleaned.append(kept)
    return cleaned

def make_bigrams(texts):
    """Apply the fitted bigram phraser (``bigram_mod``) to every tokenized document."""
    merged = []
    for doc in texts:
        merged.append(bigram_mod[doc])
    return merged

def make_trigrams(texts):
    """Apply the bigram phraser and then the trigram phraser to every tokenized document."""
    merged = []
    for doc in texts:
        with_bigrams = bigram_mod[doc]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS.

    Each token list is joined with spaces, parsed with the module-level ``nlp``
    pipeline, and reduced to the lemmas of the tokens whose ``pos_`` is listed
    in ``allowed_postags``. See https://spacy.io/api/annotation
    """
    def lemmas_of(sent):
        doc = nlp(" ".join(sent))
        return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

    return [lemmas_of(sent) for sent in texts]

In [None]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])
# `nlp` is read as a global by lemmatization(), so it must exist before the call below.
nlp = spacy.load("en_core_web_lg", disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first document as a sanity check.
print(data_lemmatized[:1])

In [None]:
# Create Dictionary (mapping between tokens and integer ids)
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency: each document becomes a bag-of-words via doc2bow
corpus = [id2word.doc2bow(text) for text in texts]

# View the first document's bag-of-words
print(corpus[:1])

In [None]:
# Show the first document's bag-of-words with token strings instead of integer ids
[[(id2word[token_id], freq) for token_id, freq in bow] for bow in corpus[:1]]

In [None]:
# Build LDA model: 20 topics, fixed random_state so repeated runs use the same seed
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [None]:
# Print the keywords of the topics (the model above was built with num_topics=20)
pprint(lda_model.print_topics())
# Apply the model to the whole corpus
doc_lda = lda_model[corpus]

In [None]:
# Compute Perplexity
# NOTE(review): gensim documents log_perplexity() as returning a per-word likelihood
# bound (perplexity = 2^(-bound)), so "lower is better" would apply to the derived
# perplexity rather than to the value printed here - confirm.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score (c_v measure over the lemmatized texts)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# import pyLDAvis.gensim_models
# # Visualize the topics
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
# vis

In [None]:
import os 
# NOTE(review): machine-specific absolute Windows path; it must be edited before
# this cell can run anywhere else.
os.environ.update({'MALLET_HOME':r'E:/jupyter_notebook/mallet-2.0.8/'})

# Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
mallet_path = 'E:/jupyter_notebook/mallet-2.0.8/bin/mallet' # update this path
# NOTE(review): gensim.models.wrappers is reportedly absent from gensim 4.x -
# confirm the installed gensim version provides it.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

In [None]:
# Show Topics of the Mallet model as raw (word, weight) data
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score (same c_v measure as for the gensim LDA model)
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)``; the Mallet binary is located through the
    module-level ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive)
    start : Smallest number of topics to try
    step : Increment between successive topic counts

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument: the model was previously built from the
        # global `id2word`, silently ignoring the dictionary passed by the caller.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# Can take a long time to run: one Mallet model is trained for each of the
# topic counts in range(2, 50, 6).
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=50, step=6)

In [None]:
# Show graph: coherence score against the number of topics.
# These values must match the start/limit/step used for the sweep above.
limit=50; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values, color="darkorange")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.title("Choosing Optimal Model with Coherence Score\n", fontsize=15)
# Pass the label in a list: ("coherence_values") is just a parenthesised string,
# which legend() iterates character by character, labelling the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Print the coherence scores, pairing each topic count in x with its score
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
# Index 3 is the fourth entry of the sweep, i.e. the model with 20 topics for range(2, 50, 6).
# NOTE(review): the index is hard-coded - confirm it still matches the best
# coherence score after re-running the sweep.
optimal_model = model_list[3]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=dtf["text(Preprocessing)"]):
    """Build a table holding the dominant topic of every document.

    Parameters
    ----------
    ldamodel : topic model
        Must support ``ldamodel[corpus]`` (yielding, per document, a list of
        ``(topic_id, proportion)`` pairs) and ``show_topic(topic_id)``.
    corpus : list
        Bag-of-words corpus to score.
    texts : sequence of str
        Document texts, appended as the last column.

    The defaults are evaluated once, when this definition runs, so they refer
    to the objects that existed at that moment.

    Returns
    -------
    pandas.DataFrame
        Columns 'Dominant_Topic', 'Perc_Contribution' (rounded to 4 decimals)
        and 'Topic_Keywords' (comma-separated), followed by the text column.
    """
    # Collect one record per document and build the frame once: growing a
    # DataFrame with append() inside the loop is quadratic, and
    # DataFrame.append was removed in pandas 2.0.
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # No topic assigned to this document: nothing to record (as before).
            continue
        # Dominant topic = highest proportion; max() returns the first of equal
        # maxima, matching the previous stable descending sort.
        topic_num, prop_topic = max(row, key=lambda x: (x[1]))
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        records.append((int(topic_num), round(prop_topic,4), topic_keywords))
    sent_topics_df = pd.DataFrame(records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


# Dominant topic of every message according to the selected Mallet model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=dtf["text(Preprocessing)"])

# Format: expose the row index as a document number and give every column a readable name
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

In [None]:
# Add the dominant topic id, min-max scaled, as a feature.
# NOTE(review): the topic id is a category, yet it is scaled like a numeric value - confirm this is intended.
dtf["topic"] = norm(df_dominant_topic["Dominant_Topic"])
dtf

In [None]:
# Persist the full table, then the feature matrix on its own.
dtf.to_csv("data_all.csv")
# Columns 0-2 are the label and the two text columns; everything after them is
# a feature. drop() returns a new frame, which avoids the SettingWithCopyWarning
# raised by an in-place drop on a slice of dtf.
feature = dtf.iloc[:,3:].drop(columns='tags')
feature.to_csv("feature_all.csv")

###      







In [None]:
import collections
from collections import Counter

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
from imblearn.over_sampling import SMOTE
import json
import math
import matplotlib.pyplot as plt
%matplotlib inline

import nltk
import nltk.stem as ns
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

import numpy as np
import os
import pickle
import pandas as pd
from pprint import pprint
import re
import seaborn as sns
# !pip install spacy
import spacy
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer,CountVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm
# !pip install ray
from textblob import TextBlob
# from tune_sklearn import TuneSearchCV
# from ray.tune.schedulers import MedianStoppingRule
from wordcloud import WordCloud, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
# plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

In [None]:
# Reload the tables written by the feature-engineering part of the notebook.
csv_file1 = "data_all.csv"
csv_data1 = pd.read_csv(csv_file1, low_memory = False)  # low_memory=False prevents the warning from popping up
# 'Unnamed: 0' is the index column that to_csv() wrote; it is not a feature.
dtf = csv_data1.drop(columns='Unnamed: 0')

csv_file2 = "feature_all.csv"
csv_data2 = pd.read_csv(csv_file2, low_memory = False)  # low_memory=False prevents the warning from popping up
feature = csv_data2.drop(columns='Unnamed: 0')

# index = 1
# for item in list(feature.columns):
#     print(item + "  " + str(index))
#     index += 1


In [None]:
# Encode the string labels as integers; the code further down treats 1 as spam.
le = preprocessing.LabelEncoder()
label = le.fit_transform(dtf["label"])
print(label)

### RFECV

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

# Recursive feature elimination with 5-fold stratified cross-validation,
# scored by accuracy, using a stacking ensemble as the estimator.
stacking_estimators = [
    ('svc', svm.SVC(kernel='rbf', C=10)),
    ('rfc', RandomForestClassifier(n_estimators=50, random_state=10)),
    ('lgbm', LGBMClassifier(random_state=0)),
    ('hgb', HistGradientBoostingClassifier(random_state=0))
]

# NOTE(review): RFECV ranks features through the estimator's coef_ or
# feature_importances_; confirm that StackingClassifier exposes one of them
# (or pass importance_getter) in the installed scikit-learn version.
rfecv = RFECV(StackingClassifier(estimators=stacking_estimators, final_estimator=LogisticRegression()), step=1, cv=StratifiedKFold(5),
              scoring='accuracy' )


fit = rfecv.fit(feature, label)

print("Optimal number of features : %d" % rfecv.n_features_)

# grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2;
# cv_results_["mean_test_score"] is its replacement. Fall back to the old
# attribute only on releases that predate cv_results_.
if hasattr(rfecv, "cv_results_"):
    mean_cv_scores = rfecv.cv_results_["mean_test_score"]
else:
    mean_cv_scores = rfecv.grid_scores_

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score")
plt.plot(range(1, len(mean_cv_scores) + 1), mean_cv_scores)
plt.show()

In [None]:
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
# print("Feature Ranking: %s" % (fit.ranking_))
# Quote and comma-separate the names of the columns whose support_ flag is False.
drop_list = "".join(
    '"' + column_name + '", '
    for column_name, keep in zip(list(feature), fit.support_)
    if keep == False
)
print("\nThe eliminated feature: " + drop_list)

In [None]:
# Columns to discard - presumably copied from the RFECV report printed above
# (TODO confirm after re-running the selection).
eliminated_features = [
    "phone_count",
    "tags_LAW",
    "tags_WORK_OF_ART",
    "tags_NORP",
    "tags_EVENT",
    "tags_LOC",
    "tags_PERCENT",
    "tags_FAC",
    "tags_LANGUAGE",
    "call later",
    "please call",
    "gon na",
    "try contact",
]
feature.drop(eliminated_features, axis = 1, inplace = True)  # axis defaults to 0, so it is set explicitly

# feature.drop(["phone_count", "tags_LAW", "tags_WORK_OF_ART", "tags_NORP", "tags_EVENT", "tags_LOC", "tags_PERCENT", "tags_FAC", "tags_LANGUAGE", "call later", "please call", "gon na", "try contact"],axis = 1,inplace = True) #axis参数默认为0
# "phone_count", "tags_FAC", "tags_WORK_OF_ART", "tags_MONEY", "tags_LAW", "tags_GPE", "tags_NORP", "tags_EVENT", "tags_LOC", "tags_LANGUAGE", "tags_PERCENT", "tags_QUANTITY", "call later", "please call", "gon na", "try contact"
# "phone_count", "tags_LANGUAGE", "tags_LOC", "tags_EVENT", "tags_LAW", "tags_GPE", "tags_PERCENT", "tags_FAC", "tags_NORP", "call later", "please call", "gon na", "try contact"
# "phone_count", "tags_LAW", "tags_WORK_OF_ART", "tags_NORP", "tags_EVENT", "tags_LOC", "tags_PERCENT", "tags_FAC", "tags_LANGUAGE", "call later", "please call", "gon na", "try contact"]
# print('The eliminated feature: "phone_count", "tags_LAW", "tags_WORK_OF_ART", "tags_NORP", \n"tags_EVENT", "tags_LOC", "tags_PERCENT", "tags_FAC", "tags_LANGUAGE", "call later", \n"please call", "gon na", "try contact"')

In [None]:
# Save the reduced feature matrix for the modelling cells below.
feature.to_csv("feature_eliminate.csv")
# A bare expression that is not the last statement of the cell is not displayed.
feature
print(feature.columns)


In [None]:
# Reload the reduced feature matrix and drop the index column written by to_csv().
csv_file3 = "feature_eliminate.csv"
csv_data3 = pd.read_csv(csv_file3, low_memory = False)  # low_memory=False prevents the warning from popping up
feature = pd.DataFrame(csv_data3)
feature.drop('Unnamed: 0',axis = 1,inplace = True)  # axis defaults to 0, so it is set explicitly
# 90/10 split, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(feature,
                                                    label,
                                                    test_size=0.1,
                                                    stratify=label,
                                                    random_state=1)

In [None]:
def count(total):
    """Return ``(spam_count, ham_count, total_count)`` for a sequence of labels.

    Entries equal to 1 are counted as spam; every other entry counts as ham.
    """
    n_total = len(total)
    n_spam = sum(1 for item in total if item == 1)
    return n_spam, n_total - n_spam, n_total

# Class counts of the two splits.
train_spam_count, train_ham_count, train_count = count(y_train)
test_spam_count, test_ham_count, test_count = count(y_test)

print("The train set (spam, ham, total): (%s, %s, %s)" % (train_spam_count, train_ham_count, train_count))
print("The test set (spam, ham, total):  (%s,   %s,  %s)" % (test_spam_count, test_ham_count, test_count))

message = ["ham", "spam"]
train_size = [train_ham_count, train_spam_count]
test_size = [test_ham_count, test_spam_count]

# Neither bar() call offsets its x position, so the test bars are drawn over the train bars.
plt.bar(message, train_size, width=0.4, label="Train Set")
plt.bar(message, test_size, width=0.4, label="Test Set")
plt.title("The Counts of Two Kinds of Set (total: %s)" % len(label), fontsize=13)
plt.ylabel("Number",fontsize=12)
plt.xlabel("Kinds of Messages",fontsize=12)
plt.grid(axis='y') 
plt.legend()
plt.show()

In [None]:
from sklearn.datasets import make_classification
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier


# Training data as plain arrays for the over-samplers.
X = x_train.values
y = y_train
# func_list[0] (Counter) is only a placeholder for "no resampling"; titles is parallel to func_list.
func_list=[Counter, SMOTE, BorderlineSMOTE, ADASYN, SVMSMOTE, RandomOverSampler]
titles=['Original','SMOTE', 'BorderlineSMOTE', 'ADASYN', 'SVMSMOTE', 'RandomOverSampler']
# Class distribution of the original training labels.
counter=Counter(y)


# fig = plt.figure(figsize=(13,13))

def evaluation(X1,y1):
    """Fit the stacking ensemble on ``(X1, y1)`` and score it on the held-out test set.

    The test data are read from the module-level ``x_test`` and ``y_test``.

    Parameters
    ----------
    X1 : array-like
        Training features.
    y1 : array-like
        Training labels.

    Returns
    -------
    tuple
        ``(accuracy, f1)`` on the test set, each rounded to 4 decimals.
    """
    stacking_estimators = [
        ('svc', svm.SVC(kernel='rbf', C=10)),
        ('rfc', RandomForestClassifier(n_estimators=50, random_state=10)),
        ('lgbm', LGBMClassifier(random_state=0)),
        ('hgb', HistGradientBoostingClassifier(random_state=0))
    ]
    model = StackingClassifier(
         estimators=stacking_estimators, final_estimator=LogisticRegression()
    )
    model.fit(X1, y1)
    # Predict once and reuse the result for both metrics instead of running
    # the whole ensemble over the test set twice.
    y_pred = model.predict(x_test.values)
    acc = np.round(accuracy_score(y_test, y_pred), 4)
    f1 = np.round(f1_score(y_test, y_pred), 4)
    return acc, f1


# Train and evaluate the stacking model on the original training set and on
# each over-sampled variant, printing accuracy and F1 for every one of them.
for idx, func in enumerate(func_list):
    if idx==0:
        # First entry is the placeholder for "no resampling": use the data as is.
        X_temp, y_temp = X, y
    else:
        # Fixed seed so the synthetic samples, and therefore the scores, are
        # the same on every run.
        X_temp, y_temp = func(random_state=0).fit_resample(X,y)
    acc, f1 = evaluation(X_temp,y_temp)
    # The baseline used to be skipped by `continue` before reaching this print,
    # so there was nothing to compare the samplers against.
    print(titles[idx]+ "\n Accuracy: "+ str(acc)+ "\n F1-Score: "+ str(f1))
In [None]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import roc_auc_score, accuracy_score
# from sklearn.decomposition import PCA

# def calc_model(X,y):
#     # PCA降维
#     pca = PCA(n_components=2)
#     pca.fit(X)
#     de_X = pca.transform(X)
    
#     x_min, x_max = de_X[:, 0].min() - 0.8, de_X[:, 0].max() + 0.8
#     y_min, y_max = de_X[:, 1].min() - 0.8, de_X[:, 1].max() + 0.8
#     xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
#     #np.c_[xx.ravel(), yy.ravel()]
#     model = LGBMClassifier()
#     model.fit(de_X, y)
# #     print(np.c_[xx.ravel(), yy.ravel()])
#     Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
#     Z = Z.reshape(xx.shape)
# #     return xx,yy,Z, np.round(accuracy_score(y, model.predict(de_X)),4)
#     return np.round(accuracy_score(y, model.predict(de_X)),4)


#     """
#     如果数据集维度为多维，需要进一步降维才能进行决策边界可视化。有两种方式进行降维操作：

#     1.利用随机森林分类器等给特征进行重要性评分，得到2个最重要的特征，然后在散点图上绘制决策边界。
#     2.主成分分析(PCA)或线性判别分析(LDA)等降维技术可用于将N个特征嵌入到2个特征中，
#       从而将N个特征的信息解释或减少为2个特征(n_components = 2)。
#       然后再基于这两个特征在散点图上绘制决策边界。
    
#     """


In [None]:
# from matplotlib.colors import ListedColormap

# fig = plt.figure(figsize=(12,10))

# for idx, func in enumerate(func_list):
#     fig.add_subplot(2, 3, idx+1)
#     if idx==0:
# score=calc_model(X_temp,y_temp)
#         for label, _ in counter.items():
#             if label == 0:
#                 plt.scatter(X[y==label, 0], X[y==label, 1], label="spam", alpha=0.5)
#             else:
#                 plt.scatter(X[y==label, 0], X[y==label, 1], label="ham", alpha=0.5)
#         plt.legend()
#         score=calc_model(X,y)
# #         plt.pcolormesh(xx, yy, Z, cmap=ListedColormap(['#FE7E0E', '#1F75B1']), alpha=0.4)
#         plt.title(titles[idx]+ "\n Accuracy: "+ str(score))
#         continue

#     X_temp, y_temp = func_list[idx]().fit_resample(X,y)
# #     xx,yy,Z,score=calc_model(X_temp,y_temp)
#     score=calc_model(X_temp,y_temp)
#     counter_temp=(func_list[0](y_temp))
#     for label, _ in counter_temp.items():
#         if label == 1:
#             plt.scatter(X_temp[y_temp==label, 0], X_temp[y_temp==label, 1], label="spam", alpha=0.5)
#         else:
#             plt.scatter(X_temp[y_temp==label, 0], X_temp[y_temp==label, 1], label="ham", alpha=0.5)
#     plt.legend()
# #     plt.pcolormesh(xx, yy, Z, cmap=ListedColormap(['#FE7E0E', '#1F75B1']), alpha=0.4)
#     plt.title(titles[idx]+ "\n Accuracy: "+ str(score))

In [None]:
# Balance the training set by randomly duplicating minority-class rows.
# Fixed seed so the resampled training set is the same on every run.
x_train_balence, y_train_balence = RandomOverSampler(random_state=0).fit_resample(X,y)


train_spam_count_balence, train_ham_count_balence, train_count_balence = count(y_train_balence)

print("The original train set (spam, ham, total):                ( %s, %s, %s)" % (train_spam_count, train_ham_count, train_count))
print("The train set after RandomOverSampler (spam, ham, total): (%s, %s, %s)" % (train_spam_count_balence, train_ham_count_balence, train_count_balence))

message = ["Orginal", "RandomOverSampler"]
# The second ham bar must show the ham count after balancing; it previously
# used train_count_balence, i.e. the size of the whole resampled set.
ham_size = [train_ham_count, train_ham_count_balence]
spam_size = [train_spam_count, train_spam_count_balence]


# Neither bar() call offsets its x position, so the spam bars are drawn over the ham bars.
plt.bar(message, ham_size, width=0.4, label="ham")
plt.bar(message, spam_size, width=0.4, label="spam")
plt.title("The train set before and after balence", fontsize=13)
plt.ylabel("Number",fontsize=12)
# plt.xlabel("Kinds of Messages",fontsize=12)
plt.grid(axis='y') 
plt.legend()
plt.show()

In [None]:
# Switch the training data over to the over-sampled version: the labels are taken as-is,
# and the features are re-wrapped in a DataFrame that keeps the existing training column labels.
y_train = y_train_balence
x_train = pd.DataFrame(x_train_balence, columns=x_train.columns)

### Naive Bayes

In [None]:
# Multinomial Naive Bayes tuned with a cross-validated grid search
# (replaces the earlier untuned MultinomialNB().fit(...) baseline).
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Candidate values for the additive smoothing parameter.
params = {'alpha': [8.0, 9.0, 9.5, 10.0, 10.1, 11.0]}

# 5-fold search on the (re-balanced) training data, parallelised with n_jobs=-1.
model_nb = GridSearchCV(
    MultinomialNB(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
model_nb.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_nb = model_nb.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_nb.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_nb.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_nb.best_score_)
print('Best Parameters : ', model_nb.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned MultinomialNB, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix, without normalization (MultinomialNB)",
     "Normalized confusion matrix (MultinomialNB)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_nb, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# ROC curve and area under it for MultinomialNB on the test set.
# NOTE(review): y_pred_nb comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_nb, tpr_nb, _ = roc_curve(y_test, y_pred_nb)
roc_auc_nb = auc(fpr_nb, tpr_nb)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_nb_micro, tpr_nb_micro, _ = roc_curve(y_test.ravel(), y_pred_nb.ravel())
roc_auc_nb_micro = auc(fpr_nb_micro, tpr_nb_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_nb,
    tpr_nb,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_nb,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (MultinomialNB)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_nb = PrecisionRecallDisplay.from_estimator(
    model_nb,
    x_test,
    y_test,
    color="darkorange",
    name="MultinomialNB",
)
pr_nb.ax_.set_title("Precision Recall from estimator (MultinomialNB)")

# Second figure: the same plot built from the stored test predictions y_pred_nb.
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_nb,
    name="MultinomialNB",
).ax_.set_title("Precision Recall from predictions (MultinomialNB)")
plt.show()


### SVM

In [None]:
# Support vector classifier tuned with a cross-validated grid search
# (replaces the earlier untuned svm.SVC(kernel='rbf') baseline).
from sklearn import metrics
from sklearn import svm

# Two kernels crossed with two regularisation strengths.
params = {'kernel': ('linear', 'rbf'), 'C': [8, 20]}

# 5-fold search on the training data, parallelised with n_jobs=-1.
model_svc = GridSearchCV(
    svm.SVC(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
model_svc.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_svc = model_svc.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_svc.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_svc.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_svc.best_score_)
print('Best Parameters : ', model_svc.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_svc))
print(classification_report(y_test, y_pred_svc, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned SVC, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (SVC)",
     "Normalized confusion matrix (SVC)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_svc, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# ROC curve and area under it for the SVC on the test set.
# NOTE(review): y_pred_svc comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_svc, tpr_svc, _ = roc_curve(y_test, y_pred_svc)
roc_auc_svc = auc(fpr_svc, tpr_svc)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_svc_micro, tpr_svc_micro, _ = roc_curve(y_test.ravel(), y_pred_svc.ravel())
roc_auc_svc_micro = auc(fpr_svc_micro, tpr_svc_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_svc,
    tpr_svc,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_svc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (SVC)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_svc = PrecisionRecallDisplay.from_estimator(
    model_svc,
    x_test,
    y_test,
    color="darkorange",
    name="SVC",
)
pr_svc.ax_.set_title("Precision Recall from estimator (SVC)")

# Second figure: the same plot built from the stored test predictions y_pred_svc.
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_svc,
    name="SVC",
).ax_.set_title("Precision Recall from predictions (SVC)")
plt.show()

### Decision Tree

In [None]:
# Decision tree tuned with a cross-validated grid search
# (replaces the earlier untuned DecisionTreeClassifier() baseline).
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier

# Tree depths 13 through 16 are tried.
params = {'max_depth': range(13, 17)}

# 5-fold search on the training data, parallelised with n_jobs=-1.
model_dtc = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
model_dtc.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_dtc = model_dtc.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_dtc.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_dtc.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_dtc.best_score_)
print('Best Parameters : ', model_dtc.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_dtc))
print(classification_report(y_test, y_pred_dtc, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned decision tree, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (Decision Tree)",
     "Normalized confusion matrix (Decision Tree)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_dtc, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve
from sklearn.preprocessing import label_binarize

# ROC curve and area under it for the decision tree on the test set.
# NOTE(review): y_pred_dtc comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_dtc, tpr_dtc, _ = roc_curve(y_test, y_pred_dtc)
roc_auc_dtc = auc(fpr_dtc, tpr_dtc)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_dtc_micro, tpr_dtc_micro, _ = roc_curve(y_test.ravel(), y_pred_dtc.ravel())
roc_auc_dtc_micro = auc(fpr_dtc_micro, tpr_dtc_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_dtc,
    tpr_dtc,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_dtc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (Decision Tree)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_dtc = PrecisionRecallDisplay.from_estimator(
    model_dtc,
    x_test,
    y_test,
    color="darkorange",
    name="Decision Tree",
)
pr_dtc.ax_.set_title("Precision Recall from estimator (Decision Tree)")

# Second figure: the same plot built from the stored test predictions y_pred_dtc.
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_dtc,
    name="Decision Tree",
).ax_.set_title("Precision Recall from predictions (Decision Tree)")
plt.show()

### Random Forest

In [None]:
# Random forest tuned with a cross-validated grid search
# (replaces the earlier fixed RandomForestClassifier(n_estimators=20, random_state=50) baseline).
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): random_state is searched like a hyperparameter, which picks the
# best-scoring seed rather than tuning the model — confirm this is intended.
params = {
    'n_estimators': [10, 20, 50, 100],
    'random_state': [10, 20, 50, 100],
}

# 5-fold search on the training data, parallelised with n_jobs=-1.
model_rfc = GridSearchCV(
    RandomForestClassifier(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
model_rfc.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_rfc = model_rfc.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_rfc.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_rfc.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_rfc.best_score_)
print('Best Parameters : ', model_rfc.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_rfc))
print(classification_report(y_test, y_pred_rfc, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned random forest, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (Random Forest)",
     "Normalized confusion matrix (Random Forest)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_rfc, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC curve and area under it for the random forest on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_rfc comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_rfc, tpr_rfc, _ = roc_curve(y_test, y_pred_rfc)
roc_auc_rfc = auc(fpr_rfc, tpr_rfc)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_rfc_micro, tpr_rfc_micro, _ = roc_curve(y_test.ravel(), y_pred_rfc.ravel())
roc_auc_rfc_micro = auc(fpr_rfc_micro, tpr_rfc_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_rfc,
    tpr_rfc,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_rfc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (Random Forest)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_rfc = PrecisionRecallDisplay.from_estimator(
    model_rfc,
    x_test,
    y_test,
    color="darkorange",
    name="Random Forest",
)
pr_rfc.ax_.set_title("Precision Recall from estimator (Random Forest)")

# Second figure: the same plot built from the stored test predictions y_pred_rfc.
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_rfc,
    name="Random Forest",
).ax_.set_title("Precision Recall from predictions (Random Forest)")
plt.show()

### AdaBoost

In [None]:
# AdaBoost tuned with a cross-validated grid search
# (replaces the earlier fixed AdaBoostClassifier(n_estimators=100, random_state=0) baseline).
from sklearn import metrics
from sklearn.ensemble import AdaBoostClassifier

# NOTE(review): random_state is searched like a hyperparameter, which picks the
# best-scoring seed rather than tuning the model — confirm this is intended.
params = {
    'n_estimators': list(range(125, 140)),
    'random_state': [0, 10, 20, 50, 100],
}

# 5-fold search on the training data, parallelised with n_jobs=-1.
model_abc = GridSearchCV(
    AdaBoostClassifier(),
    param_grid=params,
    n_jobs=-1,
    cv=5,
    verbose=5,
)
model_abc.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_abc = model_abc.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_abc.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_abc.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_abc.best_score_)
print('Best Parameters : ', model_abc.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_abc))
print(classification_report(y_test, y_pred_abc, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned AdaBoost model, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (AdaBoost)",
     "Normalized confusion matrix (AdaBoost)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_abc, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC curve and area under it for AdaBoost on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_abc comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_abc, tpr_abc, _ = roc_curve(y_test, y_pred_abc)
roc_auc_abc = auc(fpr_abc, tpr_abc)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_abc_micro, tpr_abc_micro, _ = roc_curve(y_test.ravel(), y_pred_abc.ravel())
roc_auc_abc_micro = auc(fpr_abc_micro, tpr_abc_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_abc,
    tpr_abc,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_abc,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (AdaBoost)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_abc = PrecisionRecallDisplay.from_estimator(
    model_abc,
    x_test,
    y_test,
    color="darkorange",
    name="AdaBoost",
)
pr_abc.ax_.set_title("Precision Recall from estimator (AdaBoost)")

# Second figure: the same plot built from the stored test predictions y_pred_abc.
PrecisionRecallDisplay.from_predictions(
    y_test,
    y_pred_abc,
    name="AdaBoost",
).ax_.set_title("Precision Recall from predictions (AdaBoost)")
plt.show()

### Hist Gradient Boosting (Histogram-based)

In [None]:
# Histogram-based gradient boosting with a grid search over the seed only
# (replaces the earlier fixed HistGradientBoostingClassifier(random_state=0) baseline).
from sklearn import metrics
from sklearn.ensemble import HistGradientBoostingClassifier

# NOTE(review): random_state is the only searched parameter, so the search picks the
# best-scoring seed rather than tuning the model — confirm this is intended.
params = {'random_state': [0, 10, 20, 50, 100]}

# GridSearchCV is used with its default settings here (no cv / n_jobs / verbose arguments).
model_hgb = GridSearchCV(HistGradientBoostingClassifier(), param_grid=params)
model_hgb.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_hgb = model_hgb.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_hgb.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_hgb.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_hgb.best_score_)
print('Best Parameters : ', model_hgb.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_hgb))
print(classification_report(y_test, y_pred_hgb, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the Hist Gradient Boosting model, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (Hist Gradient Boosting)",
     "Normalized confusion matrix (Hist Gradient Boosting)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_hgb, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC curve and area under it for Hist Gradient Boosting on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_hgb comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_hgb, tpr_hgb, _ = roc_curve(y_test, y_pred_hgb)
roc_auc_hgb = auc(fpr_hgb, tpr_hgb)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_hgb_micro, tpr_hgb_micro, _ = roc_curve(y_test.ravel(), y_pred_hgb.ravel())
roc_auc_hgb_micro = auc(fpr_hgb_micro, tpr_hgb_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_hgb,
    tpr_hgb,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_hgb,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (Hist Gradient Boosting)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_hgb = PrecisionRecallDisplay.from_estimator(
    model_hgb, x_test, y_test, color="darkorange", name="Hist Gradient Boosting")
plt.title("Precision Recall from estimator (Hist Gradient Boosting)")

# Second figure: the same plot built from the stored test predictions y_pred_hgb.
PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_hgb, name="Hist Gradient Boosting")
# Fix: the title was missing its opening parenthesis, unlike the other models' figures.
plt.title("Precision Recall from predictions (Hist Gradient Boosting)")
plt.show()

### XGBoost

In [None]:
# XGBoost classifier tuned with a grid search.
# !pip install xgboost
import xgboost as xgb
from sklearn import metrics

# Fixes to the search grid:
#   * 'objective' no longer includes the empty string, which is not a valid objective and
#     made every candidate using it fail to fit;
#   * 'verbosity' is pinned to 0 — it only controls logging, so searching over (0, 1, 2)
#     tripled the number of fits without changing any model.
# Both keys stay in the grid as single-valued entries so best_params_ keeps the same keys.
params = {
    'max_depth': [0, 5],
    'eta': [0.3, 0.5],
    'verbosity': [0],
    'objective': ['binary:logistic'],
    'random_state': [0, 1, 2],
}

# NOTE(review): early stopping is evaluated on (x_test, y_test), so the test set influences
# training — consider a validation split carved out of the training data instead.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds / eval_metric on the
# estimator constructor rather than in fit(); confirm against the installed version.
model_xgb = GridSearchCV(xgb.XGBClassifier(), params).fit(
    x_train,
    y_train,
    early_stopping_rounds=10,
    eval_metric="error",
    eval_set=[(x_test, y_test)],
)

print('Train Accuracy : %.3f' % model_xgb.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_xgb.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_xgb.best_score_)
print('Best Parameters : ', model_xgb.best_params_)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_xgb = model_xgb.best_estimator_.predict(x_test)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb, digits=4))

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the tuned XGBoost model, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (XGBoost)",
     "Normalized confusion matrix (XGBoost)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_xgb, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# ROC curve and area under it for XGBoost on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_xgb comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_xgb_micro, tpr_xgb_micro, _ = roc_curve(y_test.ravel(), y_pred_xgb.ravel())
roc_auc_xgb_micro = auc(fpr_xgb_micro, tpr_xgb_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_xgb,
    tpr_xgb,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_xgb,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (XGBoost)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_xgb = PrecisionRecallDisplay.from_estimator(
    model_xgb, x_test, y_test, color="darkorange", name="XGBoost")
plt.title("Precision Recall from estimator (XGBoost)")

# Second figure: the same plot built from the stored test predictions y_pred_xgb.
PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_xgb, name="XGBoost")
# Fix: the title was missing its opening parenthesis, unlike the other models' figures.
plt.title("Precision Recall from predictions (XGBoost)")
plt.show()

### XGBoost + Logistic Regression

In [None]:
# XGBoost + Logistic Regression: an XGBoost model maps each sample to the leaf it lands in
# for every tree, the leaf indices are one-hot encoded, and a logistic regression is trained
# on that encoding.
from sklearn import metrics
# Fix: LogisticRegression is used below but was never imported in this cell; it only worked
# because unrelated plotting cells happened to import it first.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

model_xgb1 = xgb.XGBClassifier(eta=0.5, max_depth=5, objective='binary:logistic', random_state=0, verbosity=0).fit(x_train.values, y_train)
# apply() returns, for each sample, the index of the leaf reached in each tree.
x_train_leaves = model_xgb1.apply(x_train.values)

x_test_leaves = model_xgb1.apply(x_test.values)

# Number of training samples; used below to split the stacked matrix back apart.
train_rows = x_train_leaves.shape[0]

# Stack the train and test leaf indices so that both are encoded with one shared encoder.
x_leaves = np.concatenate((x_train_leaves, x_test_leaves), axis=0)
x_leaves = x_leaves.astype(np.int32)
(rows, cols) = x_leaves.shape

# One-hot encode every leaf-index column; x_trans holds the new one-hot features.
# NOTE(review): the encoder is fitted on train and test rows together — no labels are
# involved, but confirm this is acceptable for the evaluation protocol.
xgbenc = OneHotEncoder()
x_trans = xgbenc.fit_transform(x_leaves)

# Train the logistic regression on the encoded training rows and predict the encoded test rows.
lr = LogisticRegression()
lr.fit(x_trans[:train_rows, :], y_train)
y_pred_xgblr = lr.predict(x_trans[train_rows:, :])

print("Accuracy", metrics.accuracy_score(y_test, y_pred_xgblr))
print(classification_report(y_test, y_pred_xgblr, digits=4))

In [None]:
from sklearn.metrics import roc_auc_score

# ROC curve and area under it for the XGBoost + Logistic Regression model on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_xgblr comes from lr.predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_xgblr, tpr_xgblr, _ = roc_curve(y_test, y_pred_xgblr)
roc_auc_xgblr = auc(fpr_xgblr, tpr_xgblr)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_xgblr_micro, tpr_xgblr_micro, _ = roc_curve(y_test.ravel(), y_pred_xgblr.ravel())
roc_auc_xgblr_micro = auc(fpr_xgblr_micro, tpr_xgblr_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_xgblr,
    tpr_xgblr,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_xgblr,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (XGBoost+Logistic Regression)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Fix: this curve used to be computed from model_xgb1 (the plain XGBoost model) on the raw
# features, so the figure labelled "XGBoost+Logistic Regression" did not show the stacked
# model at all. Score the one-hot encoded test rows with the logistic regression instead;
# column 1 of predict_proba is the probability of lr.classes_[1].
y_score_xgblr = lr.predict_proba(x_trans[train_rows:, :])[:, 1]
pr_xgblr = PrecisionRecallDisplay.from_predictions(
    y_test, y_score_xgblr, color="darkorange", name="XGBoost+Logistic Regression")
plt.title("Precision Recall from estimator (XGBoost+Logistic Regression)")

# Second figure: the same plot built from the stored test predictions y_pred_xgblr.
PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_xgblr, name="XGBoost+Logistic Regression")
# Fix: the title was missing its opening parenthesis, unlike the other models' figures.
plt.title("Precision Recall from predictions (XGBoost+Logistic Regression)")
plt.show()

### LightGBM

In [None]:
# LightGBM classifier with a grid search over the seed only.
# !pip install lightgbm
from lightgbm import LGBMClassifier
from sklearn import metrics

# NOTE(review): random_state is the only searched parameter, so the search picks the
# best-scoring seed rather than tuning the model — confirm this is intended.
params = {'random_state': list(range(7))}

# GridSearchCV is used with its default settings here (no cv / n_jobs / verbose arguments).
model_lgbm = GridSearchCV(LGBMClassifier(), param_grid=params)
model_lgbm.fit(x_train, y_train)

# Hard class predictions of the refitted best estimator on the held-out test set.
y_pred_lgbm = model_lgbm.best_estimator_.predict(x_test)

print('Train Accuracy : %.3f' % model_lgbm.best_estimator_.score(x_train, y_train))
print('Test Accuracy : %.3f' % model_lgbm.best_estimator_.score(x_test, y_test))
print('Best Accuracy Through Grid Search : %.3f' % model_lgbm.best_score_)
print('Best Parameters : ', model_lgbm.best_params_)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_lgbm))
print(classification_report(y_test, y_pred_lgbm, digits=4))

In [None]:
# ROC curve and area under it for LightGBM on the test set.
# roc_curve and auc are not imported in this cell; they are expected from an earlier ROC cell.
# NOTE(review): y_pred_lgbm comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_lgbm, tpr_lgbm, _ = roc_curve(y_test, y_pred_lgbm)
roc_auc_lgbm = auc(fpr_lgbm, tpr_lgbm)

# Labelled "micro-average" in the original; it repeats the call above on the ravel()ed arrays.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_lgbm_micro, tpr_lgbm_micro, _ = roc_curve(y_test.ravel(), y_pred_lgbm.ravel())
roc_auc_lgbm_micro = auc(fpr_lgbm_micro, tpr_lgbm_micro)

lw = 2
plt.figure()
plt.plot(
    fpr_lgbm,
    tpr_lgbm,
    color="darkorange",
    linewidth=lw,
    label="ROC curve (area = %0.5f)" % roc_auc_lgbm,
)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", linewidth=lw, linestyle="--")
plt.title("Receiver operating characteristic (LightGBM)")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted search object.
pr_lgbm = PrecisionRecallDisplay.from_estimator(
    model_lgbm, x_test, y_test, color="darkorange", name="LightGBM")
plt.title("Precision Recall from estimator (LightGBM)")

# Second figure: the same plot built from the stored test predictions y_pred_lgbm.
PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_lgbm, name="LightGBM")
# Fix: the title was missing its opening parenthesis, unlike the other models' figures.
plt.title("Precision Recall from predictions (LightGBM)")
plt.show()

### Stacking

In [None]:
from lightgbm import LGBMClassifier
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Base learners of the stack, each with fixed hyperparameters.
# (Naive Bayes, decision tree and XGBoost entries were tried and left disabled in the original.)
stacking_estimators = [
    ('svc', svm.SVC(kernel='rbf', C=10)),
    ('rfc', RandomForestClassifier(n_estimators=50, random_state=10)),
    ('lgbm', LGBMClassifier(random_state=0)),
    ('hgb', HistGradientBoostingClassifier(random_state=0)),
]

# A logistic regression combines the base learners' outputs.
stacking_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=LogisticRegression(),
)
stacking_model.fit(x_train, y_train)
# fit() returns the estimator itself, so model_stk and stacking_model are the same object.
model_stk = stacking_model

# Hard class predictions on the held-out test set.
y_pred_stk = model_stk.predict(x_test)

print("Accuracy", metrics.accuracy_score(y_test, y_pred_stk))
print(classification_report(y_test, y_pred_stk, digits=4))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Test-set confusion matrix for the stacking classifier, drawn twice:
# raw counts (normalize=None) and normalised over the true labels (normalize="true").
titles_options = list(zip(
    ["Confusion matrix without normalization (Stacking)",
     "Normalized confusion matrix (Stacking)"],
    [None, "true"],
))
for title, normalize in titles_options:
    disp = ConfusionMatrixDisplay.from_estimator(
        model_stk, x_test, y_test, cmap=plt.cm.Blues, normalize=normalize
    )
    disp.ax_.set_title(title)

    # Echo the matrix as text as well, under its title.
    print(title, disp.confusion_matrix, sep="\n")

plt.show()

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc

# ROC curve and area under it for the stacking classifier on the test set.
# NOTE(review): y_pred_stk comes from .predict(), i.e. hard labels rather than scores,
# so the curve presumably has a single interior operating point — confirm this is intended.
fpr_stk, tpr_stk, _ = roc_curve(y_test, y_pred_stk)
roc_auc_stk = auc(fpr_stk, tpr_stk)

# Fix: these results were stored in fpr_hgb_micro / tpr_hgb_micro / roc_auc_hgb_micro
# (copy-paste from the Hist Gradient Boosting cell), silently overwriting that model's
# values with the stacking model's. They now get their own *_stk_micro names.
# NOTE(review): for 1-D labels this looks identical to the values above — verify.
fpr_stk_micro, tpr_stk_micro, _ = roc_curve(y_test.ravel(), y_pred_stk.ravel())
roc_auc_stk_micro = auc(fpr_stk_micro, tpr_stk_micro)

plt.figure()
lw = 2
plt.plot(fpr_stk, tpr_stk, color="darkorange", lw=lw, label="ROC curve (area = %0.5f)" % roc_auc_stk)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("Receiver operating characteristic (Stacking)")
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve computed by scikit-learn from the fitted stacking classifier.
pr_stk = PrecisionRecallDisplay.from_estimator(
    model_stk, x_test, y_test, color="darkorange", name="Stacking")
plt.title("Precision Recall from estimator (Stacking)")

# Second figure: the same plot built from the stored test predictions y_pred_stk.
PrecisionRecallDisplay.from_predictions(
    y_test, y_pred_stk, name="Stacking")
# Fix: the title was missing its opening parenthesis, unlike the other models' figures.
plt.title("Precision Recall from predictions (Stacking)")
plt.show()

### ROC Curve Plot

In [None]:
# One row per model:
#   (fpr, tpr, AUC, legend label template,
#    linestyle in the overview figure, linestyle in the zoomed figure,
#    marker in the zoomed figure or None for no marker)
# The stacking row uses the computed `tpr_stk`; it used to plot a hard-coded,
# rounded copy of those values, which goes stale (or mismatches fpr_stk in
# length) as soon as the model or data change.
roc_curve_specs = [
    (fpr_nb, tpr_nb, roc_auc_nb, "MultinomialNB                         (ROCAUC = %0.5f)", '-', '-', None),
    (fpr_svc, tpr_svc, roc_auc_svc, "SVC                                          (ROCAUC = %0.5f)", '--', '--', None),
    (fpr_dtc, tpr_dtc, roc_auc_dtc, "Decision Tree                           (ROCAUC = %0.5f)", '-.', '-.', None),
    (fpr_rfc, tpr_rfc, roc_auc_rfc, "Random Forest                        (ROCAUC = %0.5f)", '--', ':', None),
    (fpr_abc, tpr_abc, roc_auc_abc, "AdaBoost                                 (ROCAUC = %0.5f)", '--', '--', 'v'),
    (fpr_hgb, tpr_hgb, roc_auc_hgb, "Hist Gradient Boosting            (ROCAUC = %0.5f)", '--', '--', 'o'),
    (fpr_xgb, tpr_xgb, roc_auc_xgb, "XGBoost                                   (ROCAUC = %0.5f)", '--', '--', '^'),
    (fpr_xgblr, tpr_xgblr, roc_auc_xgblr, "XGBoost+Logistic Regression (ROCAUC = %0.5f)", '--', '--', 'p'),
    (fpr_lgbm, tpr_lgbm, roc_auc_lgbm, "LightGBM                                 (ROCAUC = %0.5f)", '--', '--', 'h'),
    (fpr_stk, tpr_stk, roc_auc_stk, "Stacking                                   (ROCAUC = %0.5f)", '--', '--', '+'),
]


def _plot_roc_comparison(title, zoomed=False):
    """Draw every model's ROC curve on one figure and show it.

    Parameters
    ----------
    title : str
        Figure title.
    zoomed : bool, default False
        When True, use the zoomed-figure line styles and markers from
        `roc_curve_specs` and restrict the axes to FPR in [0, 0.06] and
        TPR in [0.8, 1] (the top-left corner of the ROC plane).
    """
    plt.figure(figsize=(10, 7))
    for fpr, tpr, roc_auc, label_fmt, overview_ls, zoom_ls, zoom_marker in roc_curve_specs:
        line_style = {"linestyle": zoom_ls if zoomed else overview_ls}
        # Markers are only drawn in the zoomed figure, and only for the
        # models that define one.
        if zoomed and zoom_marker is not None:
            line_style["marker"] = zoom_marker
        plt.plot(fpr, tpr, label=label_fmt % roc_auc, **line_style)
    plt.title(title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    if zoomed:
        plt.xlim(0, 0.06)
        plt.ylim(0.8, 1)
    plt.legend()
    plt.show()


_plot_roc_comparison("ROC Curve Plot")
_plot_roc_comparison("ROC Curve Plot (zoomed in at top left)", zoomed=True)

### PR Curve Plot

In [None]:
# 5x2 grid holding one precision-recall panel per model.
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8), (ax9, ax10)) = plt.subplots(5, 2, figsize=(8, 14))

# Pair each previously built PrecisionRecallDisplay with its panel,
# filling the grid row by row.
pr_panels = [
    (pr_nb, ax1),
    (pr_svc, ax2),
    (pr_dtc, ax3),
    (pr_rfc, ax4),
    (pr_abc, ax5),
    (pr_hgb, ax6),
    (pr_xgb, ax7),
    (pr_xgblr, ax8),
    (pr_lgbm, ax9),
    (pr_stk, ax10),
]
for pr_display, panel_ax in pr_panels:
    pr_display.plot(ax=panel_ax, color="darkorange")

plt.tight_layout()

plt.show()