In [1]:
import pandas as pd
import numpy as np
import json
import datetime
from collections import defaultdict
from tqdm import tqdm
import random
import spacy
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix,roc_auc_score,accuracy_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

In [2]:
all_df = pd.read_csv('../preprocessing/result/pu_chart/pu_chart_nursing_3day.csv')
train_df = all_df[all_df.split == 'train']
test_df = all_df[all_df.split == 'test']
text_train = train_df.clean_text
Y_train = label_train = train_df.pu_label
text_test = test_df.clean_text
Y_test = label_test = test_df.pu_label

In [None]:
%%time
nlp = spacy.load("en_core_web_sm")
lemmatized_text = []
for doc in nlp.pipe(total_text, batch_size=10,n_process=20, disable=["parser", "ner"]):
    tmp_doc = [tok.lemma_ for tok in doc]
    tmp_doc = ' '.join(tmp_doc)
    lemmatized_text.append(tmp_doc)

In [None]:
text_train,text_test,Y_train,Y_test = train_test_split(lemmatized_text,total_label,test_size = 0.2,stratify = total_label,random_state=221)

In [None]:
import pickle
import os
des_dir = '../preprocessing/result/pu_chart/'
pickle.dump(text_train,open(os.path.join(des_dir,'train_text_lemma.pkl'),'wb'))
pickle.dump(text_test,open(os.path.join(des_dir,'text_test_lemma.pkl'),'wb'))

## 重点观察lemma的 gridient boost

In [None]:
Tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2),min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

In [6]:
%%time
clf = GradientBoostingClassifier(n_estimators=150)
clf.fit(X_train, Y_train)
train_predictions = clf.predict(X_test)
acc = accuracy_score(Y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))
train_predictions = clf.predict_proba(X_test)
ll = roc_auc_score(Y_test, train_predictions[:,1])
print("roc_auc_score: {}".format(ll))

Accuracy: 90.3226%
roc_auc_score: 0.9468037954556296
CPU times: user 3min 6s, sys: 1.62 s, total: 3min 8s
Wall time: 3min 8s


In [10]:
clf.n_estimators

150

In [None]:
from joblib import dump, load
dump(clf, 'filename.joblib') 

In [3]:
Tfidf_vectorizer = TfidfVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

ERROR! Session/line number was not unique in database. History logging moved to new session 348
(6075, 14517)


In [4]:
%%time
clf = GradientBoostingClassifier()
clf.fit(X_train, Y_train)
train_predictions = clf.predict(X_test)
acc = accuracy_score(Y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))
train_predictions = clf.predict_proba(X_test)
ll = roc_auc_score(Y_test, train_predictions[:,1])
print("roc_auc_score: {}".format(ll))

Accuracy: 90.2567%
roc_auc_score: 0.946154855225708
CPU times: user 1min 2s, sys: 45.5 ms, total: 1min 2s
Wall time: 1min 2s


In [5]:
from sklearn.inspection import permutation_importance

In [15]:
clf = load('./chart_model/gboost_1gram_lemma.joblib')
train_predictions = clf.predict(X_test)
acc = accuracy_score(Y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))
train_predictions = clf.predict_proba(X_test)
ll = roc_auc_score(Y_test, train_predictions[:,1])
print("roc_auc_score: {}".format(ll))

Accuracy: 90.3922%
roc_auc_score: 0.9534145794657185


In [6]:
result = permutation_importance(clf, X_test.todense(), Y_test, n_repeats=10,
                                random_state=42, n_jobs=-1)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [16]:
pickle.dump(Tfidf_vectorizer,open('./chart_model/tdidf_1gram','wb'))

In [17]:
Tfidf_vectorizer2 = pickle.load(open('./chart_model/tdidf_1gram','rb'))

In [19]:
Tfidf_vectorizer2.idf_

array([3.19722458, 5.67626611, 5.00978717, ..., 5.28850058, 7.23441073,
       7.63987583])

In [18]:
X_test = Tfidf_vectorizer2.transform(text_test)
clf = load('./chart_model/gboost_1gram_lemma.joblib')
train_predictions = clf.predict(X_test)
acc = accuracy_score(Y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))
train_predictions = clf.predict_proba(X_test)
ll = roc_auc_score(Y_test, train_predictions[:,1])
print("roc_auc_score: {}".format(ll))

Accuracy: 90.3922%
roc_auc_score: 0.9534145794657185


In [None]:
from sklearn.model_selection import GridSearchCV
import random

clf = GradientBoostingClassifier()
distributions = dict(max_depth=[3,4,5],
                      min_samples_leaf=[1,2,3])
clf = GridSearchCV(clf, distributions)
search = clf.fit(X_train, Y_train)
search.best_params_

## lemma一般实验

In [36]:
Count_vectorizer = CountVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Count_vectorizer = Count_vectorizer.fit(text_train)
X_train = Count_vectorizer.transform(text_train)
X_test = Count_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 14555)


In [37]:
Count_vectorizer_upperbound = CountVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Count_vectorizer_upperbound = Count_vectorizer.fit(lemmatized_text)
X_upperbound = Count_vectorizer.transform(lemmatized_text)
print(np.shape(X_upperbound))

(7649, 16264)


In [39]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression(max_iter = 1000)]

classifiers_upperbound = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression(max_iter = 1000)]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

for clf in classifiers_upperbound:
    clf.fit(X_upperbound, total_label)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Upper bound****')
    train_predictions = clf.predict(X_upperbound)
    acc = accuracy_score(total_label, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_upperbound)
    ll = roc_auc_score(total_label, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 86.2092%
roc_auc_score: 0.8431885403228343
RandomForestClassifier
****Results****
Accuracy: 88.3007%
roc_auc_score: 0.9444892388534031
AdaBoostClassifier
****Results****
Accuracy: 89.4771%
roc_auc_score: 0.9421077058619931
GradientBoostingClassifier
****Results****
Accuracy: 90.7190%
roc_auc_score: 0.9547046584437242
ComplementNB
****Results****
Accuracy: 83.9216%
roc_auc_score: 0.8257528082816777
MultinomialNB
****Results****
Accuracy: 84.0523%
roc_auc_score: 0.8257528082816777
LogisticRegression
****Results****
Accuracy: 88.4967%
roc_auc_score: 0.9213002658821309
DecisionTreeClassifier
****Upper bound****
Accuracy: 100.0000%
roc_auc_score: 1.0
RandomForestClassifier
****Upper bound****
Accuracy: 100.0000%
roc_auc_score: 1.0
AdaBoostClassifier
****Upper bound****
Accuracy: 90.7308%
roc_auc_score: 0.962077995229691
GradientBoostingClassifier
****Upper bound****
Accuracy: 92.7572%
roc_auc_score: 0.9770213401353764
ComplementNB
****Upper b

In [29]:
Count_vectorizer = CountVectorizer(ngram_range = (1,2),min_df =5,stop_words='english',max_df = 0.8)
Count_vectorizer = Count_vectorizer.fit(text_train)
X_train = Count_vectorizer.transform(text_train)
X_test = Count_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 125394)


In [40]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression(max_iter = 1000)]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 86.6013%
roc_auc_score: 0.8466123627324502
RandomForestClassifier
****Results****
Accuracy: 88.1699%
roc_auc_score: 0.9408097605487555
AdaBoostClassifier
****Results****
Accuracy: 89.4771%
roc_auc_score: 0.9421077058619931
GradientBoostingClassifier
****Results****
Accuracy: 90.6536%
roc_auc_score: 0.9547380903684591
ComplementNB
****Results****
Accuracy: 83.9216%
roc_auc_score: 0.8257528082816777
MultinomialNB
****Results****
Accuracy: 84.0523%
roc_auc_score: 0.8257528082816777
LogisticRegression
****Results****
Accuracy: 88.4967%
roc_auc_score: 0.9213002658821309


In [46]:
Tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2),min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 125394)


In [47]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression(max_iter = 1000)]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 85.1634%
roc_auc_score: 0.8371452282810485
RandomForestClassifier
****Results****
Accuracy: 86.7974%
roc_auc_score: 0.9475944746861333
AdaBoostClassifier
****Results****
Accuracy: 89.3464%
roc_auc_score: 0.9426475331172712
GradientBoostingClassifier
****Results****
Accuracy: 90.9150%
roc_auc_score: 0.9556702510934206
ComplementNB
****Results****
Accuracy: 81.4379%
roc_auc_score: 0.8970395047355337
MultinomialNB
****Results****
Accuracy: 79.7386%
roc_auc_score: 0.8970395047355337
LogisticRegression
****Results****
Accuracy: 87.9739%
roc_auc_score: 0.9520605865139549


In [44]:
Tfidf_vectorizer = TfidfVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 14555)


In [45]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression(max_iter = 1000)]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 84.1830%
roc_auc_score: 0.8195974009628395
RandomForestClassifier
****Results****
Accuracy: 87.7124%
roc_auc_score: 0.9461962335986911
AdaBoostClassifier
****Results****
Accuracy: 89.2157%
roc_auc_score: 0.9460084248450333
GradientBoostingClassifier
****Results****
Accuracy: 90.4575%
roc_auc_score: 0.9538786932443913
ComplementNB
****Results****
Accuracy: 85.4248%
roc_auc_score: 0.9173759478933954
MultinomialNB
****Results****
Accuracy: 84.0523%
roc_auc_score: 0.9173759478933954
LogisticRegression
****Results****
Accuracy: 88.3007%
roc_auc_score: 0.954377222239703


# 没有lemmatized:

In [9]:
text_train,text_test,Y_train,Y_test = train_test_split(total_text,total_label,test_size = 0.2,stratify = total_label,random_state=221)

In [10]:
Count_vectorizer = CountVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Count_vectorizer = Count_vectorizer.fit(text_train)
X_train = Count_vectorizer.transform(text_train)
X_test = Count_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 15456)


In [15]:
classifiers = [
    SVC(kernel="linear", probability=True),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

SVC
****Results****
Accuracy: 87.1242%
roc_auc_score: 0.9043502800415343
NuSVC
****Results****
Accuracy: 80.7843%
roc_auc_score: 0.9069973018470154
DecisionTreeClassifier
****Results****
Accuracy: 85.6209%
roc_auc_score: 0.8323329977030302
RandomForestClassifier
****Results****
Accuracy: 88.4967%
roc_auc_score: 0.9451116626286147
AdaBoostClassifier
****Results****
Accuracy: 89.6078%
roc_auc_score: 0.9451627938076208
GradientBoostingClassifier
****Results****
Accuracy: 90.7843%
roc_auc_score: 0.9536653189012303
ComplementNB
****Results****
Accuracy: 84.2484%
roc_auc_score: 0.8348049935496051
MultinomialNB
****Results****
Accuracy: 84.1830%
roc_auc_score: 0.8350409836065574
LogisticRegression
****Results****
Accuracy: 87.7124%
roc_auc_score: 0.9143061105692081


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [16]:
clf = GradientBoostingClassifier()
clf.fit(X_train, Y_train)
train_predictions = clf.predict(X_test)
acc = accuracy_score(Y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))

train_predictions = clf.predict_proba(X_test)
ll = roc_auc_score(Y_test, train_predictions[:,1])
print("roc_auc_score: {}".format(ll))

Accuracy: 90.7190%
roc_auc_score: 0.9536987508259651


In [17]:
clf

GradientBoostingClassifier()

In [18]:
clf.n_estimators

100

In [19]:
Count_vectorizer = CountVectorizer(ngram_range = (1,2),min_df =5,stop_words='english',max_df = 0.8)
Count_vectorizer = Count_vectorizer.fit(text_train)
X_train = Count_vectorizer.transform(text_train)
X_test = Count_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 125230)


In [20]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 86.3399%
roc_auc_score: 0.8457824643655014
RandomForestClassifier
****Results****
Accuracy: 87.7778%
roc_auc_score: 0.9404203769547842
AdaBoostClassifier
****Results****
Accuracy: 90.4575%
roc_auc_score: 0.947416498851515
GradientBoostingClassifier
****Results****
Accuracy: 91.3072%
roc_auc_score: 0.9547685724174821
ComplementNB
****Results****
Accuracy: 84.1830%
roc_auc_score: 0.8362337166860703
MultinomialNB
****Results****
Accuracy: 84.1176%
roc_auc_score: 0.8362337166860703
LogisticRegression
****Results****
Accuracy: 88.6275%
roc_auc_score: 0.936772363991064


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [23]:
Tfidf_vectorizer = TfidfVectorizer(ngram_range = (1,2),min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 125230)


In [24]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 85.4902%
roc_auc_score: 0.8346417670935464
RandomForestClassifier
****Results****
Accuracy: 87.3203%
roc_auc_score: 0.9464784383751298
AdaBoostClassifier
****Results****
Accuracy: 89.2810%
roc_auc_score: 0.9418363172964979
GradientBoostingClassifier
****Results****
Accuracy: 90.3922%
roc_auc_score: 0.9541618813127339
ComplementNB
****Results****
Accuracy: 81.5033%
roc_auc_score: 0.8991535823290646
MultinomialNB
****Results****
Accuracy: 79.6732%
roc_auc_score: 0.8991535823290646
LogisticRegression
****Results****
Accuracy: 88.1046%
roc_auc_score: 0.9521707152071993


In [25]:
Tfidf_vectorizer = TfidfVectorizer(min_df =5,stop_words='english',max_df = 0.8)
Tfidf_vectorizer = Tfidf_vectorizer.fit(text_train)
X_train = Tfidf_vectorizer.transform(text_train)
X_test = Tfidf_vectorizer.transform(text_test)
print(np.shape(X_train))

(6119, 15456)


In [26]:
classifiers = [
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    ComplementNB(), MultinomialNB(),
    LogisticRegression()]

# Logging for Visual Comparison
log_cols=["Classifier", "Accuracy", "roc_auc_score"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    clf.fit(X_train, Y_train)
    name = clf.__class__.__name__
    print("="*30)
    print(name)
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(Y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    train_predictions = clf.predict_proba(X_test)
    ll = roc_auc_score(Y_test, train_predictions[:,1])
    print("roc_auc_score: {}".format(ll))
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
print("="*30)

DecisionTreeClassifier
****Results****
Accuracy: 85.0980%
roc_auc_score: 0.8306732009691324
RandomForestClassifier
****Results****
Accuracy: 87.0588%
roc_auc_score: 0.9477370520122086
AdaBoostClassifier
****Results****
Accuracy: 89.2157%
roc_auc_score: 0.9402129023630471
GradientBoostingClassifier
****Results****
Accuracy: 90.2614%
roc_auc_score: 0.9521146675686731
ComplementNB
****Results****
Accuracy: 85.6209%
roc_auc_score: 0.9181016173185236
MultinomialNB
****Results****
Accuracy: 83.5948%
roc_auc_score: 0.9181016173185236
LogisticRegression
****Results****
Accuracy: 88.3660%
roc_auc_score: 0.9543241244768886


In [None]:
import spacy
import spacy
nlp = spacy.load("en_core_web_sm")
lemmatized_text = []
count = 0
for doc in nlp.pipe(total_text, batch_size=120, n_process=20, disable=["parser", "ner"]):
    count+=1
    print(count)
    tmp_doc = [tok.lemma_ for tok in doc]
    tmp_doc = ' '.join(tmp_doc)
    lemmatized_text.append(tmp_doc)
print(len(lemmatized_text))
print(len(full_text))