# Discharge Notes and Readmission Rates: NLP

This notebook processes the cleaned dataset through an NLP pipeline.

## Import

Import the relevant libraries.

In [1]:
import string
#import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#nlp libraries 
import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.stem.porter import PorterStemmer

#vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline 
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.utils import class_weight

## Functions

In [2]:
def scores(model, X, y, set_type):
    '''
    Print the accuracy and recall of an already fitted model on one data
    split and draw the matching confusion matrix.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``
    X : features accepted by ``model.predict``
    y : true labels aligned with ``X``
    set_type : str
        Name of the split (e.g. 'Train', 'Validation' or 'Test'); used as
        the heading of the printed scores and as the plot title.

    Returns
    -------
    None
    '''
    pred = model.predict(X)

    print("{} Scores".format(set_type))
    print("accuracy: ", accuracy_score(y, pred))
    #recall_score is imported from sklearn.metrics in the import cell
    print("recall:   ", recall_score(y, pred))

    fig, ax = plt.subplots(figsize=(6,6))
    ax.set_title(set_type)
    plot_confusion_matrix(model, X, y, ax=ax)
    #flush the figure now so each matrix is rendered right below its own
    #scores when the function is called several times in one cell
    plt.show()

In [3]:
stemmer = PorterStemmer()
def stem_text(text):
    '''
    Tokenize a string with NLTK's word_tokenize and return the list of
    Porter-stemmed tokens, in their original order.
    '''
    stemmed_tokens = []
    for token in word_tokenize(text):
        stemmed_tokens.append(stemmer.stem(token))
    return stemmed_tokens

In [4]:
lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    '''
    Lemmatize tokens with the WordNet lemmatizer and return them as a list.

    text: either an iterable of tokens (such as the list returned by
    stem_text) or a raw string. A raw string is tokenized with
    word_tokenize first; iterating over it directly would lemmatize it one
    character at a time.
    '''
    tokens = word_tokenize(text) if isinstance(text, str) else text
    return [lemmatizer.lemmatize(w) for w in tokens]

## Reading in the Data

Here, I read in the cleaned dataset from the previous EDA folder.

In [5]:
#load the gzip-compressed csv of cleaned admissions into a dataframe
df = pd.read_csv('data/admissions_cleaned_2.csv.gz', compression='gzip')

In [6]:
#check the column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52726 entries, 0 to 52725
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   ADMISSION_TYPE      52726 non-null  object
 1   ADMISSION_LOCATION  52726 non-null  object
 2   DISCHARGE_LOCATION  52726 non-null  object
 3   INSURANCE           52726 non-null  object
 4   LANGUAGE            52726 non-null  object
 5   RELIGION            52726 non-null  object
 6   MARITAL_STATUS      52726 non-null  object
 7   ETHNICITY           52726 non-null  object
 8   DIAGNOSIS           52726 non-null  object
 9   READMISSION         52726 non-null  int64 
 10  GENDER              52726 non-null  object
 11  AGE                 52726 non-null  int64 
 12  TEXT                52726 non-null  object
 13  CPT_CD              52726 non-null  object
 14  DIAG_ICD9_CODE      52722 non-null  object
 15  DRG_CODE            52726 non-null  object
 16  PROCED_ICD9_CODE    52

In [7]:
#count the cases in each class of the target
df['READMISSION'].value_counts()

0    49715
1     3011
Name: READMISSION, dtype: int64

The dataset is heavily imbalanced, with far more negative cases (49,715) than positive cases (3,011). To mitigate the class imbalance, I will later use three methods: undersampling the negative cases, class weights, and SMOTE.

## Preprocessing

In this section, I preprocessed the text data by removing punctuation and numbers, and lowercasing all the text. I decided to preprocess the text in this notebook rather than the first EDA notebook in case I decided to modify the text in a different way to help with my modeling.

In [8]:
#features are the discharge note text; target is the readmission flag
X, y = df['TEXT'], df['READMISSION']

In [9]:
#review the first raw note (line breaks, punctuation, numbers, casing)
#to decide how the text needs to be preprocessed
X[0]

"Admission Date:  [**2100-6-7**]              Discharge Date:   [**2100-6-9**]\n\nDate of Birth:  [**2044-4-23**]             Sex:   M\n\nService: MEDICINE\n\nAllergies:\nNo Known Allergies / Adverse Drug Reactions\n\nAttending:[**First Name3 (LF) 2024**]\nChief Complaint:\ndizziness\n\nMajor Surgical or Invasive Procedure:\nNONE\n\n\nHistory of Present Illness:\nHistory of Present Illness: Mr. [**Known lastname 83415**] is a 56 year old male\nwith PMH notable for metastatic cancer (unknown primary)\nundergoing chemo now presenting with lighheadedness and\nhemoptysis. Pt is undergoing chemo with gemcitabine ([**2100-5-24**],\n[**2100-5-31**]) for malignant neoplasm of unknown primary with multiple\nbone, muscle, and soft-tissue metastases. He came to the\nhospital to start RT to L femur today.  Pt reports a couple days\nof lightheadedness, nonpositional.  Pt reports one episode of\nhemoptysis [**2100-6-3**] but none since then. No sig lung lesions on\nchest CT [**2100-5-27**]. Was sche

In [10]:
#translation table that turns every punctuation mark and digit into a space
punc_list = string.punctuation
num_list = '0123456789'
punc_num_list = punc_list + num_list
pun_num_remove = str.maketrans({char: " " for char in punc_num_list})

#blank out line breaks, then punctuation and numbers, then lower case
X = (
    X.str.replace('\n',' ')
     .str.replace('\r',' ')
     .str.translate(pun_num_remove)
     .str.lower()
)

In [11]:
#tokenize each note and stem its words (each entry becomes a list of stems)
X = X.apply(stem_text)

In [None]:
#lemmatize the tokens of each note
X = X.apply(lemmatize_text)

In [None]:
#turn each list of tokens back into one space-separated string
X = X.apply(lambda tokens: ' '.join(tokens))

## Train, Validation, and Test Split

I split the data set into three sets: Train, Validation, and Test.
The ratio I used was 70:15:15.

In [None]:
#keep 70% for training; the remaining 30% is split again below.
#stratify so the readmission rate is the same in both parts
X_train, X_v_t, y_train, y_v_t = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
    stratify=y,
)

In [None]:
#halve the held-out 30% into validation and test sets (15% each), stratified
X_val, X_test, y_val, y_test = train_test_split(
    X_v_t,
    y_v_t,
    test_size=0.50,
    random_state=42,
    stratify=y_v_t,
)

In [None]:
#number of notes in the training set
X_train.shape

## Pipeline Parameters

In this section, I set specific pipeline parameters that I will keep constant throughout the project.

In [None]:
#english stop words passed to the vectorizers
sw = stopwords.words('english')
#vocabulary cap: 10% of the number of training notes
max_feats = int(X_train.shape[0]*0.10)
#max_feats = 300

## Modeling

In this section, I will run several different models. Once I decide the best model, I will export the results out so that in another notebook, I can merge that results to other data features and run that through models again to see if I can get a better model.

### Model 0: Dummy Classification

For my Model 0, I run a simple dummy classifier. I test out both a count vectorizer and a tfidf vectorizer to see if it will impact the results. I have yet to do anything about the class imbalance.

* Model: Dummy
* Vectorizer: Count
* Class Imbalance Solution: None

In [None]:
#baseline: count vectorizer feeding a dummy classifier that guesses labels
#at random following the training class distribution.
#max_feats is the vocabulary cap defined in the pipeline parameters cell;
#random_state makes the random guesses repeatable between runs
pp_count = Pipeline(steps=[
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats)),
    ('dummy', DummyClassifier(strategy="stratified", random_state=42))
    ])

In [None]:
pp_count.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Dummy
* Vectorizer: Tfidf
* Class Imbalance Solution: None

In [None]:
#baseline: tf-idf vectorizer feeding the same stratified dummy classifier.
#max_feats is the vocabulary cap defined in the pipeline parameters cell;
#random_state makes the random guesses repeatable between runs
pp_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats)),
    ('dummy', DummyClassifier(strategy="stratified", random_state=42)),
    ])

In [None]:
pp_tfidf.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

Though the accuracy is high for both the vectorizers through the dummy classifier model, the recall score was very poor. This is most likely due to the class imbalance. However, I will run a non-dummy, but simple model to validate that the low recall score is not because of the dummy model itself.

### Model 1: Logistic Regression

The next model I decided to use is the logistic regression model.

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: None

In [None]:
#count vectorizer -> logistic regression, no class imbalance handling.
#max_feats is the vocabulary cap defined in the pipeline parameters cell
pp_count = Pipeline(steps=[
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats)),
    ('logreg', LogisticRegression(random_state=42)),
    ])

In [None]:
pp_count.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: None

In [None]:
#tf-idf vectorizer -> logistic regression, no class imbalance handling.
#max_feats is the vocabulary cap defined in the pipeline parameters cell
pp_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats)),
    ('logreg', LogisticRegression(random_state=42)),
    ])

In [None]:
pp_tfidf.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

Again, the accuracy scores are high, but the recall scores are poor. Thus, for the next modeling step, I will try to improve the model by first trying to mitigate the class imbalance.

### Handling Imbalanced Data 

As mentioned before, the dataset is heavily imbalanced. I will implement three methods to handle the class imbalance:
* Undersample the negative cases
* Pass in class weights
* Implement SMOTE

#### Undersampling Negative Cases: 1 to 1

In [None]:
#put the training labels and text side by side in one dataframe
train_df = pd.concat([y_train, X_train], axis='columns')

In [None]:
#separate the readmitted (1) rows from the not readmitted (0) rows
train_df_pos = train_df.loc[train_df['READMISSION'] == 1]
train_df_neg = train_df.loc[train_df['READMISSION'] == 0]

In [None]:
#take a sample of the negative training set with the same number of rows as positive dataset
#(seeded so the undersampled set, and every score built on it, is reproducible)
train_df_neg_samp = train_df_neg.sample(n=len(train_df_pos), random_state=42)

In [None]:
#stack the positive rows on top of the sampled negative rows to get one
#dataset with an equal number of positive and negative cases
new_train_df = pd.concat([train_df_pos, train_df_neg_samp])

In [None]:
#randomize the row order of the new dataset (seeded for reproducibility)
new_train_df = new_train_df.sample(frac=1, random_state=42)

In [None]:
#pull the text and the label back out of the 1:1 undersampled dataframe
X_train_1 = new_train_df['TEXT']
y_train_1 = new_train_df['READMISSION']
#vocabulary cap: 10% of the number of undersampled training notes
max_feats_1 = int(len(X_train_1)*0.10)

#### Undersampling Negative Cases: 2 to 1

In [None]:
#put the training labels and text side by side in one dataframe
train_df = pd.concat([y_train, X_train], axis='columns')

In [None]:
#separate the readmitted (1) rows from the not readmitted (0) rows
train_df_pos = train_df.loc[train_df['READMISSION'] == 1]
train_df_neg = train_df.loc[train_df['READMISSION'] == 0]

In [None]:
#take a sample of the negative training set with twice as many rows as the positive dataset
#(seeded so the undersampled set, and every score built on it, is reproducible)
train_df_neg_samp = train_df_neg.sample(n=2*len(train_df_pos), random_state=42)

In [None]:
#stack the positive rows on top of the sampled negative rows to get one
#dataset with two negative cases for every positive case
new_train_df = pd.concat([train_df_pos, train_df_neg_samp])

In [None]:
#randomize the row order of the new dataset (seeded for reproducibility)
new_train_df = new_train_df.sample(frac=1, random_state=42)

In [None]:
#pull the text and the label back out of the 2:1 undersampled dataframe
X_train_2 = new_train_df['TEXT']
y_train_2 = new_train_df['READMISSION']
#vocabulary cap: 10% of the number of undersampled training notes
max_feats_2 = int(len(X_train_2)*0.10)

#### Undersampling Negative Cases: 3 to 1

In [None]:
#put the training labels and text side by side in one dataframe
train_df = pd.concat([y_train, X_train], axis='columns')

In [None]:
#separate the readmitted (1) rows from the not readmitted (0) rows
train_df_pos = train_df.loc[train_df['READMISSION'] == 1]
train_df_neg = train_df.loc[train_df['READMISSION'] == 0]

In [None]:
#take a sample of the negative training set with three times as many rows as the positive dataset
#(this is the 3 to 1 section, so the multiplier must be 3; seeded for reproducibility)
train_df_neg_samp = train_df_neg.sample(n=3*len(train_df_pos), random_state=42)

In [None]:
#stack the positive rows on top of the sampled negative rows to get one
#dataset for the 3 to 1 undersampling section
new_train_df = pd.concat([train_df_pos, train_df_neg_samp])

In [None]:
#randomize the row order of the new dataset (seeded for reproducibility)
new_train_df = new_train_df.sample(frac=1, random_state=42)

In [None]:
#pull the text and the label back out of the undersampled dataframe
X_train_3 = new_train_df['TEXT']
y_train_3 = new_train_df['READMISSION']
#vocabulary cap: 10% of the number of undersampled training notes
max_feats_3 = int(len(X_train_3)*0.10)

#### Class Weights

In [None]:
#calculate balanced class weights and keep them as a {label: weight} dict.
#compute_class_weight returns a bare array, but the class_weight argument of
#sklearn estimators only accepts a dict, 'balanced' or None, so an array is
#either rejected or not applied
class_labels = np.array([0, 1])
class_weights = dict(zip(
    class_labels.tolist(),
    class_weight.compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train),
))

In [None]:
#show the computed class weights
class_weights

### Model 1.1: Logistic Regression - Undersampling

In this section, I applied the undersampled dataset to logistic regression model.

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: Undersampling Negative 1:1

In [None]:
#count vectorizer -> logistic regression, sized for the 1:1 undersampled set
pp_count = Pipeline([
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats_1)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_count.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: Undersampling Negative 1:1

In [None]:
#tf-idf vectorizer -> logistic regression, sized for the 1:1 undersampled set
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats_1)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_tfidf.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: Undersampling Negative 2:1

In [None]:
#count vectorizer -> logistic regression, sized for the 2:1 undersampled set
pp_count = Pipeline([
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats_2)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_count.fit(X_train_2, y_train_2)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_2, y_train_2, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: Undersampling Negative 2:1

In [None]:
#tf-idf vectorizer -> logistic regression, sized for the 2:1 undersampled set
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats_2)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_tfidf.fit(X_train_2, y_train_2)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_2, y_train_2, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: Undersampling Negative 3:1

In [None]:
#count vectorizer -> logistic regression, sized for the 3:1 undersampled set
pp_count = Pipeline([
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats_3)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_count.fit(X_train_3, y_train_3)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_3, y_train_3, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: Undersampling Negative 3:1

In [None]:
#tf-idf vectorizer -> logistic regression, sized for the 3:1 undersampled set
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats_3)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_tfidf.fit(X_train_3, y_train_3)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_3, y_train_3, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

After using the undersampled dataset, though the accuracy went down, the recall score went up. The ratio of 1 to 1 of negative to positive cases had the best recall scores.

### Model 1.2: Logistic Regression - Class Weights

In this section, I passed in the class weights in the logistic regression model.

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: Class Weights

In [None]:
#count vectorizer -> logistic regression that is handed the class weights
pp_count = Pipeline([
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats)),
    ('logreg', LogisticRegression(random_state=42, class_weight=class_weights)),
])

In [None]:
pp_count.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: Class Weights

In [None]:
#tf-idf vectorizer -> logistic regression that is handed the class weights
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats)),
    ('logreg', LogisticRegression(random_state=42, class_weight=class_weights)),
])

In [None]:
pp_tfidf.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

As seen above, passing in the class weights did not improve the recall score.

### Model 1.3: Logistic Regression - SMOTE

In this section, I utilized SMOTE to oversample the imbalanced class.

* Model: Logistic Regression
* Vectorizer: Count
* Class Imbalance Solution: SMOTE

In [None]:
#imblearn pipeline: count vectorizer -> SMOTE oversampling -> logistic regression
pp_count = imbpipeline([
    ('count', CountVectorizer(stop_words=sw, max_features=max_feats)),
    ('smote', SMOTE(random_state=42)),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_count.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_count, split_X, split_y, split_name)

* Model: Logistic Regression
* Vectorizer: Tfidf
* Class Imbalance Solution: SMOTE

In [None]:
#imblearn pipeline: tf-idf vectorizer -> SMOTE oversampling -> logistic regression.
#max_features uses max_feats, the same vocabulary cap as the count/SMOTE
#pipeline (the previous value was a garbled, unparsable token)
pp_tfidf = imbpipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats)),
    ('smote', SMOTE(random_state=42)),
    ('logreg', LogisticRegression(random_state=42)),
    ])

In [None]:
pp_tfidf.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

SMOTE did improve the recall score. However, running SMOTE inside a grid search would take too much computation power. Therefore, I will test SMOTE without the grid search and use the default parameters for the models.

### Introducing N-Grams

In [None]:
#tf-idf over unigrams, bigrams and trigrams -> logistic regression
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats_1, ngram_range=(1,3))),
    ('logreg', LogisticRegression(random_state=42)),
])

In [None]:
pp_tfidf.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

### Model X: Multinomial Naive Bayes

In [None]:
#tf-idf vectorizer -> multinomial naive bayes with default parameters
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats_1)),
    ('mnb', MultinomialNB()),
])

In [None]:
pp_tfidf.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

### Model X: Random Forest

In [None]:
#tf-idf vectorizer -> random forest; the forest is seeded so the grid search
#results and the reported scores are reproducible
pp_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=420)),
    ('rf', RandomForestClassifier(random_state=42)),
    ])

In [None]:
#hyperparameter grid for the random forest step.
#'auto' was dropped from max_features: for a forest classifier it is the
#same setting as 'sqrt', so it only made the search fit every candidate
#twice; 'log2' gives a genuinely different second option
grid = {'rf__max_depth': [8,10],
        'rf__max_features': ['sqrt', 'log2'],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__min_samples_split': [2, 5, 7]}

In [None]:
#3-fold grid search over the random forest pipeline, selecting on recall
gs_rf = GridSearchCV(pp_tfidf, grid, scoring='recall', cv=3, verbose=2, n_jobs=-1)

In [None]:
gs_rf.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(gs_rf, split_X, split_y, split_name)

In [None]:
#hyperparameter combination selected by the grid search
gs_rf.best_params_

In [None]:
#NOTE(review): presumably the best_params_ output kept from an earlier run - confirm
"""
{'rf__max_depth': 10,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 7}
"""

### Model 2: K-Nearest Neighbors

In [None]:
#tf-idf vectorizer limited to 20 features -> k-nearest neighbors
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=20)),
    ('knn', KNeighborsClassifier()),
])

In [None]:
#odd neighbor counts from 3 to 9, with both weighting schemes
grid = {
    'knn__n_neighbors': [3, 5, 7, 9],
    'knn__weights': ['uniform', 'distance'],
}

In [None]:
#3-fold grid search over the knn pipeline, selecting on recall
gs_knn = GridSearchCV(pp_tfidf, grid, scoring='recall', cv=3, verbose=2, n_jobs=-1)

In [None]:
gs_knn.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(gs_knn, split_X, split_y, split_name)

### Model 3: Decision Tree Model

In [None]:
#tf-idf vectorizer limited to 200 features -> decision tree; the tree is
#seeded so the grid search results and reported scores are reproducible
pp_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=200)),
    ('dt', DecisionTreeClassifier(random_state=42)),
    ])

In [None]:
#hyperparameter grid for the decision tree step
grid = dict(
    dt__criterion=['gini', 'entropy'],
    dt__max_depth=np.arange(1, 15, 2),
    dt__min_samples_split=np.arange(50, 100, 5),
    dt__min_samples_leaf=np.arange(50, 100,5),
)

In [None]:
#3-fold grid search over the decision tree pipeline, selecting on recall
gs_dt = GridSearchCV(pp_tfidf, grid, scoring='recall', cv=3, verbose=2, n_jobs=-1)

In [None]:
gs_dt.best_params_

In [None]:
"""
{'dt__criterion': 'gini',
 'dt__max_depth': 1,
 'dt__min_samples_leaf': 50,
 'dt__min_samples_split': 50}
"""

In [None]:
gs_dt.fit(X_train_2, y_train_2)

scores(gs_dt, X_train_2, y_train_2, 'Train')
scores(gs_dt, X_val, y_val, 'Validation')
scores(gs_dt, X_test, y_test, 'Test')

### Model 4: Random Forest

In [None]:
#tf-idf vectorizer -> random forest; the forest is seeded so the grid search
#results and the reported scores are reproducible
pp_tfidf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=420)),
    ('rf', RandomForestClassifier(random_state=42)),
    ])

In [None]:
#hyperparameter grid for the random forest step.
#'auto' was dropped from max_features: for a forest classifier it is the
#same setting as 'sqrt', so it only made the search fit every candidate
#twice; 'log2' gives a genuinely different second option
grid = {'rf__max_depth': [8,10],
        'rf__max_features': ['sqrt', 'log2'],
        'rf__min_samples_leaf': [1, 2, 4],
        'rf__min_samples_split': [2, 5, 7]}

In [None]:
#3-fold grid search over the random forest pipeline, selecting on recall
gs_rf = GridSearchCV(pp_tfidf, grid, scoring='recall', cv=3, verbose=2, n_jobs=-1)

In [None]:
#NOTE(review): presumably the best_params_ output kept from an earlier run - confirm
"""
{'rf__max_depth': 10,
 'rf__max_features': 'sqrt',
 'rf__min_samples_leaf': 2,
 'rf__min_samples_split': 7}
"""

In [None]:
gs_rf.fit(X_train_1, y_train_1)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_1, y_train_1, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(gs_rf, split_X, split_y, split_name)

In [None]:
#hyperparameter combination selected by the grid search
gs_rf.best_params_

### Model 5: XGBoost

### Model 6: AdaBoost

### Model 7: Gradient Boost

In [None]:
#tf-idf vectorizer limited to 50 features -> gradient boosting
pp_tfidf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=50)),
    ('gbc', GradientBoostingClassifier(random_state=42)),
])

In [None]:
#hyperparameter grid for the gradient boosting step.
#'sqrt' replaces 'auto': for the classifier 'auto' meant sqrt(n_features),
#and that alias is deprecated in newer scikit-learn releases
grid = {'gbc__max_features': ['sqrt','log2'],
        'gbc__min_samples_split':[10, 50, 100, 1000],
        'gbc__max_depth':[3, 5, 10, 20]
       }

In [None]:
#3-fold grid search over the gradient boosting pipeline, selecting on recall
gs_gbc = GridSearchCV(pp_tfidf, grid, scoring='recall', cv=3, verbose=2, n_jobs=-1)

In [None]:
gs_gbc.fit(X_train_2, y_train_2)

#train scores use the undersampled set; validation/test keep the real class mix
for split_X, split_y, split_name in (
    (X_train_2, y_train_2, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(gs_gbc, split_X, split_y, split_name)

In [None]:
#hyperparameter combination selected by the grid search
gs_gbc.best_params_

In [None]:
#NOTE(review): presumably the best_params_ output kept from an earlier run - confirm
"""
{'gbc__max_depth': 3,
 'gbc__max_features': 'log2',
 'gbc__min_samples_split': 10}
"""

In [None]:
#imblearn pipeline: tf-idf vectorizer -> SMOTE oversampling -> gradient boosting.
#max_feats is the vocabulary cap defined in the pipeline parameters cell
pp_tfidf = imbpipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=sw, max_features=max_feats)),
    ('smote', SMOTE(random_state=42)),
    ('gbc', GradientBoostingClassifier(random_state=42)),
    ])

In [None]:
#grid left disabled: the SMOTE pipeline is fitted below with default parameters
#grid = {'gbc__max_features': ['auto','log2'],
#        'gbc__min_samples_split':[10, 50, 100, 1000],
#        'gbc__max_depth':[3, 5, 10, 20]
#       }

In [None]:
# grid search left disabled: the SMOTE pipeline is fitted directly below
# gs_gbc = GridSearchCV(estimator=pp_tfidf, 
#                       param_grid=grid, 
#                       scoring='recall', 
#                       cv=3,
#                       verbose=2,
#                       n_jobs=-1
#                      )

In [None]:
pp_tfidf.fit(X_train, y_train)

#report accuracy, recall and the confusion matrix on every split
for split_X, split_y, split_name in (
    (X_train, y_train, 'Train'),
    (X_val, y_val, 'Validation'),
    (X_test, y_test, 'Test'),
):
    scores(pp_tfidf, split_X, split_y, split_name)

### Model 8: Neural Network

### Model 9: Convolutional Neural Network