<a href="https://colab.research.google.com/github/abarnett1999/AIPI-540-NLP-Team2-Project/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Classification using Word Counts / TFIDF

In [None]:
import os
import numpy as np
import pandas as pd
import string
import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import urllib.request
import zipfile

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
# One-time install of the spaCy model that is loaded below:
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

import nltk
from nltk.stem import WordNetLemmatizer
# Corpora fetched for the WordNet-based lemmatizer
nltk.download('omw-1.4')
nltk.download('wordnet')

import warnings
# Silence only deprecation-style noise. A blanket 'ignore' would also hide
# actionable warnings (for example a solver reporting that it did not converge).
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score

### **Prepare Data**

In [None]:
# Read the labelled reports from disk and keep only the three columns used
# downstream: the source file name, the impression text and the target label.
data_df = pd.read_csv('labeled_data.csv').loc[:, ['filename', 'impression', 'label']]

In [None]:
# Preview the loaded reports together with their labels
data_df

Unnamed: 0,filename,impression,label
0,1.xml,Normal chest x-XXXX.,0
1,10.xml,No acute cardiopulmonary process.,0
2,100.xml,No active disease.,0
3,1000.xml,Increased opacity in the right upper lobe wit...,1
4,1001.xml,Diffuse fibrosis. No visible focal acute disease.,1
...,...,...,...
410,1377.xml,No acute radiographic cardiopulmonary process.,0
411,1378.xml,Negative for acute abnormality.,0
412,1379.xml,No acute cardiopulmonary findings.,0
413,138.xml,No acute preoperative findings.,0


In [None]:
# Hold out 20% of the reports for testing; the fixed seed keeps the split repeatable
X, y = data_df['impression'], data_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

**Pipeline to Streamlit**

In [None]:
# from sklearn.pipeline import make_pipeline

# # The pipeline will save all transformations and weights to make predictions
# # easier. The pipeline can be saved out and reused easily.
# # Note: make_pipeline takes the steps as positional arguments (not a list).
# pipeline = make_pipeline(
#           tokenize(), # <--- This needs the sklearn TransformerMixin or something similar 
#           TfidfVectorizer(),
#           LogisticRegression(solver = "saga")
# )

# pipeline.fit(X_train, y_train)
# pipeline.predict(X_test)

### **Pre-process text**

In [None]:
# Tokenize the text (with NLTK or spaCy, selected by `method`), remove stopwords
# and punctuation, lemmatize, and join the remaining tokens back into one string

def tokenize(sentence,method):
    """Tokenize and lemmatize text, removing stopwords and punctuation.

    Parameters
    ----------
    sentence : str
        Raw text to clean. ``None`` or a float NaN (what pandas uses for an
        empty CSV field) is treated as empty text.
    method : str
        ``'nltk'`` uses the NLTK tokenizer and the WordNet lemmatizer; any
        other value uses the module-level spaCy pipeline ``nlp``.

    Returns
    -------
    str
        The surviving lemmas, lower-cased and joined by single spaces.
    """
    # Missing text has no tokens; return early instead of crashing the tokenizer.
    if sentence is None or (isinstance(sentence, float) and np.isnan(sentence)):
        return ""

    # Sets give exact, constant-time membership tests.
    stopwords = frozenset(STOP_WORDS)
    punctuations = frozenset(string.punctuation)

    def keep(token):
        # Drop stopwords and any token made up solely of punctuation characters
        # (all() is True for an empty token, so those are dropped as well).
        if all(char in punctuations for char in token):
            return False
        return token not in stopwords

    if method=='nltk':
        # Lower-case before filtering: the stop-word list is lower-case, so a
        # capitalised "No" would otherwise survive while "no" is removed. This
        # also makes the branch consistent with the spaCy branch below.
        tokens = [word.lower() for word in nltk.word_tokenize(sentence,preserve_line=True)]
        # Remove stopwords and punctuation
        tokens = [word for word in tokens if keep(word)]
        # Lemmatize
        wordnet_lemmatizer = WordNetLemmatizer()
        tokens = [wordnet_lemmatizer.lemmatize(word) for word in tokens]
    else:
        # Tokenize with every pipeline component except the listed ones disabled.
        # NOTE(review): spaCy's rule-based lemmatizer normally relies on
        # part-of-speech tags from components that are disabled here - confirm
        # the lemmas are acceptable before switching to this branch.
        with nlp.select_pipes(enable=['tokenizer','lemmatizer']):
            doc = nlp(sentence)
        # Lemmatize
        tokens = [word.lemma_.lower().strip() for word in doc]
        # Remove stopwords and punctuation
        tokens = [word for word in tokens if keep(word)]

    # NOTE(review): the stop-word list appears to include negations such as
    # "no" / "not", which can carry meaning in clinical text - confirm this
    # removal is intended.
    return " ".join(tokens)

In [None]:
# Clean every training report with the NLTK path of tokenize().
# tqdm.pandas() registers progress_apply on pandas objects so a progress bar is shown.
tqdm.pandas()
X_train_processed = X_train.progress_apply(lambda text: tokenize(text, method='nltk'))

100%|██████████| 332/332 [00:04<00:00, 69.31it/s]


In [None]:
# Clean every held-out report the same way as the training reports (NLTK path).
tqdm.pandas()
X_test_processed = X_test.progress_apply(lambda text: tokenize(text, method='nltk'))

100%|██████████| 83/83 [00:00<00:00, 1114.58it/s]


## **Create features**

In [None]:
def build_features(train_data, test_data, ngram_range, method):
    """Vectorize pre-processed text into n-gram feature matrices.

    The vectorizer is fitted on ``train_data`` only and then applied to
    ``test_data``, so the test matrix shares the training vocabulary.

    Parameters
    ----------
    train_data, test_data : iterable of str
        Pre-processed documents.
    ngram_range : tuple
        ``(min_n, max_n)`` passed straight to the vectorizer.
    method : str
        ``'tfidf'`` for TF-IDF weights or ``'count'`` for raw counts.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as returned by the vectorizer's
        ``fit_transform`` and ``transform``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'tfidf'`` nor ``'count'``.
    """
    if method == 'tfidf':
        # Create features using TFIDF
        vec = TfidfVectorizer(ngram_range=ngram_range)
    elif method == 'count':
        # Create features using word counts
        vec = CountVectorizer(ngram_range=ngram_range)
    else:
        # Fail loudly instead of reaching the return with unbound variables.
        raise ValueError(
            "Unknown method {!r}; expected 'tfidf' or 'count'".format(method)
        )

    X_train = vec.fit_transform(train_data)
    X_test = vec.transform(test_data)
    return X_train, X_test

#### **Option 1: Count Vectorization**

In [None]:
# Option 1: bag-of-words counts over unigrams and bigrams.
# Caution: this rebinds X_train / X_test from the raw text splits to feature
# matrices, so the pre-processing cells cannot be re-run after this one
# without re-running the train/test split first.
method, ngram_range = 'count', (1, 2)
X_train, X_test = build_features(X_train_processed, X_test_processed, ngram_range, method)

In [None]:
# Show the shape and sparsity summary of the count feature matrix
X_train

<332x1998 sparse matrix of type '<class 'numpy.int64'>'
	with 5460 stored elements in Compressed Sparse Row format>

### **Train model - Count Vectorization**

In [None]:
# Logistic regression classifier on the current feature matrix

# Train on training set
# saga shuffles the data, so pin random_state to make the fit (and the metrics
# printed below) reproducible across runs.
# NOTE(review): max_iter is left at the library default; compare
# logreg_model.n_iter_ with it to confirm the solver actually converged.
logreg_model = LogisticRegression(solver='saga', random_state=0)
logreg_model.fit(X_train,y_train)
preds = logreg_model.predict(X_train)
acc = (preds == y_train).mean()
recall = recall_score(y_train, preds)
print('Accuracy on the training set is {:.3f}'.format(acc))
print('Recall on the training set is {:.3f}'.format(recall))

Accuracy on the training set is 0.988
Recall on the training set is 0.974


In [None]:
# Score the held-out reports with the classifier fitted in the previous cell

test_preds = logreg_model.predict(X_test)
# Fraction of test predictions that match the true label
test_acc = (test_preds == y_test).mean()
test_recall = recall_score(y_test, test_preds)
for message, value in (
    ('Accuracy on the test set is {:.3f}', test_acc),
    ('Recall on the test set is {:.3f}', test_recall),
):
    print(message.format(value))

Accuracy on the test set is 0.916
Recall on the test set is 0.917


#### **Option 2: TFIDF**

In [None]:
# Option 2: TF-IDF weights over unigrams and bigrams.
# Caution: this rebinds X_train / X_test to the TF-IDF matrices; it reads only
# the *_processed text series, so it is safe to run after the count option.
method, ngram_range = 'tfidf', (1, 2)
X_train, X_test = build_features(X_train_processed, X_test_processed, ngram_range, method)

In [None]:
# Show the shape and sparsity summary of the TF-IDF feature matrix
X_train

<332x1998 sparse matrix of type '<class 'numpy.float64'>'
	with 5460 stored elements in Compressed Sparse Row format>

### **Train model - TFIDF**

In [None]:
# Logistic regression classifier on the current feature matrix

# Train on training set
# saga shuffles the data, so pin random_state to make the fit (and the metrics
# printed below) reproducible across runs.
# NOTE(review): max_iter is left at the library default; compare
# logreg_model.n_iter_ with it to confirm the solver actually converged.
logreg_model = LogisticRegression(solver='saga', random_state=0)
logreg_model.fit(X_train,y_train)
preds = logreg_model.predict(X_train)
acc = (preds == y_train).mean()
recall = recall_score(y_train, preds)
print('Accuracy on the training set is {:.3f}'.format(acc))
print('Recall on the training set is {:.3f}'.format(recall))

Accuracy on the training set is 0.961
Recall on the training set is 0.915


In [None]:
# Score the held-out reports with the classifier fitted in the previous cell

test_preds = logreg_model.predict(X_test)
# Fraction of test predictions that match the true label
test_acc = (test_preds == y_test).mean()
test_recall = recall_score(y_test, test_preds)
for message, value in (
    ('Accuracy on the test set is {:.3f}', test_acc),
    ('Recall on the test set is {:.3f}', test_recall),
):
    print(message.format(value))

Accuracy on the test set is 0.916
Recall on the test set is 0.875
