## Natural Language Processing

We aim to predict the purpose of a loan, given by the <i>'purpose'</i> column, using the description entered by the applicant in the <i>'desc'</i> column.

In [1]:
#-----------------------------------packages-----------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import  TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
#----------------------------------load data-----------------------------------
# Read the loan dataset from a path relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='data/nlp_data.csv')

In [3]:
#----------------------------------nlp model-----------------------------------
# Features: the 'desc' column; target: the 'purpose' column.
X = data['desc']
y = data['purpose']

# Lazily-filled cache of stop-word sets, keyed by language, so the NLTK corpus
# is read once instead of once per token.
_STOPWORD_CACHE = {}


def text_process(text, stop_words=None):
    """Tokenize a description: strip punctuation, then drop stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenize.
    stop_words : collection of str, optional
        Lowercase words to discard. When omitted, NLTK's English stop-word
        list is used (loaded on first call and cached afterwards).

    Returns
    -------
    list of str
        Whitespace-separated tokens of ``text`` with every character in
        ``string.punctuation`` removed and stop words filtered out. Tokens
        keep their original casing; only the stop-word comparison is
        case-insensitive.
    """
    if stop_words is None:
        if 'english' not in _STOPWORD_CACHE:
            # frozenset gives O(1) membership tests; the original re-read the
            # corpus list and scanned it linearly for every single token.
            _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
        stop_words = _STOPWORD_CACHE['english']

    # Delete all punctuation characters in one pass.
    no_punc = text.translate(str.maketrans('', '', string.punctuation))

    # remove common words
    return [word for word in no_punc.split()
            if word.lower() not in stop_words]

# Fixed seed so a fresh Restart & Run All reproduces the same split and report.
RANDOM_STATE = 42

# Split the raw descriptions BEFORE vectorizing: the vocabulary must be learned
# from the training portion only, otherwise words that occur solely in the
# held-out descriptions leak into the feature space and bias the evaluation.
desc_train, desc_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=RANDOM_STATE)

# Bag-of-words counts using text_process as the analyzer. X itself is left
# untouched (still the raw text), so this cell can be re-run safely.
cv = CountVectorizer(analyzer=text_process)
X_train = cv.fit_transform(desc_train)
# Tokens unseen during fitting are simply ignored when transforming test data.
X_test = cv.transform(desc_test)

model = MultinomialNB()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# zero_division=0 scores undefined precision/recall as 0 without warnings.
print(classification_report(y_test, preds, zero_division=0))