- **Requirements classification in the customer service area for software companies using Machine Learning and Natural Language Processing**

The dataset used in this work is "TCC.xlsx". It contains the requirements, requests and petitions submitted to the technical support area of the company SIGMA Ingeniería S.A of Manizales. 

The fields used in this work are "description" and "category". The goal is to find the best-performing technique for classifying the descriptions, so that it can be implemented in the company to classify future requirements automatically. The predicted category is intended to provide the solution protocols for the request presented by the client, offering a better-quality response and reducing the response time of the company's service and technical support area towards the client. 

# Libraries

In [None]:
#Classification Methods
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

#NLP
import nltk 
nltk.download('stopwords') 
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
import spacy 
import en_core_web_sm

#Metrics
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from yellowbrick.classifier import ClassificationReport 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

#Tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import model_selection
from scipy.sparse import csr_matrix 
import string 
import time as tm
import os

import warnings
warnings.filterwarnings('ignore')

# Functions

In [None]:
# spaCy English pipeline, loaded at module level and reused by lemmatizer().
nlp = spacy.load("en_core_web_sm")


def lemmatizer(text):
    """Return `text` with every spaCy token replaced by its lemma.

    The text is run through the module-level `nlp` pipeline and the lemmas of
    the resulting tokens are joined with single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemmas)

In [None]:
def classifier_metrics(X_train, X_test, y_train, y_test, models=None):
    """Fit, score and cross-validate each classifier, printing the results.

    For every estimator the function prints the hold-out training and
    prediction times, the accuracy and the weighted F1 / recall / precision
    on the test split, and the mean and standard deviation of a 10-fold
    cross-validated accuracy computed on the train and test splits stacked
    together.

    Parameters
    ----------
    X_train, X_test : sparse matrix or array-like
        Feature matrices of the two splits (anything `csr_matrix` accepts).
    y_train, y_test : array-like
        Labels matching the rows of `X_train` / `X_test`.
    models : iterable of estimators, optional
        Estimators to evaluate. When omitted, the module-level `classifiers`
        list is used, which is how existing callers drive this function.

    Returns
    -------
    None
        All results are printed.
    """
    # Imported locally because the file only imports csr_matrix from scipy.sparse.
    from scipy.sparse import vstack as sparse_vstack

    def needs_dense_input(model):
        """Return True for the estimators that are fed dense arrays."""
        description = str(model)
        return "GaussianNB" in description or "LinearDiscriminantAnalysis" in description

    def metrics(model, X_fit, X_eval, dense):
        """Print hold-out and cross-validation results for a single model."""
        print("\nHold-Out in process...")
        start_time = tm.time()
        model.fit(X_fit, y_train)
        elapsed = tm.time() - start_time
        print("Time, Training: {0:.4f} [seconds]".format(elapsed))
        start_time = tm.time()
        y_pred = model.predict(X_eval)
        elapsed = tm.time() - start_time
        print("Time, Prediction: {0:.4f} [seconds]".format(elapsed))
        accuracy_s = accuracy_score(y_test, y_pred)
        f1_s = f1_score(y_test, y_pred, average='weighted')
        recall_s = recall_score(y_test, y_pred, average='weighted')
        precision_s = precision_score(y_test, y_pred, average='weighted')
        print('accuracy_score: {0:.4f}'.format(accuracy_s))
        print('f1_score: {0:.4f}'.format(f1_s))
        print('recall_score: {0:.4f}'.format(recall_s))
        print('precision_score: {0:.4f}'.format(precision_s))

        print('\nCross-Validation in process...')
        start_time = tm.time()
        kfold = model_selection.KFold(n_splits=10)
        # Cross-validation runs on train and test rows stacked together.
        y_CV = np.concatenate((y_train, y_test))
        if dense:
            X_CV = np.concatenate((X_fit, X_eval))
        else:
            # Stack the sparse splits directly instead of densifying them
            # first: same matrix, without materialising a dense copy.
            X_CV = sparse_vstack((X_fit, X_eval), format="csr")
        cv_results = np.array(model_selection.cross_val_score(
            model, X_CV, y_CV, cv=kfold, scoring='accuracy', n_jobs=-1))

        # Folds whose score is NaN are left out of the mean / std.
        cv_results = cv_results[~np.isnan(cv_results)]
        elapsed = tm.time() - start_time
        print("Time, CV: {0:.4f} [seconds]".format(elapsed))
        print('CV: {0:.4f} {1:.4f}'.format(cv_results.mean(), cv_results.std()))

    if models is None:
        # Backward-compatible default: the list defined by the calling cell.
        models = classifiers

    # Convert once; the dense variants are derived per model only when needed.
    sparse_train = csr_matrix(X_train)
    sparse_test = csr_matrix(X_test)

    for model in models:
        print ("---------------------------------------------------------------------------------\n")
        print(str(model))
        if needs_dense_input(model):
            metrics(model, sparse_train.toarray(), sparse_test.toarray(), True)
        else:
            metrics(model, sparse_train, sparse_test, False)
        print()
        

In [None]:
# Directory that CR_viz() and CM_viz() save their PDF figures into.
path_figures = "../images"
# exist_ok=True creates the directory only when it is missing, without a
# separate existence check that another process could race with.
os.makedirs(path_figures, exist_ok=True)

# Classification report
def CR_viz():
    """Draw and save the yellowbrick classification report of the selected model.

    Reads the module-level `model_selected`, `model_name`, `classes`,
    `X_train`, `y_train`, `X_test`, `y_test` and `path_figures`. The figure is
    saved as `<path_figures>/<model_name>_CR.pdf`. Returns None.
    """
    figure = plt.figure(figsize=(15, 20))
    report_title = "Classification Report - " + model_name
    output_file = path_figures + "/" + model_name + "_CR" + ".pdf"

    visualizer = ClassificationReport(
        model_selected,
        classes=classes,
        support=True,
        cmap='Blues',
        title=report_title,
    )
    # fit() trains the wrapped model, score() evaluates it on the test split.
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.poof()

    figure.show()
    figure.savefig(output_file, bbox_inches="tight")

# Confusion matrix
def CM_viz():
    """Fit the selected model and save its confusion matrix as a heatmap PDF.

    Reads the module-level `model_selected`, `model_name`, `X_train`,
    `y_train`, `X_test`, `y_test` and `path_figures`. The figure is saved as
    `<path_figures>/<model_name>_CM.pdf`. Returns None.
    """
    model_selected.fit(X_train, y_train)
    y_pred = model_selected.predict(X_test)

    # One explicit, sorted label list drives both the matrix and the tick
    # labels. Deriving the ticks from y_test alone mislabels the axes whenever
    # the model predicts a class that does not occur in y_test.
    labels = np.unique(np.concatenate((np.asarray(y_test), np.asarray(y_pred))))
    conf = confusion_matrix(y_test, y_pred, labels=labels)

    plt.figure(figsize=(42, 42))
    annot_kws = {'fontsize': 20, 'verticalalignment': 'center'}
    ax = sns.heatmap(conf, annot=True, cmap='Blues', fmt='d', annot_kws=annot_kws,
                     xticklabels=labels, yticklabels=labels)
    # Presumably superseded by the plt.title / xlabel / ylabel calls below,
    # which set the same texts with a larger font -- TODO confirm and drop.
    ax.set(title="Confusion Matrix with labels", xlabel="Predicted Values", ylabel="Actual Values")
    # NOTE(review): kept after the heatmap call, as in the original; verify
    # whether the font scale is meant to apply to this figure.
    sns.set(font_scale=2)
    plt.title("Confusion Matrix - "+model_name, fontsize=35)
    plt.xlabel("Predicted Values", fontsize=35)
    plt.ylabel("Actual Values", fontsize=35)
    plt.savefig(path_figures+"/"+model_name+"_CM"+".pdf", bbox_inches="tight")

# Dataset preparation

In [None]:
# Read the support-request spreadsheet into a DataFrame and show its shape.
filename = '../Data/TCC.xlsx'
DataSet0 = pd.read_excel(filename, engine='openpyxl')
DataSet0.shape 

# Machine learning application

## 1. Original Dataset (OD)

In [None]:
# X holds the feature used for classification (the free-text 'description'
# column); y holds the target labels (the 'category' column).
X, y = DataSet0['description'], DataSet0['category']

In [None]:
# Hold out 20% of the rows for validation and train on the remaining 80%.
# The split is stratified on y and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=8,
    stratify=y,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Feature extraction (vectorization): the text of each description is encoded
# as floating-point TF-IDF weights so it can be fed to the ML algorithms.
# Unigrams and bigrams are used, NLTK English stop words are removed and terms
# present in fewer than 5 documents are ignored.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words=stopwords.words("english"),
)
# The vocabulary is learned from the training split only and then reused
# unchanged to encode the test split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# Candidate models, all with default hyper-parameters except the parallelism
# of the tree ensembles and the solver of the logistic regression.
# classifier_metrics() reads this module-level list.
classifiers = [
    svm.SVC(),
    ExtraTreesClassifier(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
    LogisticRegression(solver='liblinear'),
    DecisionTreeClassifier(),
    LinearDiscriminantAnalysis(),
    GaussianNB(),
    KNeighborsClassifier(),
]

# Print hold-out and cross-validation metrics for every model above.
classifier_metrics(X_train, X_test, y_train, y_test)

## 2. Dataset with Preprocessing (DP)

In [None]:
# Normalise the case of every description so that tokens differing only in
# capitalisation are treated as the same term.
DataSet0 = DataSet0.assign(description=DataSet0['description'].str.lower())
DataSet0.shape

In [None]:
# Remove punctuation marks: every ASCII punctuation character is replaced by a
# single space. One vectorised str.translate pass replaces the former
# per-character / per-row loop, which assigned through chained indexing
# (DataSet0['description'][row] = ...) and relied on a 0..n-1 index.
punct = string.punctuation
punct_to_space = str.maketrans(punct, " " * len(punct))

DataSet0['description'] = DataSet0['description'].str.translate(punct_to_space)
DataSet0.shape

In [None]:
# Lemmatize the description field: lemmatizer() replaces each token by its
# spaCy lemma (this is lemmatization, not stemming).
DataSet0['description'] = DataSet0['description'].apply(lemmatizer)
DataSet0.shape

In [None]:
# X is the feature column (the preprocessed 'description' text) and y is the
# label column ('category') that the models have to predict.
X, y = DataSet0['description'], DataSet0['category']

In [None]:
# 80% of the rows are used for training and the remaining 20% for validation;
# the split is stratified on the category and uses a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=8,
    stratify=y,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Vectorization: turn the preprocessed descriptions into TF-IDF features
# (unigrams + bigrams, English stop words removed, terms present in fewer than
# 5 documents ignored) so that they can be used by the ML algorithms.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words=stopwords.words("english"),
)
# Fit on the training split only, then encode the test split with the same
# vocabulary.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))


In [None]:
# Models evaluated on the preprocessed dataset; classifier_metrics() reads this
# module-level list.
classifiers = [
    svm.SVC(),
    ExtraTreesClassifier(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
]

# Print hold-out and cross-validation metrics for each of them.
classifier_metrics(X_train, X_test, y_train, y_test)

## 3. Dataset with Preprocessing and Balancing (DPB)

In [None]:
# Convert the descriptions to lowercase. When the notebook is run top to
# bottom the column was already lowercased in section 2, so this only matters
# if this section is run on a freshly loaded DataSet0.
DataSet0 = DataSet0.assign(description=DataSet0['description'].str.lower())
DataSet0.shape

In [None]:
# Remove punctuation marks: each ASCII punctuation character becomes a space.
# A single vectorised str.translate call is used instead of looping over every
# character and row with a chained-indexing assignment
# (DataSet0['description'][row] = ...), which depended on a 0..n-1 index.
punct = string.punctuation
punct_to_space = str.maketrans(punct, " " * len(punct))

DataSet0['description'] = DataSet0['description'].str.translate(punct_to_space)
DataSet0.shape

In [None]:
# Lemmatize the description field with lemmatizer() (spaCy lemmas joined by
# spaces); despite the original wording this is lemmatization, not stemming.
DataSet0['description'] = DataSet0['description'].apply(lemmatizer)
DataSet0.shape

In [None]:
# Features and labels: X is the preprocessed 'description' text that determines
# the category, y is the 'category' label of each request.
X, y = DataSet0['description'], DataSet0['category']

In [None]:
# Vectorization: encode every description as TF-IDF weights (unigrams and
# bigrams, English stop words removed, terms in fewer than 5 documents ignored).
# NOTE(review): here the vectorizer is fitted on the complete dataset, i.e.
# before the train/test split performed further down, so the vocabulary and
# IDF weights also see the rows that later become the test set.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words=stopwords.words("english"),
)
X = vectorizer.fit_transform(X)

print(*(part.shape for part in (X, y)))


In [None]:
# Balance the classes by oversampling with SMOTE (fixed seed).
# NOTE(review): the resampling is applied to the complete dataset, before the
# train/test split below, so synthetic rows built from future test rows can end
# up in the training set and inflate the scores -- confirm this is intended.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print(f"Shape of X before SMOTE: {X.shape}\nShape of X after SMOTE: {X_sm.shape}")


In [None]:
# Split the SMOTE-balanced data: 80% for training, 20% for validation,
# stratified on the (balanced) labels with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.20,
    random_state=8,
    stratify=y_sm,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Models evaluated on the preprocessed and balanced dataset;
# classifier_metrics() reads this module-level list.
classifiers = [
    svm.SVC(),
    ExtraTreesClassifier(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
]

# Print hold-out and cross-validation metrics for each of them.
classifier_metrics(X_train, X_test, y_train, y_test)

## 4. Dataset with Preprocessing and Balancing, Optimization of parameters (DPBO)

In [None]:
# Convert the descriptions to lowercase. When the notebook is run top to
# bottom this was already done in the earlier sections, so it only has an
# effect if this section is run on a freshly loaded DataSet0.
DataSet0 = DataSet0.assign(description=DataSet0['description'].str.lower())
DataSet0.shape

In [None]:
# Remove punctuation marks by mapping every ASCII punctuation character to a
# space in one vectorised str.translate pass. This replaces the nested loop
# that wrote through chained indexing (DataSet0['description'][row] = ...),
# which was slow and depended on a 0..n-1 index.
punct = string.punctuation
punct_to_space = str.maketrans(punct, " " * len(punct))

DataSet0['description'] = DataSet0['description'].str.translate(punct_to_space)
DataSet0.shape

In [None]:
# Lemmatize the description field: every token is replaced by its spaCy lemma
# through lemmatizer() (lemmatization rather than stemming).
DataSet0['description'] = DataSet0['description'].apply(lemmatizer)
DataSet0.shape

In [None]:
# X contains the characteristic used to decide the category (the preprocessed
# 'description' text); y contains the 'category' labels.
X, y = DataSet0['description'], DataSet0['category']

In [None]:
# Vectorization: encode the descriptions as TF-IDF weights (unigrams and
# bigrams, English stop words removed, terms in fewer than 5 documents ignored).
# NOTE(review): the vectorizer is fitted on the complete dataset, before the
# train/test split performed further down, so the vocabulary and IDF weights
# also see the rows that later become the test set.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    ngram_range=(1, 2),
    stop_words=stopwords.words("english"),
)
X = vectorizer.fit_transform(X)

print(*(part.shape for part in (X, y)))

In [None]:
# Oversample the minority classes with SMOTE (fixed seed).
# NOTE(review): SMOTE runs on the complete dataset before the train/test split
# below, so synthetic samples derived from future test rows can leak into the
# training set and make the reported metrics optimistic -- confirm intended.
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)

print(f"Shape of X before SMOTE: {X.shape}\nShape of X after SMOTE: {X_sm.shape}")

In [None]:
# Training uses 80% of the SMOTE-balanced data and validation the remaining
# 20%; the split is stratified on the balanced labels and seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X_sm,
    y_sm,
    test_size=0.20,
    random_state=8,
    stratify=y_sm,
)

print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# The same three models, now with explicitly chosen hyper-parameters;
# classifier_metrics() reads this module-level list.
classifiers = [
    svm.SVC(C=10, gamma=1, kernel='rbf'),
    ExtraTreesClassifier(
        min_samples_leaf=1,
        min_samples_split=3,
        n_estimators=100,
        n_jobs=-1,
    ),
    RandomForestClassifier(
        min_samples_leaf=1,
        min_samples_split=2,
        n_estimators=800,
        n_jobs=-1,
    ),
]

# Print hold-out and cross-validation metrics for each of them.
classifier_metrics(X_train, X_test, y_train, y_test)

### Confusion Matrix and Classification Report for the best model and the best results

In [None]:
# Globals read by CR_viz() and CM_viz(): the label used in titles and file
# names, a fresh SVM with the tuned hyper-parameters, and the class labels
# present in the test split.
model_name = "SVM"
model_selected = svm.SVC(C=10, gamma=1, kernel='rbf')
classes = np.unique(y_test)

# Both functions draw and save their figure and return None; the list only
# serves to call them in order (classification report, then confusion matrix).
visualization = [
    CR_viz(),
    CM_viz(),
]
