# IH Complaints/Routine Service Text Classification

## Perform imports and load dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_excel('../IH Complaint Machine Learning/IH_combined.xlsx')
df.head()

Unnamed: 0,Service Request Number,SR Type,Platform,Merged
0,1-100039399,Complaint,BGG-RUO,Yellow flags for multiple samples Site reporte...
1,1-100786893,Complaint,BGG-RUO,Results not appearing in BIDS software Site re...
2,1-101810312,Complaint,BGG-RUO,Unable to launch Bids XT software / Life Share...
3,1-101810811,Complaint,BGG-RUO,during FDA genotyping training customer tried ...
4,1-101887056,Complaint,BGG-RUO,FDA genotyping customer did not have Taq polym...


In [3]:
len(df)

5897

## Check for missing values and empty strings

In [4]:
#Check for NaN

df.isnull().sum()

Service Request Number    0
SR Type                   6
Platform                  0
Merged                    0
dtype: int64

In [5]:
df.dropna(inplace=True)
df.isnull().sum()

Service Request Number    0
SR Type                   0
Platform                  0
Merged                    0
dtype: int64

In [6]:
len(df)

5891

In [7]:
#Check for empty strings

blanks = []
for i, s, srtype, plat, merged in df.itertuples():
    if type(merged)==str:
        if merged.isspace():
            blanks.append(i)
len(blanks)

101

In [8]:
df.drop(blanks, inplace=True)
len(df)

5790

In [9]:
5897 - 6 - 101

5790

## Look at the Label (SR Type) column:

In [10]:
df['SR Type'].value_counts()

Routine Service    3602
Complaint          2188
Name: SR Type, dtype: int64

## Split data into train & test sets:

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X = df['Merged']
y = df['SR Type']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=42)

## Build pipeline to vectorize text, then train and fit a model

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [14]:
text_clf = Pipeline([('tfidf',TfidfVectorizer()),
                     ('clf',LinearSVC())])

In [15]:
text_clf.fit(X_train,y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Run predictions and analyze results

In [16]:
predictions = text_clf.predict(X_test)

In [17]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

In [18]:
print(confusion_matrix(y_test,predictions))
cmdf = pd.DataFrame(confusion_matrix(y_test,predictions),index=["RS","C"],columns=["RS","C"])
cmdf

[[ 575  159]
 [ 115 1062]]


Unnamed: 0,RS,C
RS,575,159
C,115,1062


In [19]:
print(classification_report(y_test,predictions))

                 precision    recall  f1-score   support

      Complaint       0.83      0.78      0.81       734
Routine Service       0.87      0.90      0.89      1177

    avg / total       0.86      0.86      0.86      1911



In [20]:
print(accuracy_score(y_test,predictions))

0.8566195709052852


## Test model of made up entries

In [21]:
text_clf.predict(["Unexpected results with cell 1 on Search-Cyte TCS 0.8% lot 024"])

array(['Complaint'], dtype=object)

In [22]:
text_clf.predict(['Fluidics pressure error with Fluid A container.'])

array(['Routine Service'], dtype=object)

## Index and evaluate mismatches

In [23]:
y_test[0:5]

1891          Complaint
1550    Routine Service
1049    Routine Service
2523    Routine Service
156           Complaint
Name: SR Type, dtype: object

In [24]:
predictions[0:5]

array(['Complaint', 'Routine Service', 'Routine Service',
       'Routine Service', 'Complaint'], dtype=object)

In [25]:
y_test_list = []
prediction_list = []
for i in y_test:
    y_test_list.append(i)
for p in predictions:
    prediction_list.append(p)
    
print(len(y_test_list))
print(len(prediction_list))

1911
1911


In [26]:
prediction_list[0:5]

['Complaint',
 'Routine Service',
 'Routine Service',
 'Routine Service',
 'Complaint']

In [28]:
# Create a list "mismatches" that contains y_test indeces where y_test != predictions

mismatches = []
i = 0
for i in range(len(y_test_list)):
    if prediction_list[i] != y_test_list[i]:
        mismatches.append(i)
len(mismatches)

274

In [29]:
print(mismatches[0:50])

[10, 12, 16, 21, 31, 37, 39, 50, 54, 62, 80, 82, 84, 89, 93, 95, 105, 106, 126, 129, 135, 137, 142, 146, 148, 157, 165, 168, 184, 186, 188, 193, 197, 204, 205, 209, 210, 214, 216, 228, 234, 237, 238, 242, 243, 249, 252, 256, 259, 261]


In [30]:
# Find the dataframe row number using the "mistmatches[]" y_test indeces 

y_test[10:13]

2480          Complaint
2210    Routine Service
1867    Routine Service
Name: SR Type, dtype: object

In [31]:
predictions[10:13]

array(['Routine Service', 'Routine Service', 'Complaint'], dtype=object)

In [32]:
print(df.iloc[2480]['Merged'])
print('\n')
print(f"Actual:{df.iloc[2480]['SR Type']}")
print('\n')
print(f"Predicted:{predictions[10]}")

Customer had a gripper crash and cards were jammed. Customer reported a gripper crash and the cards were jammed. The TAS investigating this case worked with the customer on the phone to get the cards removed from the gripper. All cards that were loose were removed from the drawers. All the card drawers were checked to make sure the cards were sitting properly in the racks. The gripper was enabled. The next shift will ran QC and reported that they had some cards in centrifuge and had problems with gripper again. The cards were removed from centrifuge, QC was run and the customer was able to run patient samples. The customer was able to remove the cards from gripper and the centrifuge and restart analyzer. The root cause of the gripper error was that cards were sticking up higher than expected due to improper loading of cards and or closing the card drawer forcefully. A discussion took place with the customer to ensure that they understand proper loading of the cards and closing of the c

## Index off y_test.iteritems()

In [33]:
# Add y_test Excel rows to a list
# Add y_test_scores to a list

y_test_rows = []
y_test_scores = []

for i,s in y_test.iteritems():
    y_test_rows.append(i)
    y_test_scores.append(s)
    
print(y_test_rows[0:5])
print(y_test_scores[0:5])

[1891, 1550, 1049, 2523, 156]
['Complaint', 'Routine Service', 'Routine Service', 'Routine Service', 'Complaint']


In [34]:
# Add y_test vs. prediction mismatches to a list

mismatch_list = []
i = 0
for i in range(len(y_test)):
    if y_test_scores[i] != predictions[i]:
        mismatch_list.append(i)

In [35]:
# Get index of mismatches in y_test

print(mismatch_list[0:100])

[10, 12, 16, 21, 31, 37, 39, 50, 54, 62, 80, 82, 84, 89, 93, 95, 105, 106, 126, 129, 135, 137, 142, 146, 148, 157, 165, 168, 184, 186, 188, 193, 197, 204, 205, 209, 210, 214, 216, 228, 234, 237, 238, 242, 243, 249, 252, 256, 259, 261, 262, 285, 307, 315, 319, 327, 328, 352, 356, 363, 364, 369, 370, 385, 388, 389, 392, 400, 402, 409, 415, 435, 438, 442, 443, 451, 455, 457, 466, 471, 484, 494, 499, 507, 518, 520, 523, 558, 559, 563, 575, 593, 599, 604, 608, 609, 611, 615, 616, 618]


In [37]:
# Connect the mismatched y_test indeces to the y_test Excel row number

print(y_test_rows[10])
print(y_test_rows[12])
print(y_test_rows[16])

2480
1867
4787


In [38]:
# Create a function to evaluate mismatches

#df.iloc[y_test_row[10]] 
#df.iloc[2480]

def mismatch_eval(i):
    print(df.iloc[y_test_rows[i]])
    print('\n')
    print(df.iloc[y_test_rows[i]]['Merged'])
    print('\n')
    print(f"Actual: {df.iloc[y_test_rows[i]]['SR Type']}")
    print(f"Prediction: {predictions[i]}")

In [39]:
mismatch_eval(10)

Service Request Number                                          1-222097692
SR Type                                                           Complaint
Platform                                                             Erytra
Merged                    Customer had a gripper crash and cards were ja...
Name: 2480, dtype: object


Customer had a gripper crash and cards were jammed. Customer reported a gripper crash and the cards were jammed. The TAS investigating this case worked with the customer on the phone to get the cards removed from the gripper. All cards that were loose were removed from the drawers. All the card drawers were checked to make sure the cards were sitting properly in the racks. The gripper was enabled. The next shift will ran QC and reported that they had some cards in centrifuge and had problems with gripper again. The cards were removed from centrifuge, QC was run and the customer was able to run patient samples. The customer was able to remove the cards from gri

## Next Steps

- Evaluate mismatches, re-classify if required, and retrain model
- Evaluate different modelsa