In [1]:
# Import pandas, numpy, and sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

# Widen the displayed column width so long customer comments are not truncated.
# The fully qualified option name is used because pandas matches shorthand keys
# by pattern, which can break if a similarly named option is added later.
pd.set_option('display.max_colwidth', 200)

In [32]:
# Load the customer comment dataset.
csat_reviews_df = pd.read_csv('customer_comments_data.csv')
# Display a few sample rows, selected by integer position.
sample_row_positions = [28, 48, 1239]
csat_reviews_df.iloc[sample_row_positions]

Unnamed: 0,comment,label
28,Very clean.,positive
48,Everything fine,positive
1239,I'm appalled by the lack of basic hygiene practices. It's no wonder infections are rampant in this facility.,negative


In [33]:
# Check for missing values: `info()` prints the row count, each column's dtype,
# and its non-null count, so a non-null count below the row count signals gaps.
csat_reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1242 entries, 0 to 1241
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  1242 non-null   object
 1   label    1242 non-null   object
dtypes: object(2)
memory usage: 19.5+ KB


In [34]:
# Count how many comments carry each value of the "label" column
# ("positive" / "negative") to see how balanced the classes are.
csat_reviews_df['label'].value_counts()

label
positive    684
negative    558
Name: count, dtype: int64

## Split the data into training & testing data sets.

In [9]:
# Features: the free-text "comment" column.
X = csat_reviews_df['comment']
# Target: the "label" column.
y = csat_reviews_df['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

In [10]:
# Baseline pipeline: `TfidfVectorizer()` with no stop-word removal
# (i.e. without `stop_words='english'`), followed by `LinearSVC()`.
text_clf = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Fit the whole pipeline (vectorizer + classifier) on the training split.
text_clf.fit(X_train, y_train)

In [11]:
# Validate the baseline model: score it on the training split and on the
# held-out test split, then report both.
baseline_train_accuracy = text_clf.score(X_train, y_train)
baseline_test_accuracy = text_clf.score(X_test, y_test)
print('Train Accuracy: %.3f' % baseline_train_accuracy)
print('Test Accuracy: %.3f' % baseline_test_accuracy)

Train Accuracy: 0.999
Test Accuracy: 0.928


In [13]:
# Predict labels for the held-out test comments with the baseline pipeline.
# Defining the predictions here keeps the cell runnable on a fresh kernel
# instead of relying on a variable left over from an earlier session.
test_predictions = text_clf.predict(X_test)

# Create the confusion matrix on the test data and predictions
# (rows are the true labels, columns are the predicted labels).
print(metrics.confusion_matrix(y_test, test_predictions))

# Print a classification report
print(metrics.classification_report(y_test, test_predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, test_predictions))

[[157  19]
 [  8 189]]
              precision    recall  f1-score   support

    negative       0.95      0.89      0.92       176
    positive       0.91      0.96      0.93       197

    accuracy                           0.93       373
   macro avg       0.93      0.93      0.93       373
weighted avg       0.93      0.93      0.93       373

0.9276139410187667


Now let's repeat the process above, this time removing stop words: scikit-learn's standard English stop-word list combined with a customized list.

In [39]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Custom stop words to remove in addition to scikit-learn's built-in English list.
custom_stopwords = [
    'a', 'about', 'an', 'and', 'are', 'as', 'at', 'be', 'been', 'but', 'by', 'can',
    'even', 'ever', 'for', 'from', 'get', 'had', 'has', 'have', 'he', 'her', 'hers', 'his',
    'how', 'i', 'if', 'in', 'into', 'is', 'it', 'its', 'just', 'me', 'my', 'of', 'on', 'or',
    'see', 'seen', 'she', 'so', 'than', 'that', 'the', 'their', 'there', 'they', 'this',
    'to', 'was', 'we', 'were', 'what', 'when', 'which', 'who', 'will', 'with', 'you',
]

# Merge the built-in and custom lists; the set union drops duplicates.
combined_stopwords = list(ENGLISH_STOP_WORDS.union(custom_stopwords))

# Build a pipeline using `TfidfVectorizer()` with the combined `stop_words`, and `LinearSVC()`.
text_clf_3 = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(stop_words=combined_stopwords)),
        ('clf', LinearSVC()),
    ]
)

# Fit the whole pipeline on the training split.
text_clf_3.fit(X_train, y_train)

In [41]:
# Validate the stop-word model: score it on the training split and on the
# held-out test split, then report both.
stopword_train_accuracy = text_clf_3.score(X_train, y_train)
stopword_test_accuracy = text_clf_3.score(X_test, y_test)
print('Train Accuracy: %.3f' % stopword_train_accuracy)
print('Test Accuracy: %.3f' % stopword_test_accuracy)

Train Accuracy: 0.999
Test Accuracy: 0.920


In [42]:
# Predict labels for the held-out test comments with the stop-word pipeline.
# Defining the predictions here keeps the cell runnable on a fresh kernel
# instead of relying on a variable left over from an earlier session.
test_predictions_2 = text_clf_3.predict(X_test)

# Create the confusion matrix on the test data and predictions
# (rows are the true labels, columns are the predicted labels).
print(metrics.confusion_matrix(y_test, test_predictions_2))

# Print a classification report
print(metrics.classification_report(y_test, test_predictions_2))

# Print the overall accuracy
print(metrics.accuracy_score(y_test, test_predictions_2))

[[153  23]
 [  7 190]]
              precision    recall  f1-score   support

    negative       0.96      0.87      0.91       176
    positive       0.89      0.96      0.93       197

    accuracy                           0.92       373
   macro avg       0.92      0.92      0.92       373
weighted avg       0.92      0.92      0.92       373

0.9195710455764075
