#  Natural Language Processing Case Study

## Sentiment Classification for Customer Reviews

## 1. Import required packages

In [1]:
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.feature_extraction import text

# import sklearn classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

## 2. Load dataset

In [8]:
# Load the semicolon-separated review data set from the local data/ directory.
data = pd.read_csv('data/eh_reviews_sentiment.csv',sep=';')

In [9]:
# Work on a random 50% sample of the reviews. The fixed seed makes the sample
# (and therefore every number computed further down) reproducible after a
# kernel restart.
# NOTE: this cell reassigns `data`, so running it twice halves the data again;
# re-run the load cell first if you need to repeat it.
data = data.sample(frac=.5, random_state=42)

In [10]:
# Preview the first rows of the sampled data.
data.head()

Unnamed: 0,review,sentiment
3888,soda stream crystal 2.0 titan promopack soda s...,positive
6338,Preis/Leistungsverhältnis sehr gut Preis/Leist...,positive
8361,Alles super gelaufen! Danke. Alles super gelau...,positive
2487,Sehr gutes Gerät Sehr gutes Gerät Sehr gutes G...,positive
6844,Funktionales Handy im unteren Preissegment Fun...,positive


In [11]:
# (number of rows, number of columns) of the sampled data.
data.shape

(5000, 2)

In [12]:
# Show a single sample, selected by row position, together with its label.
i = 4
sample = data.iloc[i]
print('Review =\n{}\n'.format(sample.iloc[0]))   # first column: review text
print("Sentiment = {}".format(sample.iloc[1]))   # second column: sentiment label

Review =
Funktionales Handy im unteren Preissegment Funktionales Handy im unteren Preissegment Funktionales Handy im unteren Preissegment Funktionales Handy im unteren Preissegment

Sentiment = positive


##  3. Analyze Dataset

### Apply tokenizing

_______________

### <span style="color:blue">**TODO: Experiment with both Vectorizer choices and different parameters!**</span>

In [13]:
# Stop-word set used to filter tokens.
# NOTE(review): this is scikit-learn's built-in *English* list, while the sample
# reviews shown above are German, so German function words are not filtered.
# Consider supplying a German stop-word list instead.
stopwords = text.ENGLISH_STOP_WORDS

# Initialize the vectorizer and build the document-term matrix in one pass.
# - `stop_words` is passed as a list: recent scikit-learn versions validate this
#   parameter and reject a frozenset.
# - `token_pattern` keeps runs of word characters that contain no digits.
# - `max_features` keeps only the 3000 most frequent tokens.
# NOTE(review): the vocabulary is learned from *all* reviews, i.e. before the
# train/test split in section 4, so the test set influences feature selection.
vect = CountVectorizer(max_features=3000, stop_words=list(stopwords), token_pattern=r'\b[^\d\W]+\b')
#vect = TfidfVectorizer(max_features=3000, stop_words=list(stopwords), token_pattern=r'\b[^\d\W]+\b')

# fit the vectorizer and apply it to the data set
X = vect.fit_transform(data['review'])

In [14]:
# Display the vectorized reviews as a dense array (the notebook truncates the output).
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

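Number of discarded tokens: `vect.stop_words_` contains every token the fitted vectorizer dropped, not just the explicit stop-word list. Because `max_features` keeps only the most frequent tokens, all remaining tokens are counted here, so the number is large.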

In [15]:
# Number of tokens the fitted vectorizer discarded.
print(len(vect.stop_words_))

15929


In [16]:
# Binary target: 1 for reviews labelled 'positive', 0 for every other label.
y = data['sentiment'].eq('positive').astype('int64')

Basic information on tokens and size of dictionary

In [17]:
# Vocabulary tokens in column order of X.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back only on old installations.
# `.tolist()` keeps `feature_names` a plain list of str, as before.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

print("Number of features: {}\n".format(len(feature_names)))
print("First 100 features:\n{}\n".format(feature_names[:100]))
print("Features 110 to 130:\n{}\n".format(feature_names[110:130]))
print("Every 100th feature:\n{}\n".format(feature_names[::100]))

Number of features: 3000

First 100 features:
['_', '_top', 'ab', 'abbildung', 'abdeckung', 'abend', 'aber', 'abgeben', 'abgebildet', 'abgebrochen', 'abgelaufen', 'abgesehen', 'abgespielt', 'ablauf', 'ablesbar', 'ablesen', 'abnehmbar', 'abraten', 'abschalten', 'absolut', 'absolute', 'absoluter', 'absolutes', 'abspielen', 'abstand', 'abwicklung', 'abziehen', 'abzug', 'accu', 'acer', 'ach', 'achtung', 'activity', 'ad', 'adapter', 'ade', 'aeg', 'ahnung', 'aid', 'akku', 'akkulaufzeit', 'akkuleistung', 'akkus', 'akkusauger', 'aktion', 'aktiv', 'aktivieren', 'aktivitäten', 'aktuelle', 'aktuellen', 'aktueller', 'akzeptabel', 'alle', 'allein', 'alleine', 'allem', 'allen', 'aller', 'allerdings', 'alles', 'allgemein', 'allrounder', 'alltag', 'als', 'alt', 'alte', 'alten', 'alter', 'alternative', 'alternativen', 'altes', 'alu', 'aluminium', 'amazon', 'analog', 'anbieter', 'anbietern', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anders', 'android', 'anfang', 'anfangs', 'anforderungen', 

## 4. Split into train and test set

In [18]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 5. Choose classifier method and fit on data
_______________
### <span style="color:blue">**TODO: Experiment with different classifier choices and different parameters!**</span>

In [19]:
# Pick the classifier to train: exactly one `clf = ...` line should be active.
# The commented lines are alternatives to try for the TODO above.
#clf = DecisionTreeClassifier(max_depth=15, min_samples_leaf=10)
clf = LogisticRegression(max_iter=5000)
#clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
#clf = GaussianNB()
#clf = MLPClassifier(alpha=.01, max_iter=500)

In [20]:
# Train the chosen classifier on the training split.
# The active line passes a dense array; the commented line is the variant
# that passes the matrix as returned by the vectorizer.
#clf = clf.fit(X_train, y_train)
clf = clf.fit(X_train.toarray(), y_train)

## 6. Use trained model to predict labels for train and test set

In [21]:
# Predict labels for both splits so train and test accuracy can be compared.
y_pred = clf.predict(X_test.toarray())
y_train_pred = clf.predict(X_train.toarray())

In [22]:
# Fraction of correctly classified samples on each split.
test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)

In [23]:
# Report both accuracies as percentages; a large gap between them points to overfitting.
print("accuracy on training set = {:1.1f}%".format(100*train_accuracy))
print("accuracy on test set\t = {:1.1f}%".format(100*test_accuracy))

accuracy on training set = 99.4%
accuracy on test set	 = 94.1%


## Aside: Look at tokens associated with positive or negative sentiments

In [24]:
# Rank the vocabulary tokens by the weight the classifier learned for them:
# the most negative weights pull towards class 0 (negative), the most positive
# ones towards class 1 (positive).
try:
    coefficients = clf.coef_[0]
except AttributeError:
    # Some of the classifier choices offered above do not expose `coef_`.
    # Only this case is handled; any other error should surface normally.
    print('Classifier does not have a coef_ attribute')
else:
    # Token names in column order; `get_feature_names()` was removed in
    # scikit-learn 1.2, so prefer `get_feature_names_out()` when available.
    if hasattr(vect, 'get_feature_names_out'):
        vocabulary = np.asarray(vect.get_feature_names_out())
    else:
        vocabulary = np.array(vect.get_feature_names())

    # Ascending sort: most negative weight first, most positive weight last.
    # Kept under its own name so `feature_names` (column order) is not clobbered.
    indices = np.argsort(coefficients)
    ranked_tokens = vocabulary[indices]

    neg_unigrams = ranked_tokens[:100]
    print('Words associated with negative reviews:')
    print(neg_unigrams)

    # Take the last 100 entries *including* the final one: the original slice
    # `[-100:-1]` silently dropped the single most positive token.
    # The list runs from weaker to stronger; the strongest token is last.
    pos_unigrams = ranked_tokens[-100:]
    print('\n')
    print('Words associated with positive reviews:')
    print(pos_unigrams)

Words associated with negative reviews:
['zurück' 'schlechte' 'enttäuscht' 'unzufrieden' 'enttäuschend' 'undicht'
 'schafft' 'nicht' 'unittest' 'leider' 'schlecht' 'test' 'unbrauchbar'
 'fehler' 'schlechter' 'defekt' 'null' 'enttäuschung' 'horrible' 'lieber'
 'streamer' 'läuft' 'billige' 'software' 'zurückschicken' 'irreführende'
 'egal' 'bereits' 'weg' 'klang' 'nie' 'absoluter' 'verliert' 'plastikmüll'
 'überhaupt' 'müll' 'probleme' 'schade' 'ab' 'phone' 'laptop' 'wenig'
 'total' 'finger' 'wochen' 'tv' 'werde' 'obwohl' 'hardware' 'dennoch'
 'wenn' 'mobile' 'ca' 'voll' 'beschädigt' 'kaputt' 'falsche' 'mangelhaft'
 'aussehen' 'poliermaschine' 'schere' 'daneben' 'schrott' 'katastrophe'
 'nix' 'zeitschaltuhr' 'topfset' 'totaler' 'update' 'selbst' 'stunden'
 'ungenügend' 'keine' 'dringend' 'big' 'fragen' 'ärgerlich' 'leer'
 'geöffnet' 'munddusche' 'beim' 'touchpad' 'laut' 'schwer' 'severin'
 'anwendung' 'low' 'fuer' 'folie' 'support' 'fast' 'gehäuse' 'abgelaufen'
 'tassimo' 'trotz' 'hängen

## Apply sentiment classification to new sentence

In [25]:
def sentiment(x):
    """Return 'positive' when the predicted class equals 1, otherwise 'negative'."""
    if x == 1:
        return 'positive'
    return 'negative'

### <span style="color:blue">**TODO: Write your own review and check the detected sentiment!**</span>

In [33]:
# The review to classify, wrapped in a list because the vectorizer below is given the whole list.
my_review = ['der wassersprudler ist schlecht']

In [34]:
# vectorize new review(s) with the vocabulary fitted earlier
x_rev = vect.transform(my_review)
# Classify new review(s). The result gets its own name so the test-set
# predictions stored in `y_pred` are not overwritten; otherwise re-running the
# accuracy cell after this one would compare `y_test` with the wrong array.
review_pred = clf.predict(x_rev.toarray())
# `predict` returns one label per review, so map each label individually
# instead of passing the whole array to `sentiment`.
for label in review_pred:
    print('Sentiment of review: {}'.format(sentiment(label)))

Sentiment of review: positive


In [48]:
import pickle

In [49]:
# Persist the trained classifier. The context manager closes (and flushes) the
# file even if pickling raises; the original left the handle open.
with open('nlp_model.pkl', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [50]:
# Persist the fitted vectorizer so new text can be transformed with the same
# vocabulary. The context manager closes (and flushes) the file even if
# pickling raises; the original left the handle open.
with open('nlp_vect.pkl', 'wb') as vect_file:
    pickle.dump(vect, vect_file)