In [None]:
import pandas as pd
import numpy as np
path = 'dataset/labelled_tweets.csv'
    
tweet = pd.read_csv(path, usecols=['text', 'polarity'])
# Remove rows will nan values
tweet = tweet.dropna()
tweet = tweet.reindex(np.random.permutation(tweet.index))
# preprocessing bit
# replace user handles (@Jumia) to be empty
pattern = "(@[A-Za-z0-9]+)|(http|https|ftp)://[a-zA-Z0-9./]+|#(\w+)"
tweet['text'] = tweet.text.str.replace(pattern, '')
tweet.head()

Unnamed: 0,polarity,text
749,neutral,where in Eldoret can I go pick now ?
192,negative,Are all purchases to be prepaid? I have attem...
1236,neutral,Hey...308694786. Kindly deliver this today wi...
1750,neutral,Does Jumia sell TVs with inbuilt decorder such...
110,negative,And we might consider shifting shop


In [None]:
# examine class distribution
tweet.polarity.value_counts()

neutral     959
negative    748
positive    100
Name: polarity, dtype: int64

In [3]:
# convert label to a numerical count by creating a new column
tweet['polarity_num'] = tweet.polarity.map({'negative':0,'positive':1,'neutral':2})

In [4]:
# Define X matrix as features and y as vectors
X = tweet.text
y = tweet.polarity_num
print(X.shape)
print(y.shape)

(1807,)
(1807,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1355,)
(452,)
(1355,)
(452,)


### Model evaluation

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
import re
import numpy as np

In [7]:
clf_nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
clf_nb = clf_nb.fit(X_train,y_train)

In [8]:
np.mean(y_pred_class == y_test)

NameError: name 'y_pred_class' is not defined

### SVM

In [None]:
clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', SGDClassifier(loss='hinge',penalty='l2',alpha=1e-3, n_iter=5,random_state=42))])
clf_svm = clf_svm.fit(X_train,y_train)
y_pred_class = clf_svm.predict(X_test)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf_reg = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', LogisticRegression())])
clf_reg = clf_reg.fit(X_train,y_train)

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV 
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), }

In [None]:
clf_list = [clf_nb, clf_reg, clf_svm]
clf_dict = {
    clf_nb : {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3),},
    clf_reg : {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), },
    clf_svm : {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3),}
}

scores = {}
# loop every key and value in the dictionary
for clf, params in clf_dict.items():
    gs_clf = GridSearchCV(clf, params, n_jobs=-1, cv=10)
    gs_clf = gs_clf.fit(X_train, y_train)
    scores[clf] = gs_clf.best_score_



# for clf in clf_list:
#     gs_clf = GridSearchCV(clf, parameters, n_jobs=-1, cv=10)
#     gs_clf = gs_clf.fit(X_train, y_train)
#     print(gs_clf.best_score_)

In [None]:
# pickle the scores for faster access
from sklearn.externals import joblib
joblib.dump(scores, 'scores.pkl') 

In [None]:
scores = joblib.load('scores.pkl')
maximum = max(scores, key=scores.get)  # Just use 'min' instead of 'max' for minimum.
print(maximum, scores[maximum])
# scores

In [None]:
gs_clf.best_score_

In [None]:
gs_clf.best_params_

In [None]:
# calculate accuracy of class predictions
from sklearn import metrics
from sklearn.metrics import classification_report
metrics.accuracy_score(y_test, y_pred_class)
print(classification_report(y_test, y_pred_class))

In [None]:
vect = CountVectorizer()
X_dtm = vect.fit_transform(X)

In [None]:
from sklearn.model_selection import cross_val_score
logreg = MultinomialNB()
print(cross_val_score(logreg, X_dtm, y, cv=10, scoring='accuracy').mean())

# Confusion Matrix

In [None]:
"""
This snippet of code was extracted from: 
http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
"""

print(__doc__)

import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

class_names = ['negative','positive','neutral']

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred_class)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

In [None]:
clf_svm.predict(['OK'])

# Topic Modelling

In [36]:

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_index, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_index))
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words -1:-1]]))

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(X)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(X)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 20

# Run NMF
nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)
display_topics(lda, tf_feature_names, no_top_words)



Topic 0:
order placed confirm cancelled happened need hello want received week
Topic 1:
thank yes response alright sorted received assistance recieved jamo customer
Topic 2:
com twitter pic https jumiakenya email time money ready ago
Topic 3:
thanks alright got sorted great assisted lot yeah received wanted
Topic 4:
dm follow send mind like following ur respond great did
Topic 5:
cancel like order want replaced need orders just does yes
Topic 6:
delivery pay option hello expect monday today cash saturday called
Topic 7:
okay week boss look past cheers tomorrow expensive days price
Topic 8:
waiting reply feedback ll weeks tomorrow need ago today sawa
Topic 9:
kindly confirm advise update assist send advice help look item
Topic 10:
delivered today ordered package said tomorrow paid item want ada
Topic 11:
number order got 304684986 paybill thats confirm 301131586 ordered provided
Topic 12:
long wait days taking deliver 308891886 refund product package guys
Topic 13:
ok poor fine ready pi

In [35]:
nmf.components_.shape

(20, 1000)

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    for topic_idx, topic in enumerate(H):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(documents[doc_index])

# NMF is able to use tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(X)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(X)
tf_feature_names = tf_vectorizer.get_feature_names()

no_topics = 2

# Run NMF
nmf_model = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)
nmf_H = nmf_model.components_

# Run LDA
lda_model = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

no_top_words = 4
no_top_documents = 4
display_topics(nmf_H, nmf_W, tfidf_feature_names, X, no_top_words, no_top_documents)
display_topics(lda_H, lda_W, tf_feature_names, X, no_top_words, no_top_documents)



Topic 0:
order cancel number kindly
 today is my birthday..
Can i get one?
 hi.Kindly update me on rectification of a return of a product made recently after consulting customer care.Order no 307245886
why has it taken this long when my previous orders took only a few days??
Topic 1:
thank okay ok yes
Okay thanks I will
 its now 11 days since I placed an order and paid for a phone,order no.303673286..up to now I have not received it?..wassup?
Thank you
 why did you supply me one for UAE knowing very well am in Kenya. That is conmanship
Topic 0:
order delivery number thanks
 hi. My order was cancelled and I still received a message saying that it was ready for collection. Kindly advise.
I want to buy a phone cover but it's from Jumia Global. Delivery is at Adams Arcade pick up station
 ORDER N.° 30429438 i have picked my package the phone is okay but the flash disc isnt functioning
 i wanna buy a phone from you. where are ur offices?
Topic 1:
order waiting phone delivered
   _DUKAS http