In [1]:
print_imports()

Loaded modules:
Numpy                np              1.14.0
Pandas               pd              0.22.0
Keras                ks              2.0.6

Matplotlib           mpl             2.1.2
matplotlib.pyplot    plt             N/A
matplotlib.image     mpimg           N/A
Seaborn              sns             0.8.1
PIL                  PIL             5.0.0

ExergyUtilities      exergy          2.0.

pyspark              pyspark         2.2.1


In [2]:
from utils import css_from_file
css_from_file('style/style.css')

In [3]:
import re
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import (
    CountVectorizer, TfidfVectorizer, HashingVectorizer
)

import warnings
warnings.filterwarnings("ignore")

Read csv file into a dataframe

In [4]:
df = pd.read_csv("data/ebaytitles.csv")
df.head()

Unnamed: 0,title,category_name
0,Max Protection Solid Red Yugioh Sized Card Sle...,Collectibles
1,Hippies Use Side Door metal sign (ar),Collectibles
2,REVERSE Embellished Hi Lo Bandeau Dress Black ...,"Clothes, Shoes & Accessories"
3,VAL72299 Game Color Set - Introduction Set ...,Toys & Games
4,bang tidy decal sticker large 250mmx145mm dub ...,Vehicle Parts & Accessories


In [5]:
#%%script false
df = df.sample(frac=0.1) # delete this line if you are brave and have many GBs of RAM

Print out unique values of a column

In [6]:
df.category_name.unique()

array(['Jewellery & Watches', 'Home, Furniture & DIY',
       'Vehicle Parts & Accessories', 'Sporting Goods', 'Toys & Games',
       'Music', 'Crafts', 'Clothes, Shoes & Accessories', 'Collectibles',
       'Sports Memorabilia', 'Garden & Patio',
       'Computers/Tablets & Networking', 'Health & Beauty',
       'Pet Supplies', 'Business, Office & Industrial',
       'Mobile Phones & Communication', 'Dolls & Bears', 'Sound & Vision',
       'Musical Instruments & Gear', 'DVDs, Films & TV',
       'Consumer Electronics', 'Antiques', 'Art',
       'Video Games & Consoles', 'Baby', 'Everything Else',
       'Books, Comics & Magazines', 'Stamps', 'Cameras & Photography',
       'Coins & Paper Money', 'Pottery, Porcelain & Glass',
       'Cell Phones & Accessories', 'Wholesale & Job Lots',
       'Holidays & Travel', 'Entertainment Memorabilia'], dtype=object)

Split the data into train and test observations - there is a column

In [7]:
df.category_name[0:10]

663046            Jewellery & Watches
115104          Home, Furniture & DIY
354724    Vehicle Parts & Accessories
148463          Home, Furniture & DIY
106838          Home, Furniture & DIY
418108            Jewellery & Watches
587234          Home, Furniture & DIY
222700                 Sporting Goods
607791    Vehicle Parts & Accessories
286580                   Toys & Games
Name: category_name, dtype: object

In [8]:
X = df.title.values
y = df.category_name.values

X_tr, X_te, y_tr, y_te = train_test_split(X, 
                                          y,
                                          test_size=0.1,
                                          random_state=0)

Exercise 
------------------

1. Count how many titles are in each category (```pandas.DataFrame.groupby```). Print out most common at the top

In [9]:
df.groupby('category_name').count().sort_values('title',ascending=False)
#df.groupby('category_name').count().sort_values(ascending=False)

Unnamed: 0_level_0,title
category_name,Unnamed: 1_level_1
Vehicle Parts & Accessories,23177
"Clothes, Shoes & Accessories",16747
"Home, Furniture & DIY",12664
Computers/Tablets & Networking,6764
Jewellery & Watches,6286
Sporting Goods,4886
Mobile Phones & Communication,3963
Crafts,3406
Health & Beauty,3312
Toys & Games,2979


<a>Double click to show the solution</a>
<div class='spoiler'>

frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
print(frequencies)

# or faster

df.category_name.value_counts()

</div>

In [10]:
frequencies = df.groupby("category_name")["title"].count()
frequencies.sort_values(inplace=True,ascending=False)
#print(frequencies)

# or faster

df.category_name.value_counts()

Vehicle Parts & Accessories       23177
Clothes, Shoes & Accessories      16747
Home, Furniture & DIY             12664
Computers/Tablets & Networking     6764
Jewellery & Watches                6286
Sporting Goods                     4886
Mobile Phones & Communication      3963
Crafts                             3406
Health & Beauty                    3312
Toys & Games                       2979
Business, Office & Industrial      2841
Collectibles                       2437
Sound & Vision                     1862
Music                              1349
Garden & Patio                     1043
Cameras & Photography               835
Baby                                673
DVDs, Films & TV                    599
Pet Supplies                        592
Art                                 584
Video Games & Consoles              494
Books, Comics & Magazines           483
Musical Instruments & Gear          439
Sports Memorabilia                  289
Dolls & Bears                       280


Bag of words
--------------------

Different types of vectorizers:

<ul>
<li>```sklearn.feature_extraction.text.CountVectorizer``` - Counts the number of times a word appears in the text</li>
<li>```sklearn.feature_extraction.text.TfidfVectorizer``` - Weighs the words according to the importance of the word in the context of whole collection. Is the word ```the``` important if it appears in all documents?</li>
<li>```sklearn.feature_extraction.text.HashingVectorizer``` - Useful when you don't know the vocabulary upfront. Feature number is calculated as ```hash(token) % vocabulary_size```.</li>
</ul>

Exercise
-------------------
1. Use ```CountVectorizer``` / ```TfidfVectorizer``` to fit the collection of documents
2. How many unique tokens are there in text? Print some examples (ie first few hundred).
3. What methods you can use to reduce this number? 
   - Check out and experiment with the arguments: ```ngram_range```, ```min_df```. How the vocabulary size changes with each change?
   - What would you replace / delete from the text?
4. Write a custom function `clean_text` that accepts a text as input and transforms it (remove/hash numbers, delete short/long words etc.)
5. (Extra points) When would you use ```HashingVectorizer```?

Here's the data

In [11]:
cats = df.category_name.values
print(X_tr[0],"-IS A-",y_tr[0])
print(X_tr[-1],"-IS A-",y_tr[-1])

Unisex Men's Women's UCC 50/50 Blend Adult Set-In Sweatshirt - UCC001 -IS A- Clothes, Shoes & Accessories
KIA RIO 01-05 14" LUXURY WHEEL TRIM HUB CAP SET SPARK BRAND NEW -IS A- Vehicle Parts & Accessories


Now fit it to a Vectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_tr)
X_train_counts.shape

(90000, 72233)

In [13]:
# Feature index: 
count_vect.vocabulary_.get(u'shampoo')

59143

Using FREQUENCIES

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
X_train_vects = tfidf_vect.fit_transform(X_tr)
X_train_vects.shape

(90000, 72233)

This returns a mapping to the column indices

In [15]:
tfidf_vect.vocabulary_.get(u'shampoo')

59143

TRANSFORMER

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(90000, 72233)

Faster, do both at once! 

In [17]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_train_tfidf.shape

(90000, 72233)

In [18]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_tr)

In [19]:
docs_new = ['pills shampoo','sport ball']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print(doc,"CLASSIFIED AS", category)

pills shampoo CLASSIFIED AS Health & Beauty
sport ball CLASSIFIED AS Vehicle Parts & Accessories


<a>Double click to show the solution</a>
<div class='spoiler'>
from sklearn.feature_extraction.text import CountVectorizer
import re

def clean_text(t):
    t = t.lower()
    t = re.sub("[^A-Za-z0-9]"," ",t)
    t = re.sub("[0-9]+","#",t)
    return t

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.get_feature_names())[:10])
    print(len(vect.get_feature_names()))
</div>

In [20]:
import snowballstemmer

stemmer = snowballstemmer.stemmer('english')
print(stemmer.stemWords("bigger biggest running We are the world connections".split()))

['bigger', 'biggest', 'run', 'We', 'are', 'the', 'world', 'connect']


In [21]:
from sklearn.feature_extraction.text import CountVectorizer
import re

import snowballstemmer

stemmer = snowballstemmer.stemmer('english')

def clean_text(t):
    """Accepts a Document 
    """
    t = t.lower()
    # Remove single characters
    t = re.sub("[^A-Za-z0-9]"," ",t)
    # Replace all numbers by a single char
    t = re.sub("[0-9]+","#",t)
    
    return t

def clean_text_stemmed(t):
    """Accepts a Document 
    """
    t = t.lower()
    # Remove single characters
    t = re.sub("[^A-Za-z0-9]"," ",t)
    # Replace all numbers by a single char
    t = re.sub("[0-9]+","#",t)
    tfinal = " ".join(stemmer.stemWords(t.split()))
    
    return tfinal

vectorizers = [
     ("vanilla",
          CountVectorizer())
    ,("preprocessing",
          CountVectorizer(preprocessor=clean_text))
    ,("preprocessing + min_df=10",
          CountVectorizer(preprocessor=clean_text,
                          min_df=10))
]

for vect_name, vect in vectorizers:
    print(vect_name)
    vect.fit(X_tr)
    
    print(list(vect.get_feature_names())[:10])
    print(len(vect.get_feature_names()))


vanilla
['00', '000', '0000', '00000', '000002', '000018', '000021', '00003f', '00003g', '000051446b']
72233
preprocessing
['aa', 'aaa', 'aaaa', 'aaaaa', 'aaad', 'aabd', 'aac', 'aaci', 'aadipod', 'aai']
39547
preprocessing + min_df=10
['aa', 'aaa', 'ab', 'abarth', 'abc', 'about', 'abr', 'abs', 'absolute', 'absorber']
7532


Type | Clean text | Stemmed
---------| ---------| ---------
vanilla | 71866 | 71866
preprocessing  | 39309 | 33307
preprocessing + min_df=10 | 7499 | 6630

Stemming
------------------

Linguistic normalization in which variant forms are reduced to a common form

    connection
    connections
    connective     --->   connect
    connected
    connecting
    
Usage:

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    print(stemmer.stemWords("We are the world".split()))

Putting it into a pipeline
----------------------

Now that we know how to transform text data, let's put it into a pipeline.

1. Create a pipeline with `CountVectorizer`, `StandardScaler` and `SGDClassifier` as your final algorithm
    a) use alternative format for pipeline definition when you name the steps - refer to the documentation how to do this
2. Using ```sklearn.metrics.classification_report``` create a report about your classifier

In [22]:
##########################
# put your solution here #
##########################

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV


text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),])


In [23]:
preprocessors = [
    None,
    clean_text,
    clean_text_stemmed,
]
for i,prep in enumerate(preprocessors):
    print(i,prep)
#     this_text_clf = Pipeline([('vect', CountVectorizer(preprocessor=prep)),
#                          ('tfidf', TfidfTransformer()),
#                          ('clf', MultinomialNB()),])
    this_text_clf = Pipeline([('vect', CountVectorizer(preprocessor=prep)),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier()),])    
    print(this_text_clf)                         
    this_text_clf.fit(X_tr, y_tr)  
    predicted = this_text_clf.predict(X_te)
    acc=np.mean(predicted == y_te)
    #print(classification_report(X_tr)
    print(classification_report(y_te, predicted))
    print(acc)

0 None
Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))])
                                precision    recall  f1-score   support

                      Antiques       0.25      0.05      0.08        22
                           Art       0.63      0.59      0.61        58
                          Baby       0.82      0.43      0.56        77
     Books, Comics & Magazines       0.95      0.62      0.75        60
 Business, Office & Industrial       0.85      0.50      0.63       292
         Cameras & Photography       0.86      0.82      0.84        74
     Cell Phones & Accessories       0.00      0.00     

In [24]:
text_clf.fit(X_tr, y_tr)  

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [25]:
predicted = text_clf.predict(X_te)
np.mean(predicted == y_te)  

0.7359

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV

clf = Pipeline([('vect', CountVectorizer(min_df=10, preprocessor=clean_text)),
                ('scaling', StandardScaler(with_mean=False)),
                ('clf', SGDClassifier())])

preds = cross_val_predict(clf, 
                          X_tr, 
                          y_tr, 
                          cv=8, n_jobs=-1, verbose=True)

print(classification_report(y_tr, preds)


</div>


Grid search
--------------------------

Scikit-learn has `GridSearchCV` and `RandomizedSearchCV`. Both have the same functionality and can be used to find good parameters for the models. What is great about both these classes that they are both transformers - they return an estimator so you can chain them and put in your pipeline.

**GridSearchCV** - you specify the exact values of the parameters you want to test
**RandomizedSearchCV** - you specify ranges of parameters

Exercise
----------------------

1. Use `GridSearchCV` or `RandomizedSearchCV` to find the best parameters for the models. Check at least 2 parameters.

2. Inspect the attribute `cv_results_` after fitting. It gives a nice representation of the learning.

In [65]:
np.logspace(-5,10,10)

array([1.00000000e-05, 4.64158883e-04, 2.15443469e-02, 1.00000000e+00,
       4.64158883e+01, 2.15443469e+03, 1.00000000e+05, 4.64158883e+06,
       2.15443469e+08, 1.00000000e+10])

In [66]:
import sklearn

SGDClassifier

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier())])

params = {          'vect__min_df': [1,2,5,10,15],
         'clf__alpha':np.logspace(-5,10,9)}
    #'clf__alpha':np.logspace(-4,10,8)}
grid_clf = GridSearchCV(text_clf, params, n_jobs=-2, verbose=True)
grid_clf.fit(X_tr, y_tr)

#sklearn.grid_search.GridSearchCV

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=-2)]: Done  36 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-2)]: Done 135 out of 135 | elapsed:  4.0min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-2,
       param_grid={'vect__min_df': [1, 2, 5, 10, 15], 'clf__alpha': array([1.00000e-05, 7.49894e-04, 5.62341e-02, 4.21697e+00, 3.16228e+02,
       2.37137e+04, 1.77828e+06, 1.33352e+08, 1.00000e+10])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=True)

In [49]:
print(grid_clf.best_score_,grid_clf.best_params_)
grid_clf

0.8623333333333333 {'clf__alpha': 0.0001, 'vect__min_df': 2}


In [67]:

tuned_parameters = {          'vect__min_df': [1,2,5,10],
         'clf__alpha':np.logspace(-4,1,6)}

tuned_parameters = {          'vect__min_df': [1],
         'clf__alpha':np.logspace(-4,1,6)}

#clf = grid_clf

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    #clf = GridSearchCV(text_clf, tuned_parameters, cv=5, scoring=score)
    #clf = GridSearchCV(text_clf, tuned_parameters, cv=5, scoring=score)
    clf = GridSearchCV(text_clf, tuned_parameters, n_jobs=-2, scoring=score, verbose=True)
    
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_te, y_pred = y_te, clf.predict(X_te)
    print(classification_report(y_te, y_pred))
    print()


# Tuning hyper-parameters for precision

Fitting 3 folds for each of 6 candidates, totalling 18 fits


KeyboardInterrupt: 

<a>Double click to show the solution</a>
<div class='spoiler'>

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print("Grid search")
print()

params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'vect__binary': [True, False]}

grid_clf = GridSearchCV(clf, params, n_jobs=1, verbose=True)
grid_clf.fit(X_tr, y_tr)

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])


for params, score, _ in best_params:
    print(score, params) 
    
print("Randomized search")
print()
    
params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
          'vect__analyzer': ["word","char"],
          'model__lr__dimensions': [100, 200]}

grid_clf = RandomizedSearchCV(clf, params, n_jobs=1, verbose=True, n_iter=8)
grid_clf.fit(np.array(X_tr[:10000]), y_tr[:10000])

best_params = sorted(grid_clf.grid_scores_, key=lambda x: -x[1])

for params, score, _ in best_params:
    print(score, params)

</div>


Useful materials

1. http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
2. http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html