Sometimes your input data is nested, with a more complex structure than a simple table or a matrix.

In such cases it is sometimes useful to shift your mental orientation and to analyze and extract information from rows rather than from columns that are not defined up front.

In [None]:
from utils import css_from_file
# Load the stylesheet at style/style.css via the local utils helper
# (presumably this styles the notebook output - verify in utils.css_from_file).
css_from_file('style/style.css')

In [None]:
# Install NLTK, which provides the tokenizer imported in the next cell.
!pip install nltk

In [None]:
import json
import numpy as np
import pprint
from nltk import download, word_tokenize

# word_tokenize needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the 'punkt_tab' package while older ones use 'punkt', so
# request both. With its default arguments download() reports a failed or
# unknown package instead of raising, so the extra request is harmless.
for resource in ('punkt', 'punkt_tab'):
    download(resource)

In [None]:
# The file holds one JSON document per line (JSON Lines).
# Decode it as UTF-8 explicitly, because the platform default encoding may
# differ, and skip blank lines, which json.loads would reject.
with open("data/companies/companies.json", encoding="utf-8") as dataf:
    data = [json.loads(line) for line in dataf if line.strip()]

An example of deeply nested data with various data types:

Exercise:

1. Name variable types
2. What do you do with lists, geo location?
3. What do you do with counts?

In [None]:
# Pretty-print a single record to inspect its nested structure.
pprint.pprint(data[7])

With such data you can be sure that you'll need a sparse matrix.

Remember `DictVectorizer` class? It accepts a dictionary and returns a sparse matrix.

So the only thing we need is a function such that 

```f(Json) => Dict```

First we need a function `deep_select` to retrieve nested values

In [None]:
### write your deep_select class here

# Minimal three-level nested document used by the DeepSelect checks below.
jsondata = {'a': {'b': {'c': 1}}}
class DeepSelect():
    """Callable that retrieves a value from a nested structure.

    Parameters
    ----------
    path : sequence
        Keys (or list indices) to follow, outermost first.
    default : object
        Value returned when the path cannot be followed to its end.
    """

    def __init__(self, path, default):
        self.path = path
        self.default = default

    def _helper(self, row, path):
        """Walk `path` step by step starting at `row`.

        Returns the value found at the end of the path, or `self.default`
        as soon as one step fails.
        """
        current = row
        for key in path:
            try:
                current = current[key]
            except (KeyError, IndexError, TypeError):
                # Missing key, out-of-range index, or an intermediate value
                # (scalar, None, wrongly keyed list) that cannot be
                # descended into: the path does not exist in this row.
                return self.default
        return current

    def __call__(self, row):
        """Return the value stored under `self.path` in `row`."""
        return self._helper(row, self.path)

# tests: full path, partial path (returns the sub-dictionary), missing key
assert DeepSelect(["a", "b", "c"], None)(jsondata) == 1
assert DeepSelect(["a", "b"], None)(jsondata) == {'c': 1}
# Identity comparison is the idiomatic way to test for None.
assert DeepSelect(["x"], None)(jsondata) is None

Click here to see the DeepSelect solution
<div class="spoiler">

class DeepSelect():
    """Callable that retrieves a value from a nested structure.

    Parameters
    ----------
    path : sequence
        Keys (or list indices) to follow, outermost first.
    default : object
        Value returned when the path cannot be followed to its end.
    """

    def __init__(self, path, default):
        self.path = path
        self.default = default

    def _helper(self, row, path):
        """Walk `path` step by step starting at `row`.

        Returns the value found at the end of the path, or `self.default`
        as soon as one step fails.
        """
        current = row
        for key in path:
            try:
                current = current[key]
            except (KeyError, IndexError, TypeError):
                # Missing key, out-of-range index, or an intermediate value
                # (scalar, None, wrongly keyed list) that cannot be
                # descended into: the path does not exist in this row.
                return self.default
        return current

    def __call__(self, row):
        """Return the value stored under `self.path` in `row`."""
        return self._helper(row, self.path)

</div>

In [None]:
def create_skills_features(row):
    """Map every skill listed under ``extension -> skills`` to its count.

    Returns a dict ``{skill name: count}``. It is empty when the record has
    no ``extension``/``skills`` entry, because the selector then falls back
    to an empty list.
    """
    select_skills = DeepSelect(['extension', 'skills'], [])
    return {entry['skill']: entry["count"] for entry in select_skills(row)}

# Try the skills extractor on a single record.
create_skills_features(data[4])

So far so good. What about text?

In [None]:
def create_description_features(row):
    """Build binary bag-of-words features from ``row['description']``.

    Each token is lower-cased and stored under the key
    ``"description=<token>"`` with the value 1.
    """
    tokens = word_tokenize(row['description'])
    return {"description=" + token.lower(): 1 for token in tokens}

# Try the description extractor on a single record.
create_description_features(data[0])

Let's create a more generic way to transform text

In [None]:
class TransformText():
    """Turn a (possibly nested) text field into binary token features.

    Parameters
    ----------
    field : list
        Key path to the text value, as accepted by ``DeepSelect``.
    tokenizer : callable, optional
        Function splitting the field value into tokens. Defaults to
        ``word_tokenize``.
    """

    def __init__(self, field, tokenizer=word_tokenize):
        self.field = field
        self.tokenizer = tokenizer

    def __call__(self, row):
        """Return ``{lower-cased token: 1}`` for the tokens of the field."""
        text = DeepSelect(self.field, "")(row)
        if text is None:
            # A key that is present with an explicit null value comes back
            # as None rather than the default; treat it like a missing
            # field so the tokenizer and .lower() never see None.
            text = ""
        return {token.lower(): 1 for token in self.tokenizer(text)}
    
# The custom tokenizer wraps the whole value in a one-element list, so the
# address is kept as a single token instead of being split into words.
text_transformer = TransformText(['extension','address'], tokenizer=lambda x: [x])
text_transformer(data[6])

Exercise:
-------------
    
1. Write functions or classes that transform other features. You'll need a function to retrieve nested values.
2. There are some fields which you can treat either as a categorical feature or as a text feature. Which is best, and why?
3. Write a function / class that accepts a list of transforming functions and creates a concatenation of the features
4. Wrap previous function in a scikit-learn transformer class so we can use it in a pipeline

In [None]:
def combine_features(fs):
    """Merge several feature extractors into a single one.

    ``fs`` is an iterable of ``(name, extractor)`` pairs. The returned
    function takes a row, runs every extractor on it and returns one dict
    in which each key is prefixed with ``"<name>_"``.
    """

    def helper(row):
        merged = {}
        for prefix, extract in fs:
            merged.update(
                (prefix + "_" + key, value)
                for key, value in extract(row).items()
            )
        return merged

    return helper

# Toy record for trying out the combined extractor. The skills are nested
# under 'extension' because that is where create_skills_features looks for
# them; at the top level they would be silently ignored.
company_data = {
    'description': 'Fortune 500 hundred company',
    'extension': {
        'skills': [{'skill': 'waste disposal', 'count': 1},
                   {'skill': 'data science', 'count': 2}],
    },
}


features_generator = combine_features([('description', TransformText(['description'])),
                                       ('skills', create_skills_features)])

features_generator(company_data)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction import DictVectorizer

class JsonTransformer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer turning JSON-like rows into feature dicts.

    Parameters
    ----------
    fs : list of (name, callable) pairs
        Feature extractors; they are merged with ``combine_features``.
    """

    def __init__(self, fs):
        self.fs = fs

    def fit(self, X, y=None):
        """No state is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return one feature dict per row of `X`, in order."""
        # Build the combined extractor once instead of once per row.
        extract = combine_features(self.fs)
        return [extract(x) for x in X]
    
# Chain the dict-producing transformer with DictVectorizer, which turns the
# list of feature dicts into a matrix.
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('skills', create_skills_features)]),
    DictVectorizer()
)

pipeline.fit_transform(data)

In [None]:
# Keep the resulting feature matrix and look at its dimensions.
X = pipeline.fit_transform(data)
X.shape

The number of features is really high, so we need to reduce it. We can remove features that are too sparse.
To check the sparsity of the data we can use the method

X.getnnz (number of non-zero values)

In [None]:
# Count the stored non-zero entries along axis 0, i.e. one count per column.
X.getnnz(0)

Exercise
===============

1. Write a transformation class called `SparsityFilter` that accepts a minimum frequency. Watch out for the `fit` function - this class has some state that you must save

```
class SparsityFilter(BaseEstimator, TransformerMixin):
    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        ???
        return self

    def transform(self, X):
        return ???
```

In [None]:
# write sparsity class here
class SparsityFilter(BaseEstimator, TransformerMixin):
    """Drop columns that have fewer than ``min_nnz`` non-zero entries.

    Parameters
    ----------
    min_nnz : int or None, optional
        Minimum number of non-zero entries a column needs in the data seen
        by ``fit`` to be kept. ``None`` (the default) disables filtering.
    """

    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        """Remember the per-column non-zero counts of `X`."""
        # This is the state transform() needs: the counts must come from
        # the fitted data, not from whatever is transformed later.
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        """Return `X` restricted to the sufficiently dense columns."""
        if self.min_nnz is None:
            # No threshold given: comparing the counts with None would
            # raise a TypeError, so pass the data through unchanged.
            return X
        return X[:, self.sparsity >= self.min_nnz]

Double click to see the solution 

<div class="spoiler">

class SparsityFilter(BaseEstimator, TransformerMixin):
    """Drop columns that have fewer than ``min_nnz`` non-zero entries.

    Parameters
    ----------
    min_nnz : int or None, optional
        Minimum number of non-zero entries a column needs in the data seen
        by ``fit`` to be kept. ``None`` (the default) disables filtering.
    """

    def __init__(self, min_nnz=None):
        self.min_nnz = min_nnz

    def fit(self, X, y=None):
        """Remember the per-column non-zero counts of `X`."""
        # This is the state transform() needs: the counts must come from
        # the fitted data, not from whatever is transformed later.
        self.sparsity = X.getnnz(0)
        return self

    def transform(self, X):
        """Return `X` restricted to the sufficiently dense columns."""
        if self.min_nnz is None:
            # No threshold given: comparing the counts with None would
            # raise a TypeError, so pass the data through unchanged.
            return X
        return X[:, self.sparsity >= self.min_nnz]
</div>

In [None]:
# Same pipeline as before, with a final step that keeps only the columns
# having at least 25 non-zero entries.
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])), 
                     ('skills', create_skills_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25)
)

X = pipeline.fit_transform(data)
X.shape

You should see a shape that is much smaller than the original

Exercise
================

1. Build a model - try to predict the industry.
2. Evaluate its results using cross validation - what would be the best measure for this problem?

In [None]:
# write solution here

Click to see the solution

<div class="spoiler">

from xgboost import XGBClassifier
# cross_val_predict lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_predict

# TransformText expects a key path (a list), and the skills extractor
# defined earlier in the notebook is create_skills_features.
pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])),
                     ('skills', create_skills_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    XGBClassifier()
)

X = data
# Target: the first listed industry, or "" when a company lists none.
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

# Out-of-fold predictions: each row is predicted by a model that was not
# trained on it.
predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

</div>

In [None]:
# cross_val_predict lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import MultinomialNB

def makelist(x):
    """Wrap `x` in a one-element list so the whole value is one token."""
    return [x]

pipeline = make_pipeline(
    JsonTransformer([('description', TransformText(['description'])),
                     ('country', TransformText(['extension','geo_location','country'], makelist)),
                     ('skills', create_skills_features)]),
    DictVectorizer(),
    SparsityFilter(min_nnz=25),
    MultinomialNB()
)

X = data
# Target: the first listed industry, or "" when a company lists none.
y = [row['industries'][0] if len(row['industries']) else "" for row in data]

# Out-of-fold predictions: each row is predicted by a model that was not
# trained on it.
predictions = cross_val_predict(pipeline, X, y)

print("Accuracy = {}".format((predictions == np.array(y)).mean()))

In [None]:
# Number of distinct target labels.
len(set(y))