# Solr Client

In [1]:
from ltr.client import SolrClient
client = SolrClient()

# Download & Build Index (run once)

If you don't already have the downloaded dependencies; if you don't have TheMovieDB data indexed run this

In [2]:
from ltr import download

corpus='http://es-learn-to-rank.labs.o19s.com/tmdb.json'
judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments.txt'
download([corpus, judgments], dest='data/');

data/tmdb.json already exists
data/title_judgments.txt already exists


In [None]:
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies

movies=indexable_movies(movies='data/tmdb.json')
rebuild(client, index='tmdb', doc_src=movies)

## Features for movie titles

We'll be searching movie titles (think searching for a specific movie on Netflix). And we have a set of judgments around the appropriatte movie to return. IE search for "Star Wars" return good star wars matches, in quality order...

These cover various aspects of the problem (searching title by phrase, title bm25 score, release date, etc). We'll use this to explore and analyze a simple model

In [3]:
client.reset_ltr(index='tmdb')

config = [
    #1
    {
      "name" : "title_has_phrase",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:\"${keywords})\"^=1"
      }
    },
    #2
    {
      "name" : "title_has_terms",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})^=1"
      }
    },
    #3
    {
      "name" : "title_bm25",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})"
      }
    },
    #4
    {
      "name" : "overview_bm25",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:(${keywords})"
      }
    },
    #5
    {
      "name" : "overview_phrase_bm25",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:\"${keywords}\""
      }
    },
    #6
    {
      "name" : "title_fuzzy",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!lucene df=title}${keywords}~"
      }
    },
    #7
    {
      "name" : "release_year",
      "store": "title",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!func}def(release_year,2000)"
      }
    }

]

client.create_featureset(index='tmdb', name='title', ftr_config=config)

Deleted title model [Status: 200]
Deleted title Featurestore [Status: 200]
Created title feature store under tmdb: [Status: 200]


## Training Set Generation

Log out features for each of the above queries out to a training set file

In [4]:
from ltr.judgments import judgments_open
from ltr.log import FeatureLogger
from itertools import groupby

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='title')
with judgments_open('data/title_judgments.txt') as judgment_list:
    for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
        ftr_logger.log_for_qid(qid=qid, 
                               judgments=query_judgments,
                               keywords=judgment_list.keywords(qid))
        
training_set=ftr_logger.logged

Recognizing 40 queries...
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 33
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 38
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 34
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 30
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 32
Searching tmdb [Status: 200]
Discarded 0 Ke

## Feature Search: which features work best?

What combination of these features work best? Train a model with every combination, and use k-fold cross valudation (see `kcv=15` below). The combination with the best NDCG is output

In [5]:
from ltr.ranklib import feature_search
rankLibResult, ndcgPerFeature = feature_search(client,
                                               training_set=training_set,
                                               metric2t='NDCG@10',
                                               leafs=20,
                                               trees=20,
                                               kcv=15,
                                               features=[1,2,7],
                                               featureSet='title')

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
for ftrId, impact in trainLogs[-1].impacts.items():
    print("{} - {}".format(ftrId, impact))
    
for roundDcg in trainLogs[-1].rounds:
    print(roundDcg)
    
print("Avg NDCG@10 when feature included:")
for ftrId, ndcg in ndcgPerFeature.items():
    print("%s => %s" % (ftrId, ndcg))
    
print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [1] TEST NDCG@10=0.9071
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [2] TEST NDCG@10=0.9064
/var/folders/vc/thmh159x5xddb6_cgtx7

## Compare to model w/ all features

Compare the features output above (something like...)

```
Impact of each feature on the model
7 - 17618.35445148437
4 - 16165.586045512271
3 - 10958.610341321868
5 - 9256.821192289186
1 - 1436.0640878600943
```

to one trained with the full model. Notice how features have different impacts. This is due to feature dependency

In [6]:
from ltr.ranklib import train
trainResponse = train(client,
                  training_set=training_set,
                  metric2t='NDCG@10',
                  leafs=20,
                  trees=20,
                  features=[1,2,3,4,5,6,7],
                  featureSet='title',
                  index='tmdb',
                  modelName='title')

trainLog = trainResponse.trainingLogs[0]

print()
print("Impact of each feature on the model")
for ftrId, impact in trainLog.impacts.items():
    print("{} - {}".format(ftrId, impact))
    
for roundDcg in trainLog.rounds:
    print(roundDcg)
    
print("Train NDCG@10 %s" % trainLog.rounds[-1])

/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/title_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt
DONE
Submit Model title Ftr Set title [Status: 200]
Feature Set title... [Status: 200]
Deleted Model title [Status: 200]
Created Model title [Status: 200]
Done

Impact of each feature on the model
3 - 33716.81189167293
7 - 28769.019966278764
4 - 14862.855713290153
6 - 5028.751391215197
1 - 1742.7824900498674
5 - 216.52853462896087
2 - 0.0
0.9178
0.9207
0.9207
0.9207
0.9198
0.9216
0.9216
0.922
0.922
0.9217
0.9352
0.9362
0.9364
0.9364
0.9368
0.9368
0.9378
0.9377
0.9369
0.9378
Train NDCG@10 0.9378


## Bias towards fewer features

By adding a 'cost', to feature search, we add a multiplier that punishes models with more features slightly. This results in a tiny bias towards simpler models all things being equal. As we'd prefer one that doesn't need to execute more features

In [7]:
from ltr.ranklib import feature_search
rankLibResult, ndcgPerFeature = feature_search(client,
                                               training_set=training_set,
                                               metric2t='NDCG@10',
                                               leafs=20,
                                               trees=20,
                                               kcv=15,
                                               featureCost=0.1,# adjustedNDCG = NDCG * ( (1.0-cost) ^ num_features)
                                               features=[1,2,3,4,5,6,7],
                                               featureSet='title')

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
for ftrId, impact in trainLogs[-1].impacts.items():
    print("{} - {}".format(ftrId, impact))
    
for roundDcg in trainLogs[-1].rounds:
    print(roundDcg)
    
print("Avg NDCG@10 when feature included:")
for ftrId, ndcg in ndcgPerFeature.items():
    print("%s => %s" % (ftrId, ndcg))
    
print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [1] TEST NDCG@10=0.9071 after cost 0.9071
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [2] TEST NDCG@10=0.9064 after cost 0.9064


DONE
Trying features [2, 6] TEST NDCG@10=0.8611 after cost 0.77499
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [2, 7] TEST NDCG@10=0.5693 after cost 0.51237
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -

DONE
Trying features [1, 3, 4] TEST NDCG@10=0.9068 after cost 0.734508
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [1, 3, 5] TEST NDCG@10=0.9036 after cost 0.731916
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/featur

DONE
Trying features [2, 4, 7] TEST NDCG@10=0.5514 after cost 0.44663400000000003
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [2, 5, 6] TEST NDCG@10=0.876 after cost 0.7095600000000001
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx

DONE
Trying features [1, 2, 3, 6] TEST NDCG@10=0.9013 after cost 0.6570477000000001
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [1, 2, 3, 7] TEST NDCG@10=0.8894 after cost 0.6483726000000001
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb

DONE
Trying features [1, 4, 6, 7] TEST NDCG@10=0.9055 after cost 0.6601095
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt -kcv 15 
DONE
Trying features [1, 5, 6, 7] TEST NDCG@10=0.894 after cost 0.6517260000000001
/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/temp_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778s

KeyboardInterrupt: 

# Evaluating the Model

It's interesting to see what features our model makes use of, but we need guidance on adding additional features to the model. We know our model is an ensemble of decision trees. Wouldn't it be cool if we could trace where documents end up on that decision tree?

Specifically, we care about problems. Or what we will call affectionately *whoopsies*. 

As a 'whoopsie' example, consider the query "Rambo". if a '0' document like 'First Daughter' ranked the same or higher than a '4' document ("Rambo")., that's a problem. It's also an opportunity for improvement. We'd want to isolate that, see if it's indicative of a broader trend, and thus worth adding a feature for.

Let's see a concrete example

In [8]:
from ltr.MART_model import eval_model
from itertools import groupby

features, _ = client.feature_set(index='tmdb', name='title')

for qid, query_judgments in groupby(training_set, key=lambda j: j.qid):
    print(qid)
    if qid == 1: # Rambo
        model = eval_model(modelName='title',
                               features=features,
                               judgments=query_judgments)

        print()
        print("## Evaluating graded docs for search keywords 'Rambo'")
        print()
        print(model)
        break

Feature Set title... [Status: 200]
1
Whoopsies on LAMBDAMart
Header At 6

## Evaluating graded docs for search keywords 'Rambo'

if title_bm25 > 11.069752:
  if title_has_phrase > 0.0:
    if title_fuzzy > 11.173207:
      if overview_bm25 > 0.0:
        if overview_bm25 > 8.769033:
          if title_fuzzy > 19.553139:
            <= 0.2000(0/0/)
          else:
            if release_year > 2014.0:
              <= 0.2000(0/0/)
            else:
              <= 0.1970(0/0/)
        else:
          if title_bm25 > 17.690922:
            <= 0.2000(0/0/)
          else:
            if release_year > 1999.0:
              <= 0.2000(0/0/)
            else:
              <= 0.2000(0/0/)
      else:
        if title_bm25 > 15.621796:
          if release_year > 2000.0:
            <= 0.1915(0/0/)
          else:
            if title_bm25 > 15.725251:
              if title_bm25 > 16.656357:
                <= 0.2000(0/0/)
              else:
                <= 0.2000(0/0/)
            else

## Examining our evaluation for whoopsies

Let's looks at one tree in our ensemble, te see how it was evaluated.

```
if title_bm25 > 10.664251:
  if title_phrase > 0.0:
    if title_bm25 > 13.815164:
      if release_year > 2000.0:
        <= 0.1215(0/0/)
      else:
        <= 0.1240(0/0/)
    else:
      if title_bm25 > 10.667803:
        if overview_bm25 > 0.0:
          <= 0.1194(0/0/)
        else:
          <= 0.1161(1/0/)
      else:
        <= 0.1264(0/0/)
  else:
    <= 0.0800(0/0/)
else:
  if title_phrase > 0.0:
    if title_bm25 > 8.115499:
      if title_bm25 > 8.217656:
        <= 0.1097(2/1/qid:40:2(12180)-3(140607))
      else:
        <= 0.1559(0/0/)
    else:
      <= -0.0021(2/1/qid:40:2(1895)-3(330459))
  else:
    <= -0.1093(25/1/qid:40:0(85783)-3(1892))
```

You'll notice here this tree is represented by a series of if statements, where the feature's name is used. This is handy as it lets us take apart the structure of the tree.

You'll also notice the leaf nodes starting with 

```
<=
```

These leaf nodes have a floating point value, corresponding to the relevance score that documents ending up here will have. Each leaf also has three items in paranthesis, such as `(2/1/qid:40:2(1895)-3(330459))`. This is a report summarizing the result of evaluating the tree on the provided judgment list. Indicating:


```



   +--- 2 Documents evaluated to this leaf node                   +-- max grade doc eval'd to this leaf
   |                                                              |
   | +----- 1 'whoopsie' occured                                  |  +-- corresp. doc id of max doc
   | |                                                            |  |
   | |   +--- details on each whoopsie ----------- qid:40:2(1985)-3(330459)
   | |   |                                              | |  |
  (2/1/qid:40:2(1895)-3(330459))                        | |  |
                                                        | |  + doc id of min graded doc
                                                        | |
                                                        | + min grade of docs eval'd to this leaf
                                                        |
                                                        + query id of whoopsie from judgments
```


Looking at Star Wars, our biggest issues in this tree are with the bottom-most leaf. Here

```
if title_bm25 > 10.664251:
  ...
else:
  if title_phrase > 0.0:
    ...
  else:
    <= -0.1093(25/1/qid:40:0(85783)-3(1892))
```


Document 85783 (a '0') and doc 1892 are given the same grade.

### Whoopsie, from the query perspective

Whoopsies can also be examined at the "query" level to see for a query id, how many whoopsies existed, and what was the evaluation for that query at each tree. This can help see if an error was fixed later in the ensemble of trees.

In [9]:
whoopsies = model.whoopsies()
for qid, whoopsie in whoopsies.items():
    print("== QID %s ==" % qid)
    print("%s - %s" % (whoopsie.count, whoopsie.totalMagnitude))
    print(whoopsie.perTreeReport())

== QID 1 ==
20 - 42
tree:0=>0(319074)-4(1368);tree:1=>0(319074)-4(1368);tree:2=>0(319074)-4(1368);tree:3=>0(319074)-4(1368);tree:4=>0(319074)-4(1368);tree:5=>0(319074)-2(13258);tree:6=>0(319074)-2(13258);tree:7=>0(319074)-2(13258);tree:8=>0(319074)-2(13258);tree:9=>0(319074)-2(13258);tree:10=>0(319074)-1(61410);tree:11=>1(31362)-4(1368);tree:12=>0(319074)-1(61410);tree:13=>0(319074)-1(61410);tree:14=>0(319074)-1(61410);tree:15=>0(319074)-1(61410);tree:16=>0(319074)-1(61410);tree:17=>0(319074)-1(208982);tree:18=>0(319074)-1(208982);tree:19=>0(319074)-1(208982)


## Examine problem doc 319074

(notice nothing mentions 'star wars')

In [10]:
client.get_doc(index='tmdb', doc_id=1368)

{'id': '1368',
 'title': ['First Blood'],
 'title_bidirect_syn': ['First Blood'],
 'title_directed_syn': ['First Blood'],
 'title_multiterm_syn': ['First Blood'],
 'title_idioms': ['First Blood'],
 'text_all_idioms': ['First Blood',
  'When former Green Beret John Rambo is harassed by local law enforcement and arrested for vagrancy, the Vietnam vet snaps, runs for the hills and rat-a-tat-tats his way into the action-movie hall of fame. Hounded by a relentless sheriff, Rambo employs heavy-handed guerilla tactics to shake the cops off his tail.',
  "This time he's fighting for his life.",
  'Ted Kotcheff',
  'Sylvester Stallone Richard Crenna Brian Dennehy Bill McKinney Jack Starrett Michael Talbott Chris Mulkey John McLiam Alf Humphreys David Caruso David L. Crowley Don MacKay Charles A. Tamburro David Petersen Craig Huston',
  'Action',
  'Adventure',
  'Thriller',
  'War'],
 'title_taxonomy': ['First Blood'],
 'text_all_taxonomy': ['First Blood',
  'When former Green Beret John Rambo 

## Add a feature: collection name

We have an intuition about our data, there is a field for the movies "collection name". See it here below:

In [11]:
from ltr.helpers.movies import get_movie
get_movie(1368)

{'id': 1368,
 'title': 'First Blood',
 'video': False,
 'mlensId': '2403',
 'vote_average': 7.1,
 'backdrop_path': '/cU3goO0TMNDWxgDwILAARyiblXK.jpg',
 'tagline': "This time he's fighting for his life.",
 'directors': [{'id': 16544,
   'department': 'Directing',
   'credit_id': '52fe42efc3a36847f802e18b',
   'name': 'Ted Kotcheff',
   'profile_path': '/r4rMwZeClwJR2VCgvYsNjLOTXpB.jpg',
   'job': 'Director'}],
 'release_date': '1982-10-22',
 'belongs_to_collection': {'poster_path': '/feGOEOVrOLyjtEnVa88rQLgD3XY.jpg',
  'id': 5039,
  'backdrop_path': '/Yt2ZxbJv2HM842B6FNMr59Vhyb.jpg',
  'name': 'Rambo Collection'},
 'runtime': 93,
 'popularity': 3.06572,
 'status': 'Released',
 'original_language': 'en',
 'cast': [{'order': 0,
   'id': 16483,
   'cast_id': 20,
   'credit_id': '52fe42efc3a36847f802e1fb',
   'name': 'Sylvester Stallone',
   'profile_path': '/gnmwOa46C2TP35N7ARSzboTdx2u.jpg',
   'character': 'John J. Rambo'},
  {'order': 1,
   'id': 16554,
   'cast_id': 21,
   'credit_id': 

## Now reindex with collection name...

We'll add collection name, and reindex.

In [12]:
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies

def add_collection_and_char_name(src_movie, base_doc):
    if 'belongs_to_collection' in src_movie and src_movie['belongs_to_collection'] is not None:
        if 'name' in src_movie['belongs_to_collection']:
            base_doc['collection_name_en'] = src_movie['belongs_to_collection']['name']
            
    if 'cast' in src_movie and src_movie['cast'] is not None:
        characters = [cast['character'] for cast in get_movie(1368)['cast']][:5]
        base_doc['top_cast_en'] = characters
    return base_doc

movies=indexable_movies(movies='data/tmdb.json', enrich=add_collection_and_char_name)
rebuild(client, index='tmdb', doc_src=movies)

Reconfig from disk...
Deleted index tmdb [Status: 200]


 16%|█▋        | 4533/27846 [00:00<00:00, 45322.16it/s]

Created index tmdb [Status: 200]
Reindexing...
Indexed 0 movies (last Black Mirror: White Christmas)
Indexed 100 movies (last Apocalypse Now)
Indexed 200 movies (last Crooks in Clover)
Indexed 300 movies (last For a Few Dollars More)
Indexed 400 movies (last Downfall)
Indexed 500 movies (last Finding Nemo)
Indexed 600 movies (last Platoon)
Indexed 700 movies (last Night of the Living Dead)
Indexed 800 movies (last Evangelion: 1.0: You Are (Not) Alone)
Indexed 900 movies (last Batman: Assault on Arkham)
Indexed 1000 movies (last Riley's First Date?)
Indexed 1100 movies (last The Raid)
Indexed 1200 movies (last Falling Down)
Indexed 1300 movies (last Kal Ho Naa Ho)
Indexed 1400 movies (last Elizabeth)
Indexed 1500 movies (last Irreversible)
Indexed 1600 movies (last Friday Night Lights)
Indexed 1700 movies (last Ben X)
Indexed 1800 movies (last Pump up the Volume)
Indexed 1900 movies (last Armour of God)
Indexed 2000 movies (last Swingers)
Indexed 2100 movies (last The Guard)
Indexed 220

 21%|██        | 5893/27846 [00:12<00:59, 367.62it/s]  

Done [Status: 200]
Indexed 5000 movies (last Absolutely Anything)
Indexed 5100 movies (last Funny People)
Indexed 5200 movies (last Tad, the Lost Explorer)
Indexed 5300 movies (last Fiston)
Indexed 5400 movies (last The Reaping)
Indexed 5500 movies (last American Pie Presents: Beta House)
Indexed 5600 movies (last Casino Royale)
Indexed 5700 movies (last Jennifer's Body)
Indexed 5800 movies (last Caligula)
Indexed 5900 movies (last Alex Cross)
Indexed 6000 movies (last Rapture Palooza)
Indexed 6100 movies (last The Seeker: The Dark Is Rising)
Indexed 6200 movies (last Boogeyman)
Indexed 6300 movies (last Fifty Shades of Black)
Indexed 6400 movies (last Wilbur Wants to Kill Himself)
Indexed 6500 movies (last A Cruel Romance)
Indexed 6600 movies (last La discrète)
Indexed 6700 movies (last The Rapture)
Indexed 6800 movies (last Ricky Gervais Live 3: Fame)
Indexed 6900 movies (last Bunny Lake Is Missing)
Indexed 7000 movies (last Alice in the Cities)
Indexed 7100 movies (last Le pélican)


 46%|████▌     | 12867/27846 [00:22<00:27, 539.27it/s]

Done [Status: 200]
Indexed 10000 movies (last Titus)
Indexed 10100 movies (last Revolution)
Indexed 10200 movies (last The Last Dispatch)
Indexed 10300 movies (last Wings of Courage)
Indexed 10400 movies (last Gaslight)
Indexed 10500 movies (last Frenchmen 2)
Indexed 10600 movies (last Closing the Ring)
Indexed 10700 movies (last 3:10 to Yuma)
Indexed 10800 movies (last Harvest)
Indexed 10900 movies (last Song of the Thin Man)
Indexed 11000 movies (last Nobody Knows Anything!)
Indexed 11100 movies (last Dirty Deeds)
Indexed 11200 movies (last The Adventure of Faustus Bidgood)
Indexed 11300 movies (last Running Time)
Indexed 11400 movies (last While She Was Out)
Indexed 11500 movies (last What?)
Indexed 11600 movies (last The Legend of Hell House)
Indexed 11700 movies (last When Worlds Collide)
Indexed 11800 movies (last The Girl in the Red Velvet Swing)
Indexed 11900 movies (last 9 Souls)
Indexed 12000 movies (last Paradise Lost 2: Revelations)
Indexed 12100 movies (last Riki-Oh: The S

 54%|█████▍    | 15036/27846 [00:32<00:34, 374.16it/s]

Done [Status: 200]
Indexed 15000 movies (last Miracles: Mr. Canton and Lady Rose)
Indexed 15100 movies (last An Occurrence at Owl Creek Bridge)
Indexed 15200 movies (last 16 to Life)
Indexed 15300 movies (last Week-End at the Waldorf)
Indexed 15400 movies (last Made in Hong Kong)
Indexed 15500 movies (last Viva Cuba)
Indexed 15600 movies (last Big Pun: The Legacy)
Indexed 15700 movies (last Hurt)
Indexed 15800 movies (last The Mudge Boy)
Indexed 15900 movies (last The Hollywood Complex)
Indexed 16000 movies (last The Great Northfield Minnesota Raid)
Indexed 16100 movies (last Lotta Leaves Home)
Indexed 16200 movies (last Just One of the Girls)
Indexed 16300 movies (last Which Way Is The Front Line From Here? The Life and Time of Tim Hetherington)
Indexed 16400 movies (last The Ladies Man)
Indexed 16500 movies (last Assassin of the Tsar)
Indexed 16600 movies (last The Adventures of Tarzan)
Indexed 16700 movies (last Vendetta)
Indexed 16800 movies (last Trucker)
Indexed 16900 movies (las

 72%|███████▏  | 20053/27846 [00:41<00:18, 410.87it/s]

Done [Status: 200]
Indexed 20000 movies (last Left Behind III: World at War)
Indexed 20100 movies (last Dragon Ball Z: Lord Slug)
Indexed 20200 movies (last The Adventures of Sherlock Holmes)
Indexed 20300 movies (last Billy's Hollywood Screen Kiss)
Indexed 20400 movies (last Short Night of Glass Dolls)
Indexed 20500 movies (last Kawa)
Indexed 20600 movies (last Bears)
Indexed 20700 movies (last Pyrates)
Indexed 20800 movies (last Bastard Out of Carolina)
Indexed 20900 movies (last The Mole People)
Indexed 21000 movies (last Till Human Voices Wake Us)
Indexed 21100 movies (last It's a Wonderful Afterlife)
Indexed 21200 movies (last The Bingo Long Traveling All-Stars & Motor Kings)
Indexed 21300 movies (last Ciao! Manhattan)
Indexed 21400 movies (last The Night They Raided Minsky's)
Indexed 21500 movies (last The Girl Can't Help It)
Indexed 21600 movies (last Sam Peckinpah's West: Legacy of a Hollywood Renegade)
Indexed 21700 movies (last A Guy Named Joe)
Indexed 21800 movies (last Odd 

100%|██████████| 27846/27846 [00:51<00:00, 538.67it/s]

Done [Status: 200]
Indexed 25000 movies (last Holiday for Drumsticks)
Indexed 25100 movies (last China Blue)
Indexed 25200 movies (last Pancho, el perro millonario)
Indexed 25300 movies (last The Diary of Anne Frank)
Indexed 25400 movies (last To Be Twenty)
Indexed 25500 movies (last Empire of Silver)
Indexed 25600 movies (last Knockout)
Indexed 25700 movies (last Speed & Angels)
Indexed 25800 movies (last Meek's Cutoff)
Indexed 25900 movies (last Sharpe's Sword)
Indexed 26000 movies (last May 18)
Indexed 26100 movies (last Dealer)
Indexed 26200 movies (last Carmen Miranda: Bananas Is My Business)
Indexed 26300 movies (last Il figlio dello sceicco)
Indexed 26400 movies (last Muddy River)
Indexed 26500 movies (last Judy Moody and the Not Bummer Summer)
Indexed 26600 movies (last The Naughty Room)
Indexed 26700 movies (last Time Without Pity)
Indexed 26800 movies (last L'outremangeur)
Indexed 26900 movies (last Buddha Collapsed Out of Shame)
Indexed 27000 movies (last Uno contro l'altro 




Done [Status: 200]
Committed index tmdb [Status: 200]
Done


Confirm it's in our doc now...

In [13]:
client.get_doc(index='tmdb', doc_id=319074)

{'id': '319074',
 'title': ['In Football We Trust'],
 'title_bidirect_syn': ['In Football We Trust'],
 'title_directed_syn': ['In Football We Trust'],
 'title_multiterm_syn': ['In Football We Trust'],
 'title_idioms': ['In Football We Trust'],
 'text_all_idioms': ['In Football We Trust',
  '‘In Football We Trust’ captures a snapshot in time amid the rise of the Pacific Islander presence in the NFL. Presenting a new take on the American immigrant story, this feature length documentary transports viewers deep inside the tightly-knit Polynesian community in Salt Lake City, Utah. With unprecedented access and shot over a four-year time period, the film intimately portrays four young Polynesian men striving to overcome gang violence and near poverty through American football. Viewed as the "salvation" for their families, these young players reveal the culture clash they experience as they transform out of their adolescence and into the high stakes world of collegiate recruiting and rigors o

## Add it to the features, and regenerate training data....

In [14]:
client.reset_ltr(index='tmdb')


config = [
    #1
    {
      "name" : "title_has_phrase",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:\"${keywords})\"^=1"
      }
    },
    #2
    {
      "name" : "title_has_terms",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})^=1"
      }
    },
    #3
    {
      "name" : "title_bm25",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "title:(${keywords})"
      }
    },
    #4
    {
      "name" : "overview_bm25",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:(${keywords})"
      }
    },
    #5
    {
      "name" : "overview_phrase_bm25",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "overview:\"${keywords}\""
      }
    },
    #6
    {
      "name" : "title_fuzzy",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!lucene df=title}${keywords}~"
      }
    },
    #7
    {
      "name" : "release_year",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "{!func}def(release_year,2000)"
      }
    },
    #8 Collection Name BM25 Score
    {
      "name" : "coll_name_bm25",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "collection_name_en:(${keywords})"
      }
    },
    #9 Collection Name Phrase BM25 Score
    {
      "name" : "coll_name_phrase_bm25",
      "store": "title2",
      "class" : "org.apache.solr.ltr.feature.SolrFeature",
      "params" : {
        "q" : "collection_name_en:\"${keywords}\""
      }
    }

]


client.create_featureset(index='tmdb', name='title2', ftr_config=config)

from ltr.judgments import judgments_open
from ltr.log import FeatureLogger
from itertools import groupby

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='title2')
with judgments_open('data/title_judgments.txt') as judgment_list:
    for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
        ftr_logger.log_for_qid(qid=qid, 
                               judgments=query_judgments,
                               keywords=judgment_list.keywords(qid))
        
training_set=ftr_logger.logged

Created title2 feature store under tmdb: [Status: 200]
Recognizing 40 queries...
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 41
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 33
Searching tmdb [Status: 200]
Discarded 0 Keep 39
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 38
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 28
Searching tmdb [Status: 200]
Discarded 0 Keep 34
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 30
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 31
Searching tmdb [Status: 200]
Discarded 0 Keep 35
Searching tmdb [Status: 200]
Discarde

## Now a feature search

And do a feature search over these new features (go get some coffee).

We also up the number of trees & leafs to see if it has an impact

In [None]:
from ltr.ranklib import feature_search
rankLibResult, ndcgPerFeature = feature_search(client,
                                               training_set=training_set,
                                               metric2t='NDCG@10',
                                               leafs=20,
                                               trees=20,
                                               kcv=15,
                                               features=[1,2,3,4,5,6,7,8,9],
                                               featureSet='title2')

print()
print("Impact of each feature on the model")
trainLogs = rankLibResult.trainingLogs
for ftrId, impact in trainLogs[-1].impacts.items():
    print("{} - {}".format(ftrId, impact))
    
for roundDcg in trainLogs[-1].rounds:
    print(roundDcg)
    
print("Avg NDCG@10 when feature included:")
for ftrId, ndcg in ndcgPerFeature.items():
    print("%s => %s" % (ftrId, ndcg))
    
print("Avg K-Fold NDCG@10 %s" % rankLibResult.kcvTestAvg)

## Review new feature impacts

Impact of each feature on the model... this is the best mix. Feature 8 helps, but not feature 9 as much. Interesting

```
4 - 18032.527656827504
3 - 9801.409052757816
5 - 8051.741259194476
7 - 5711.964176322393
8 - 3798.6132329430748
1 - 1439.2180228991883
```

## Now save away this model

In [16]:
from ltr.ranklib import train
trainResponse  = train(client,
                  training_set=training_set,
                  metric2t='NDCG@10',
                  leafs=20,
                  trees=20,
                  features=[1,3,4,5,7,8],
                  featureSet='title2',
                  index='tmdb',
                  modelName='title2')

trainLog = trainResponse.trainingLogs[0]

print()
print("Impact of each feature on the model")
for ftrId, impact in trainLog.impacts.items():
    print("{} - {}".format(ftrId, impact))
    
for roundDcg in trainLog.rounds:
    print(roundDcg)
    
print("Train NDCG@10 %s" % trainLog.rounds[-1])

/var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar already exists
Running java -jar /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/RankyMcRankFace.jar -ranker 6 -shrinkage 0.1 -metric2t NDCG@10 -tree 20 -bag 1 -leaf 20 -frate 1.0 -srate 1.0 -train /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/training.txt -save data/title2_model.txt  -feature /var/folders/vc/thmh159x5xddb6_cgtx778sc0000gn/T/features.txt
DONE
Submit Model title2 Ftr Set title2 [Status: 200]
Feature Set title2... [Status: 200]
Deleted Model title2 [Status: 200]
Created Model title2 [Status: 200]
Done

Impact of each feature on the model
3 - 45970.19084259676
4 - 14863.915643776061
5 - 6010.83353619383
8 - 5937.817286358266
1 - 1747.9503211101292
7 - 903.9746857424649
0.9169
0.9261
0.9226
0.9259
0.9265
0.9265
0.9265
0.9282
0.9282
0.9282
0.9422
0.9463
0.9465
0.9469
0.9465
0.9474
0.9482
0.9508
0.9508
0.951
Train NDCG@10 0.951


In [17]:
from ltr import search
search(client, "rambo", modelName='title2')

['Rambo'] 
2.1835294 
2008 
['Action', 'Thriller'] 
["When governments fail to act on behalf of captive missionaries, ex-Green Beret John James Rambo sets aside his peaceful existence along the Salween River in a war-torn region of Thailand to take action.  Although he's still haunted by violent memories of his time as a U.S. soldier during the Vietnam War, Rambo can hardly turn his back on the aid workers who so desperately need his help."] 
---------------------------------------
['Rambo III'] 
1.9167165 
1988 
['Action', 'Adventure', 'Thriller', 'War'] 
["Combat has taken its toll on Rambo, but he's finally begun to find inner peace in a monastery. When Rambo's friend and mentor Col. Trautman asks for his help on a top secret mission to Afghanistan, Rambo declines but must reconsider when Trautman is captured."] 
---------------------------------------
['Rambo: First Blood Part II'] 
1.0534304 
1985 
['Action', 'Adventure', 'Drama', 'Thriller', 'War'] 
["Col. Troutman recruits ex-Gr

## Examine Model 2

In [18]:
from ltr.MART_model import eval_model
from itertools import groupby

features, _ = client.feature_set(index='tmdb', name='title2')

for qid, query_judgments in groupby(training_set, key=lambda j: j.qid):
    if qid == 1:
        model = eval_model(modelName='title',
                           features=features,
                           judgments=query_judgments)

        print()
        print("## Evaluating graded docs for search keywords 'Rambo'")
        print()
        print(model)
        break

Feature Set title2... [Status: 200]
Whoopsies on LAMBDAMart
Header At 6

## Evaluating graded docs for search keywords 'Rambo'

if title_bm25 > 11.069752:
  if title_has_phrase > 0.0:
    if title_fuzzy > 11.173207:
      if overview_bm25 > 0.0:
        if overview_bm25 > 8.769033:
          if title_fuzzy > 19.553139:
            <= 0.2000(0/0/)
          else:
            if release_year > 2014.0:
              <= 0.2000(0/0/)
            else:
              <= 0.1970(0/0/)
        else:
          if title_bm25 > 17.690922:
            <= 0.2000(0/0/)
          else:
            if release_year > 1999.0:
              <= 0.2000(0/0/)
            else:
              <= 0.2000(0/0/)
      else:
        if title_bm25 > 15.621796:
          if release_year > 2000.0:
            <= 0.1915(0/0/)
          else:
            if title_bm25 > 15.725251:
              if title_bm25 > 16.656357:
                <= 0.2000(0/0/)
              else:
                <= 0.2000(0/0/)
            else:

In [19]:
whoopsies = model.whoopsies()
for qid, whoopsie in whoopsies.items():
    print("== QID %s ==" % qid)
    print("%s - %s" % (whoopsie.count, whoopsie.totalMagnitude))
    print(whoopsie.perTreeReport())

== QID 1 ==
20 - 42
tree:0=>0(319074)-4(1368);tree:1=>0(319074)-4(1368);tree:2=>0(319074)-4(1368);tree:3=>0(319074)-4(1368);tree:4=>0(319074)-4(1368);tree:5=>0(319074)-2(13258);tree:6=>0(319074)-2(13258);tree:7=>0(319074)-2(13258);tree:8=>0(319074)-2(13258);tree:9=>0(319074)-2(13258);tree:10=>0(319074)-1(61410);tree:11=>1(31362)-4(1368);tree:12=>0(319074)-1(61410);tree:13=>0(319074)-1(61410);tree:14=>0(319074)-1(61410);tree:15=>0(319074)-1(61410);tree:16=>0(319074)-1(61410);tree:17=>0(319074)-1(208982);tree:18=>0(319074)-1(208982);tree:19=>0(319074)-1(208982)


```
== QID 1 ==
10 - 40
tree:0=>0(319074)-4(1368);tree:1=>0(319074)-4(1368);tree:2=>0(319074)-4(1368);tree:3=>0(319074)-4(1368);tree:4=>0(319074)-4(1368);tree:5=>0(319074)-4(1368);tree:6=>0(319074)-4(1368);tree:7=>0(319074)-4(1368);tree:8=>0(319074)-4(1368);tree:9=>0(319074)-4(1368)
```

In [20]:
from ltr.helpers.movies import get_movie
[cast['character'] for cast in get_movie(1368)['cast']][:5]

['John J. Rambo',
 'Col. Samuel Trautman',
 'Hope Sheriff Will Teasle',
 'State Police Capt. Dave Kern',
 'Deputy Sgt. Arthur Galt']