In [None]:
import requests
import json

from ltr.client.solr_client import SolrClient

# Solr client shared by the cells below; it is built with no arguments, so it
# relies on whatever defaults SolrClient defines.
client = SolrClient()
# Host as reported by the client.
# NOTE(review): later cells in this notebook hard-code the Solr URL rather than
# reading this value - confirm whether `host` is still needed.
host = client.get_host()

## Download, reindex...

1. Download the corpus & judgments
2. Rebuild the index from the tmdb solr config
3. Reindex movies loaded from the corpus

In [None]:
from ltr import download

# Remote sources: the TMDB movie corpus and the binary title judgments.
tmdb_corpus = 'http://es-learn-to-rank.labs.o19s.com/tmdb_ai_pow_search.json'
judgments = 'http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt'

# Fetch both with dest='data/'; the trailing semicolon suppresses cell output.
download(
    [tmdb_corpus, judgments],
    dest='data/',
);

In [None]:
from ltr.index import rebuild
from ltr.helpers.movies import indexable_movies
# Build the documents to index from the downloaded corpus, then hand them to
# rebuild() as the document source for the 'tmdb' index.
movies = indexable_movies(movies='data/tmdb_ai_pow_search.json')
rebuild(client, doc_src=movies, index='tmdb')

## Listing 1, View Doc

In [None]:
# Look up a single movie by id, asking only for the fields shown in the listing.
doc_fields = ['title', 'tagline', 'overview', 'release_year']
client.get_doc(index='tmdb', doc_id='37799', fields=doc_fields)

## Manual boosting

One 'generalizable' relevance solution that gets at the long tail is a manually derived relevance function.

In [None]:
# Hand-tuned ranking function with fixed boosts: title^10, overview^20 and a
# small release_year function boost. `{{!func}}` is an escaped literal
# `{!func}` so that str.format leaves it alone.
fixed_boost_template = """title:({keywords})^10
     overview:({keywords})^20
     {{!func}}release_year^0.01"""

q = fixed_boost_template.format(keywords='mark zuckerberg college')

# edismax request returning only the title field.
solr_q = dict(defType='edismax', fl='title ', q=q)

client.query(index='tmdb', query=solr_q)

In [None]:
# Same ranking function as the previous cell, but with each boost exposed as a
# named placeholder so the weights can be tuned (or learned) independently.
weighted_boost_template = """title:({keywords})^{ti_bm25_weight}
     overview:({keywords})^{ov_bm25_weight}
     {{!func}}release_year^{release_year_weight}"""

boost_weights = {
    'ti_bm25_weight': 10,
    'ov_bm25_weight': 20,
    'release_year_weight': 0.01,
}
q = weighted_boost_template.format(keywords='mark zuckerberg college',
                                   **boost_weights)

# edismax request returning only the title field.
solr_q = dict(defType='edismax', fl='title', q=q)

client.query(index='tmdb', query=solr_q)

In [None]:

def judg_csv(judgment):
    """Render one judgment as a `grade,'title',keywords` CSV row.

    The movie title is looked up in the downloaded TMDB corpus using the
    judgment's `doc_id` (the attribute name used by `Judgment(...)` and by the
    later redefinition of this function in the notebook).

    judgment: object exposing `grade`, `doc_id` and `keywords` attributes.
    Returns the formatted row as a string.
    """
    # Imported lazily, as in the rest of this notebook's helper cells.
    from ltr.helpers.movies import get_movie
    judgedMovie = get_movie(movies='data/tmdb_ai_pow_search.json', tmdb_id=judgment.doc_id)
    return "{grade},'{title}',{keywords}".format(grade=judgment.grade,
                                                 title=judgedMovie['title'],
                                                 keywords=judgment.keywords)

# Make a baby judgment list for book display
from ltr.judgments import judgments_from_file
from itertools import groupby

# Original qid -> row offsets (within that query's judgments) to keep.
to_sample = {
    11: [0, 1, 6, 9],
    40: [0, 5, 12, 7, 20],
}

mini_judg_list = []
judgment_dict = {}
new_qid = 1  # sampled queries are renumbered 1, 2, ... in file order
with open('data/title_judgments_binary.txt') as f:
    for qid, query_judgments in groupby(judgments_from_file(f), key=lambda j: j.qid):
        rows = to_sample.get(qid)
        if rows is None:
            # Not one of the queries picked for the sample.
            continue
        # groupby yields a one-shot iterator; materialize it for indexing.
        query_judgments = list(query_judgments)
        for row in rows:
            sampled = query_judgments[row]
            sampled.qid = new_qid
            mini_judg_list.append(sampled)
        new_qid += 1

mini_judg_list

In [None]:
def judg_csv(judgment):
    """Format a judgment as the CSV row `grade,'title',keywords`.

    The title comes from the TMDB corpus file, looked up by the judgment's
    `doc_id`; `grade` and `keywords` are read straight off the judgment.
    """
    from ltr.helpers.movies import get_movie

    movie = get_movie(movies='data/tmdb_ai_pow_search.json',
                      tmdb_id=judgment.doc_id)
    row_fields = {
        'grade': judgment.grade,
        'title': movie['title'],
        'keywords': judgment.keywords,
    }
    return "{grade},'{title}',{keywords}".format(**row_fields)

### As CSV

In [None]:
# Print each sampled judgment as one CSV row (map is lazy, so rows are
# formatted and printed one at a time).
for csv_row in map(judg_csv, mini_judg_list):
    print(csv_row)

### Dump the file...

In [None]:
from ltr.judgments import judgments_to_file
from io import StringIO

# Write the sampled judgments into an in-memory text buffer instead of a file
# on disk, then show what was written.
string_f = StringIO()
judgments_to_file(string_f, judgmentsList=mini_judg_list)

dumped_judgments = string_f.getvalue()
print(dumped_judgments)

### Same plausible features on each

In [None]:
# Setup some features for this dummy dataset
client.reset_ltr(index='tmdb')

# (feature name, Solr query) pairs, in feature order.
dummy_feature_queries = [
    ("title_bm25", "title:(${keywords})"),              #1
    ("overview_bm25", "overview:(${keywords})"),        #2
    ("release_year", "{!func}def(release_year,2000)"),  #3
]

# Every feature is a SolrFeature stored in the 'dummy' feature store.
ftr_config = [
    {
        "name": ftr_name,
        "store": "dummy",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {"q": ftr_query},
    }
    for ftr_name, ftr_query in dummy_feature_queries
]

# Keep the text of the judgments dumped earlier; it is re-read below for logging.
judgments_string = string_f.getvalue()
client.create_featureset(index='tmdb', name='dummy', ftr_config=ftr_config)

from ltr.judgments import judgments_reader
from ltr.log import FeatureLogger

# Log 'dummy' feature values for every judgment, one query (qid) at a time.
ftr_logger = FeatureLogger(client, index='tmdb', feature_set='dummy')
with judgments_reader(StringIO(judgments_string)) as judgments:
    judgments_by_query = groupby(judgments, key=lambda judgment: judgment.qid)
    for qid, query_judgments in judgments_by_query:
        query_keywords = judgments.keywords(qid)
        ftr_logger.log_for_qid(qid=qid,
                               keywords=query_keywords,
                               judgments=query_judgments)

## Dump the training set

In [None]:
from ltr.judgments import judgments_writer
from io import StringIO

# Write everything the feature logger collected into a fresh in-memory
# buffer, then print the resulting training-set text.
string_f = StringIO()
with judgments_writer(string_f) as writer:
    for logged_judgment in ftr_logger.logged:
        writer.write(logged_judgment)

training_set_text = string_f.getvalue()
print(training_set_text)

In [None]:
import requests

def _movie_feature(name, query):
    """Build one SolrFeature definition belonging to the 'movie' feature store."""
    return {
        "name": name,
        "store": "movie",
        "class": "org.apache.solr.ltr.feature.SolrFeature",
        "params": {"q": query},
    }


# `${keywords}` is filled in at query time from the `efi.keywords` parameter
# passed when the features are requested.
feature_set = [
    _movie_feature("title_bm25", "title:(${keywords})"),
    _movie_feature("overview_bm25", "overview:(${keywords})"),
    _movie_feature("vote_average", "{!func}vote_average"),
]

# Upload the definitions to the tmdb collection's feature store; the response
# object is the cell's displayed result.
requests.put('http://localhost:8983/solr/tmdb/schema/feature-store',
             json=feature_set)

In [None]:
# Request the 'movie' feature-store values (with keywords "social network")
# for the four documents graded for that query.
logging_solr_query = {
    "fl": "id,title,[features store=movie efi.keywords=\"social network\"]",
    # Every id clause is joined with an explicit OR so the match does not
    # depend on the collection's default query operator.
    'q': "id:37799 OR id:267752 OR id:38408 OR id:28303", #social network graded documents
    'rows': 10,
    'wt': 'json'
}

resp = requests.post('http://localhost:8983/solr/tmdb/select',
                     data=logging_solr_query)
resp.json()

In [None]:
from ltr.client.solr_client import SolrClient

# Rebind `client` to a SolrClient pointed at an explicitly given Solr base URL.
solr_base_url = 'http://localhost:8983/solr'
client = SolrClient(host=solr_base_url)

In [None]:
from ltr.judgments import Judgment

# A single judgment: document 37799 graded 1 for the query "social network".
Judgment(grade=1, qid=1, keywords='social network', doc_id=37799)

```
1	qid:1	1:18.135925	2:8.391596	3:2010.0 # 37799	social network
0	qid:1	1:0.0	2:13.237938	3:2013.0 # 267752	social network
0	qid:1	1:0.0	2:9.576859	3:2010.0 # 38408	social network
0	qid:1	1:7.5430527	2:6.839079	3:1970.0 # 28303	social network
1	qid:2	1:14.951998	2:0.0	3:1977.0 # 11	star wars
1	qid:2	1:0.0	2:4.3300323	3:1983.0 # 1892	star wars
0	qid:2	1:5.377082	2:0.0	3:2013.0 # 54138	star wars
0	qid:2	1:7.01165	2:0.0	3:1952.0 # 85783	star wars
0	qid:2	1:0.0	2:0.0	3:2003.0 # 325553	star wars
```


In [None]:
# (grade, qid, keywords, doc_id) for each hand-graded document.
graded_rows = [
    (1, 1, 'social network', 37799),
    (0, 1, 'social network', 267752),
    (0, 1, 'social network', 38408),
    (0, 1, 'social network', 28303),
    (1, 2, 'star wars', 11),
    (1, 2, 'star wars', 1892),
    (0, 2, 'star wars', 54138),
    (0, 2, 'star wars', 85783),
    (0, 2, 'star wars', 325553),
]

mini_judg_list = [
    Judgment(grade=row_grade, qid=row_qid, keywords=row_keywords, doc_id=row_doc_id)
    for row_grade, row_qid, row_keywords, row_doc_id in graded_rows
]

from ltr.judgments import judgments_writer
from io import StringIO

# Serialize the hand-built judgments into an in-memory text buffer.
string_file = StringIO()
with judgments_writer(string_file) as writer:
    for j in mini_judg_list:
        writer.write(j)

# Print the buffer written just above (not `string_f`, which belongs to an
# earlier cell and would show stale output or be undefined on a fresh kernel).
print(string_file.getvalue())




In [None]:
# Inspect the `features` attribute of the first hand-built judgment.
first_judgment = mini_judg_list[0]
first_judgment.features

In [None]:
from ltr import download
from ltr.log import FeatureLogger
from ltr.judgments import judgments_open
from itertools import groupby

# Download the binary title judgments with dest='data/'.
judgments='http://es-learn-to-rank.labs.o19s.com/title_judgments_binary.txt'
download([judgments], dest='data/')

ftr_logger=FeatureLogger(client, index='tmdb', feature_set='movie')

# Open the judgment file that was just downloaded (the same path the sampling
# cell reads); the previous path, data/title_judgments.txt, is not produced by
# any cell in this notebook.
with judgments_open('data/title_judgments_binary.txt') as judgment_list:
    # Log 'movie' feature values one query (qid) at a time.
    for qid, query_judgments in groupby(judgment_list, key=lambda j: j.qid):
        ftr_logger.log_for_qid(judgments=query_judgments,
                               qid=qid,
                               keywords=judgment_list.keywords(qid))

ftr_logger.logged