In [36]:
import json
import os
import pandas as pd
import elasticsearch

In [37]:
# Directory holding the MovieLens CSV files. Set the MOVIELENS_DATA_DIR
# environment variable to point somewhere else without editing the notebook;
# the original hard-coded location remains the fallback.
data_dir = os.environ.get("MOVIELENS_DATA_DIR", "C:/Users/frede/movielens_v2/")

#### Load the MovieLens dataset

In [38]:
# DBpedia enrichment: the "dbpedia_content" column is stored as JSON text,
# so decode every cell into a Python object.
df_dbpedia = pd.read_csv(os.path.join(data_dir, "dbpedia.csv"))
df_dbpedia["dbpedia_content"] = df_dbpedia["dbpedia_content"].map(json.loads)

# Movies: rewrite the pipe-separated genre list as a comma-separated one.
df_movie = pd.read_csv(os.path.join(data_dir, "movies.csv"))
df_movie["genres"] = df_movie["genres"].map(
    lambda genre_list: ",".join(genre_list.split("|"))
)

# Ratings and users are loaded unchanged.
df_rating = pd.read_csv(os.path.join(data_dir, "ratings.csv"))
df_user = pd.read_csv(os.path.join(data_dir, "users.csv"))

### Elasticsearch provides a RESTful API, so it can be used from any language
- For instance, you can do `curl -XGET http://localhost:9200/` to get information about the node
- To create an index, you can use:
    - curl -XPUT http://localhost:9200/movies/ -H "Content-Type: application/json" -d '{
            "mappings": {
                "movie": {
                    "properties": {
                        "title": {
                           "type": "text",
                           "analyzer": "whitespace",
                           "term_vector": "yes"
                       }
                    }
                }
           }
       }'
- It is easier to use the Python `elasticsearch` library

In [4]:
import elasticsearch # Imports the library

In [5]:
es = elasticsearch.Elasticsearch() # Client for an Elasticsearch node; no hosts are given, so the library's default connection settings apply

In [6]:
# Ask the node whether an index named "movies" already exists.
es.indices.exists("movies")

True

In [7]:
# Start from a clean slate: drop the "movies" index if a previous run left one behind.
if es.indices.exists(index="movies"):
    es.indices.delete(index="movies")

#### Let's load some data. Movie by movie

In [8]:
# Join the DBpedia content onto the movie metadata (title, genres) by movie_id.
df_dbpedia_merged = pd.merge(
    df_dbpedia[["movie_id", "dbpedia_content"]],
    df_movie,
    on="movie_id",
)

In [9]:
def load_n_movies(n):
    """Index the first ``n`` rows of ``df_dbpedia_merged``, one request per movie.

    For each row a progress line is printed, then a document holding the
    movie's abstract, title and genres is sent to the "movies" index
    (document type "movie") under the movie's ``movie_id``.

    Parameters
    ----------
    n : int
        Number of leading rows of ``df_dbpedia_merged`` to index.
    """
    for _, row in df_dbpedia_merged.head(n).iterrows():
        movie_id = row["movie_id"]
        print("Loading movie {}:{}".format(movie_id, row["title"]))
        document = {
            "abstract": row["dbpedia_content"]["abstract"],
            "title": row["title"],
            "genres": row["genres"],
        }
        es.index(index="movies", body=document, doc_type="movie", id=movie_id)

In [10]:
# Index only the first five merged rows, issuing one request per movie.
load_n_movies(5)

Loading movie 2:Jumanji (1995)
Loading movie 3:Grumpier Old Men (1995)
Loading movie 4:Waiting to Exhale (1995)
Loading movie 5:Father of the Bride Part II (1995)
Loading movie 6:Heat (1995)


In [11]:
# Fetch the document with id 2 from the "movies" index. The same document is
# reachable over REST at http://localhost:9200/movies/movie/2
es.get("movies", doc_type="movie", id=2)

{'_id': '2',
 '_index': 'movies',
 '_source': {'abstract': "Jumanji is a 1995 American family adventure film directed by Joe Johnston. It is an adaptation of the 1981 children's book of the same name by Chris Van Allsburg. The film was written by Greg Taylor, Jonathan Hensleigh, and Jim Strain, and stars Robin Williams, Kirsten Dunst, Bradley Pierce, David Alan Grier, Bonnie Hunt, Jonathan Hyde, and Bebe Neuwirth. The special effects were provided by Industrial Light & Magic for computer graphic elements and Amalgamated Dynamics for animatronics components. The film was dedicated to visual effects supervisor Stephen L. Price, who died before the film's release. The story centers on young Alan Parrish, who becomes trapped in a board game while playing with his best friend Sarah Whittle in 1969. Twenty-six years later in 1995, siblings Judy and Peter Shepherd find the game, begin playing and then unwittingly release the now-adult Alan. After tracking down Sarah, the quartet resolve to fi

#### This is slow, as it will make one HTTP request for each movie

In [12]:
from elasticsearch import helpers

In [13]:
def fast_load(bulk_size = 10000):
    """Index every row of ``df_dbpedia_merged`` into "movies" using bulk requests.

    One bulk action is built per row (abstract, title and genres, stored under
    the row's ``movie_id``). Actions are buffered and handed to
    ``helpers.bulk`` once ``bulk_size`` of them have accumulated; whatever is
    left in the buffer is sent after the loop.

    Parameters
    ----------
    bulk_size : int, default 10000
        Number of buffered actions that triggers a bulk request.

    Notes
    -----
    A row whose document cannot be built (for example because its DBpedia
    content has no "abstract" entry) is reported with ``print`` and skipped.
    Errors raised by ``helpers.bulk`` are deliberately *not* caught: previously
    they were printed by the same catch-all handler, which left the failed
    batch in the buffer and hid an incomplete index from the reader.
    """
    tasks = []
    for _, movie in df_dbpedia_merged.iterrows():
        # Only document construction is best-effort; a bad row must not stop the load.
        try:
            my_movie = {
                "abstract": movie["dbpedia_content"]["abstract"],
                "title": movie["title"],
                "genres": movie["genres"],
            }
            to_add = {
                "_index": "movies",
                "_type": "movie",
                "_source": my_movie,
                "_id": movie["movie_id"],
            }
        except Exception as ex:
            print(str(ex))
            continue

        tasks.append(to_add)
        # ">=" rather than "% == 0": the buffer is flushed as soon as it is full.
        if len(tasks) >= bulk_size:
            helpers.bulk(es, tasks)
            tasks = []

    # Send the remainder, if any; nothing to do when the last batch was exactly full.
    if tasks:
        helpers.bulk(es, tasks)

#### What happens if we insert a document using an id that already exists?

In [14]:
# Bulk-index every merged row; ids 2-6 were already indexed above by load_n_movies(5).
fast_load()

In [15]:
# Fetch document 2 again, now that the bulk load has re-sent it under the same id.
es.get("movies", doc_type="movie", id=2)

{'_id': '2',
 '_index': 'movies',
 '_source': {'abstract': "Jumanji is a 1995 American family adventure film directed by Joe Johnston. It is an adaptation of the 1981 children's book of the same name by Chris Van Allsburg. The film was written by Greg Taylor, Jonathan Hensleigh, and Jim Strain, and stars Robin Williams, Kirsten Dunst, Bradley Pierce, David Alan Grier, Bonnie Hunt, Jonathan Hyde, and Bebe Neuwirth. The special effects were provided by Industrial Light & Magic for computer graphic elements and Amalgamated Dynamics for animatronics components. The film was dedicated to visual effects supervisor Stephen L. Price, who died before the film's release. The story centers on young Alan Parrish, who becomes trapped in a board game while playing with his best friend Sarah Whittle in 1969. Twenty-six years later in 1995, siblings Judy and Peter Shepherd find the game, begin playing and then unwittingly release the now-adult Alan. After tracking down Sarah, the quartet resolve to fi

#### Elasticsearch will create an index mapping automatically

In [16]:
# Retrieve the index definition (aliases, mappings, settings) for "movies".
es.indices.get("movies")

{'movies': {'aliases': {},
  'mappings': {'movie': {'properties': {'abstract': {'fields': {'keyword': {'ignore_above': 256,
        'type': 'keyword'}},
      'type': 'text'},
     'genres': {'fields': {'keyword': {'ignore_above': 256,
        'type': 'keyword'}},
      'type': 'text'},
     'title': {'fields': {'keyword': {'ignore_above': 256, 'type': 'keyword'}},
      'type': 'text'}}}},
  'settings': {'index': {'creation_date': '1520250808351',
    'number_of_replicas': '1',
    'number_of_shards': '5',
    'provided_name': 'movies',
    'uuid': 'X65E2qUjQUe3LMi9TOnTUg',
    'version': {'created': '6020299'}}}}}

#### Let's change the mapping. Every time you change a mapping, you have to re-index all the documents

In [17]:
# Drop the index so it can be recreated below with an explicit mapping.
es.indices.delete("movies")

{'acknowledged': True}

In [18]:
# Recreate "movies" with an explicit mapping: title and abstract use the
# "my_english" analyzer, genres use "genre_analyzer", and all three fields
# store term vectors.
es.indices.create(
    index="movies",
    body={
        "mappings": {
            "movie": {
                "_all": {"enabled": False},
                "_source": {"enabled": True},
                "properties": {
                    "title": {
                        "type": "text",
                        "analyzer": "my_english",
                        "term_vector": "yes",
                    },
                    "genres": {
                        "type": "text",
                        "analyzer": "genre_analyzer",
                        "term_vector": "yes",
                    },
                    "abstract": {
                        "type": "text",
                        "analyzer": "my_english",
                        "term_vector": "yes",
                    },
                },
            }
        },
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_english": {
                        # NOTE(review): with "type": "english" the "tokenizer" key
                        # is presumably ignored ("english" does not look like a
                        # tokenizer name) -- confirm before removing it.
                        "tokenizer": "english",
                        "type": "english",
                        "stopwords": "_english_",
                    },
                    "genre_analyzer": {
                        "tokenizer": "genre_tokenizer",
                    },
                },
                "tokenizer": {
                    # Splits the genres string on commas.
                    "genre_tokenizer": {
                        "type": "pattern",
                        "pattern": ",",
                    },
                },
            },
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 1,
            },
        },
    },
)

{'acknowledged': True, 'index': 'movies', 'shards_acknowledged': True}

In [19]:
# Read the index definition back to check the explicit mapping and analysis settings.
es.indices.get("movies")

{'movies': {'aliases': {},
  'mappings': {'movie': {'_all': {'enabled': False},
    'properties': {'abstract': {'analyzer': 'my_english',
      'term_vector': 'yes',
      'type': 'text'},
     'genres': {'analyzer': 'genre_analyzer',
      'term_vector': 'yes',
      'type': 'text'},
     'title': {'analyzer': 'my_english',
      'term_vector': 'yes',
      'type': 'text'}}}},
  'settings': {'index': {'analysis': {'analyzer': {'genre_analyzer': {'tokenizer': 'genre_tokenizer'},
      'my_english': {'stopwords': '_english_',
       'tokenizer': 'english',
       'type': 'english'}},
     'tokenizer': {'genre_tokenizer': {'pattern': ',', 'type': 'pattern'}}},
    'creation_date': '1520250810295',
    'number_of_replicas': '1',
    'number_of_shards': '1',
    'provided_name': 'movies',
    'uuid': 'dV7FiKaQSfuZVEuRUbl9-w',
    'version': {'created': '6020299'}}}}}

In [20]:
# Re-index all movies into the freshly created index.
fast_load()

In [21]:
# Term vectors of document 2, requested together with term statistics and positions.
terms = es.termvectors(
    index="movies",
    doc_type="movie",
    id=2,
    params={"term_statistics": "true", "positions": "true"},
)

### This will return the stats of the shard containing the document
#### doc_freq: document frequency. The number of documents a term appears in. Computed per field.
#### term_freq: term frequency. The number of times a term appears in a field in one specific document.
#### ttf: total term frequency. The number of times this term appears in all documents, that is, the sum of tf over all documents. Computed per field.

In [22]:
# Display the full term-vector response.
terms

{'_id': '2',
 '_index': 'movies',
 '_type': 'movie',
 '_version': 1,
 'found': True,
 'term_vectors': {'abstract': {'field_statistics': {'doc_count': 3261,
    'sum_doc_freq': 212022,
    'sum_ttf': 260644},
   'terms': {'10th': {'doc_freq': 13, 'term_freq': 1, 'ttf': 13},
    '15': {'doc_freq': 55, 'term_freq': 1, 'ttf': 60},
    '1969': {'doc_freq': 35, 'term_freq': 1, 'ttf': 37},
    '1981': {'doc_freq': 78, 'term_freq': 1, 'ttf': 95},
    '1995': {'doc_freq': 334, 'term_freq': 4, 'ttf': 415},
    '2005': {'doc_freq': 57, 'term_freq': 1, 'ttf': 60},
    '262,797,249': {'doc_freq': 1, 'term_freq': 1, 'ttf': 1},
    '65': {'doc_freq': 5, 'term_freq': 1, 'ttf': 5},
    'adapt': {'doc_freq': 459, 'term_freq': 2, 'ttf': 555},
    'addit': {'doc_freq': 86, 'term_freq': 1, 'ttf': 92},
    'adult': {'doc_freq': 31, 'term_freq': 1, 'ttf': 31},
    'adventur': {'doc_freq': 191, 'term_freq': 1, 'ttf': 233},
    'after': {'doc_freq': 476, 'term_freq': 1, 'ttf': 590},
    'alan': {'doc_freq': 10

In [23]:
# Narrow the response down to the term vectors of the "genres" field.
terms["term_vectors"]["genres"]

{'field_statistics': {'doc_count': 3266,
  'sum_doc_freq': 5516,
  'sum_ttf': 5516},
 'terms': {'Adventure': {'doc_freq': 256, 'term_freq': 1, 'ttf': 256},
  "Children's": {'doc_freq': 236, 'term_freq': 1, 'ttf': 236},
  'Fantasy': {'doc_freq': 63, 'term_freq': 1, 'ttf': 63}}}

## Searching 

#### Querying

In [24]:
# Match query on "genres", the field mapped with the comma-splitting genre_analyzer.
es.search(
    index="movies",
    body={"query": {"match": {"genres": "Sci-Fi"}}},
)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '680',
    '_index': 'movies',
    '_score': 3.1359875,
    '_source': {'abstract': "Alphaville: une étrange aventure de Lemmy Caution (Alphaville: A Strange Adventure of Lemmy Caution) is a 1965 French science fiction noir film directed by Jean-Luc Godard. It stars Eddie Constantine, Anna Karina, Howard Vernon and Akim Tamiroff. The film won the Golden Bear award of the 15th Berlin International Film Festival in 1965. Alphaville combines the genres of dystopian science fiction and film noir. There are no special props or futuristic sets; instead, the film was shot in real locations in Paris, the night-time streets of the capital becoming the streets of Alphaville, while modernist glass and concrete buildings (that in 1965 were new and strange architectural designs) represent the city's interiors. The film is set in the future but the characters also refer to twentieth century events; for e

In [25]:
# Match query on the "title" field.
es.search(
    index="movies",
    body={"query": {"match": {"title": "jumanji"}}},
)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '2',
    '_index': 'movies',
    '_score': 9.0000305,
    '_source': {'abstract': "Jumanji is a 1995 American family adventure film directed by Joe Johnston. It is an adaptation of the 1981 children's book of the same name by Chris Van Allsburg. The film was written by Greg Taylor, Jonathan Hensleigh, and Jim Strain, and stars Robin Williams, Kirsten Dunst, Bradley Pierce, David Alan Grier, Bonnie Hunt, Jonathan Hyde, and Bebe Neuwirth. The special effects were provided by Industrial Light & Magic for computer graphic elements and Amalgamated Dynamics for animatronics components. The film was dedicated to visual effects supervisor Stephen L. Price, who died before the film's release. The story centers on young Alan Parrish, who becomes trapped in a board game while playing with his best friend Sarah Whittle in 1969. Twenty-six years later in 1995, siblings Judy and Peter Shepherd find the g

In [30]:
# Match query with two words on the "abstract" field.
es.search(
    index="movies",
    body={"query": {"match": {"abstract": "zombies aliens"}}},
)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '3696',
    '_index': 'movies',
    '_score': 11.877124,
    '_source': {'abstract': 'Night of the Creeps is a 1986 American science fiction horror comedy film written and directed by Fred Dekker, starring Tom Atkins, Jason Lively, Steve Marshall and Jill Whitlow. The film is an earnest attempt at a B movie and a homage to the genre. While the main plot of the film is related to zombies, the film also mixes in takes on slashers and alien invasion films. Night of the Creeps did not perform well at the box office, but it developed a cult following.',
     'genres': 'Comedy,Horror,Sci-Fi',
     'title': 'Night of the Creeps (1986)'},
    '_type': 'movie'},
   {'_id': '1983',
    '_index': 'movies',
    '_score': 9.704902,
    '_source': {'abstract': "Halloween II is a 2009 American slasher film written, directed, and produced by Rob Zombie. The film is a sequel to Zombie's 2007 remake of 1978'

In [35]:
# Ask Elasticsearch to explain how document 3696 is scored for the same
# "zombies aliens" match query on the abstract field.
# See https://www.elastic.co/guide/en/elasticsearch/reference/current/index-modules-similarity.html
# and https://www.elastic.co/guide/en/elasticsearch/guide/current/scoring-theory.html for further details
es.explain(
    index="movies",
    doc_type="movie",
    id=3696,
    body={"query": {"match": {"abstract": "zombies aliens"}}},
)

{'_id': '3696',
 '_index': 'movies',
 '_type': 'movie',
 'explanation': {'description': 'sum of:',
  'details': [{'description': 'weight(abstract:zombi in 15) [PerFieldSimilarity], result of:',
    'details': [{'description': 'score(doc=15,freq=1.0 = termFreq=1.0\n), product of:',
      'details': [{'description': 'idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:',
        'details': [{'description': 'docFreq', 'details': [], 'value': 9.0},
         {'description': 'docCount', 'details': [], 'value': 3261.0}],
        'value': 5.838804},
       {'description': 'tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:',
        'details': [{'description': 'termFreq=1.0',
          'details': [],
          'value': 1.0},
         {'description': 'parameter k1', 'details': [], 'value': 1.2},
         {'description': 'parameter b', 'details': [], 'value': 0.75},
         {'description': 'avgFieldLength', 'details': [

#### Searching for phrases

In [32]:
# Phrase query on the "title" field.
es.search(
    index="movies",
    body={"query": {"match_phrase": {"title": "star wars"}}},
)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '260',
    '_index': 'movies',
    '_score': 7.1204076,
    '_source': {'abstract': 'Star Wars (later retitled Star Wars: Episode IV – A New Hope) is a 1977 American epic space opera film written and directed by George Lucas. The first installment in the original Star Wars trilogy, it stars Mark Hamill, Harrison Ford, Carrie Fisher, Peter Cushing, and Alec Guinness. David Prowse, James Earl Jones, Anthony Daniels, Kenny Baker and Peter Mayhew co-star in supporting roles. The plot focuses on the Rebel Alliance, led by Princess Leia (Fisher), and its attempt to destroy the Galactic Empire\'s space station, the Death Star. This conflict disrupts the isolated life of farmhand Luke Skywalker (Hamill) who inadvertently acquires a pair of droids that possess stolen architectural plans for the Death Star. When the Empire begins a destructive search for the missing droids, Skywalker accompanies Jedi

#### Searching for Similar movies, by content

#### More like this

In [33]:
# "More like this" query: find movies whose abstract resembles that of the
# indexed document with id 1196, using at most 10 query terms.
mlts = es.search(
    index="movies",
    doc_type="movie",
    body={
        "query": {
            "more_like_this": {
                "fields": ["abstract"],
                "like": [
                    {"_index": "movies", "_type": "movie", "_id": 1196},
                ],
                "max_query_terms": 10,
                "min_term_freq": 1,
            }
        }
    },
)

In [34]:
# Print the title of each returned hit, one per line, in the order the hits came back.
for movie in mlts["hits"]["hits"]:
    print(movie["_source"]["title"])

Star Wars: Episode IV - A New Hope (1977)
Star Wars: Episode VI - Return of the Jedi (1983)
Jaws 2 (1978)
Body Heat (1981)
E.T. the Extra-Terrestrial (1982)
Pale Rider (1985)
Little Big League (1994)
King Kong (1976)
Back to the Future Part III (1990)
Best Years of Our Lives, The (1946)


## Cool, you built a movies recommender system 😎