In [1]:

# Let's first install Cohere's Python SDK (uncomment the lines below to run the install)
# %pip install cohere
# %pip install python-dotenv

In [2]:
import cohere
import pandas as pd
import requests
import datetime
from tqdm import tqdm
# Do not truncate long cell contents (e.g. full post titles) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

import os
from dotenv import load_dotenv

# load_dotenv comes from python-dotenv; it is expected to load variables from a
# local .env file into the process environment before the key is read below.
load_dotenv()
# The API key is read from the COHERE_API_KEY environment variable rather than
# pasted into the notebook; os.getenv returns None when the variable is not set.
# Remember not to share the key publicly.
api_key = os.getenv('COHERE_API_KEY')

In [3]:
def get_post_titles(**kwargs):
    """Fetch submission titles from the Pushshift Reddit search API.

    Read more: https://github.com/pushshift/api

    Args:
        **kwargs: Query-string parameters forwarded unchanged to the
            submission search endpoint (this notebook passes size, after,
            before, subreddit, sort_type and sort).

    Returns:
        list: The 'title' field of every item under the 'data' key of the
        JSON response, in the order the API returned them.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not respond within 30 seconds.
    """
    # NOTE(review): availability of this public endpoint should be confirmed
    # before relying on it.
    base_url = "https://api.pushshift.io/reddit/search/submission/"
    # A timeout keeps the cell from hanging forever on an unresponsive API.
    response = requests.get(base_url, params=kwargs, timeout=30)
    # Surface HTTP errors here instead of failing later with an opaque
    # JSON-decoding error or KeyError.
    response.raise_for_status()
    return [post['title'] for post in response.json()['data']]

In [5]:
# Create the Cohere client that the extractor's generate() calls go through.
# An API key can be created and retrieved from os.cohere.ai.
co = cohere.Client(api_key)

In [6]:
# Few-shot examples as (expected_title, post_text) pairs. Element 0 is used as
# the label and element 1 as the example text when the extractor is built.
# The label "none" is used for posts that have no movie title as their target.
movie_examples = [
("Deadpool 2", "Deadpool 2 | Official HD Deadpool's \"Wet on Wet\" Teaser | 2018"),
("none", "Jordan Peele Just Became the First Black Writer-Director With a $100M Movie Debut"),
("Joker", "Joker Officially Rated “R”"),
("Free Guy", "Ryan Reynolds’ 'Free Guy' Receives July 3, 2020 Release Date - About a bank teller stuck in his routine that discovers he’s an NPC character in brutal open world game."),
("none", "James Cameron congratulates Kevin Feige and Marvel!"),
("Guardians of the Galaxy", "The Cast of Guardians of the Galaxy release statement on James Gunn"),
]

In [7]:
#@title Create the prompt (Run this cell to execute required code) {display-mode: "form"}

class cohereExtractor():
    """Few-shot entity extractor built on a Cohere text-generation model.

    The prompt is the task description followed by every labelled example and
    finally the new input. Each entry is rendered as the text, a newline, the
    example prompt and the label; entries are separated by a line containing
    '---'. The label of the final (new) entry is left empty for the model to
    complete.

    Args:
        examples: List of example input texts.
        example_labels: List with the expected extraction for each entry of
            ``examples`` (same order, same length).
        labels: Stored on the instance but not used by this class.
        task_desciption: Text placed at the very start of the prompt. (The
            misspelled name is kept so existing keyword callers keep working.)
        example_prompt: Text inserted between each input and its label.
        client: Optional object exposing a ``generate`` method compatible with
            the Cohere client. When omitted, the module-level ``co`` client is
            looked up at call time, as before.
    """

    def __init__(self, examples, example_labels, labels, task_desciption, example_prompt, client=None):
        self.examples = examples
        self.example_labels = example_labels
        self.labels = labels
        self.task_desciption = task_desciption
        self.example_prompt = example_prompt
        self.client = client

    def make_prompt(self, example):
        """Return the few-shot prompt ending with ``example`` and an empty label.

        Raises:
            ValueError: If the stored examples and labels differ in length,
                which would otherwise pair texts with the wrong labels.
        """
        if len(self.examples) != len(self.example_labels):
            raise ValueError(
                "examples and example_labels must have the same length "
                f"({len(self.examples)} != {len(self.example_labels)})")
        # Concatenation builds new lists, so the stored examples are not mutated.
        texts = self.examples + [example]
        labels = self.example_labels + [""]
        entries = [text + "\n" + self.example_prompt + label
                   for text, label in zip(texts, labels)]
        return self.task_desciption + "\n---\n".join(entries)

    def extract(self, example):
        """Ask the model to complete the prompt for ``example``.

        Generation uses the 'large' model, at most 10 tokens, temperature 0.1
        and stops at the first newline.

        Returns:
            str: The text of the first generation, without the trailing
            newline stop sequence when one is present.
        """
        client = self.client if self.client is not None else co
        extraction = client.generate(
            model='large',
            prompt=self.make_prompt(example),
            max_tokens=10,
            temperature=0.1,
            stop_sequences=["\n"])
        text = extraction.generations[0].text
        # Only drop the final character when it really is the stop sequence;
        # a generation that ends for another reason (e.g. the token limit)
        # would otherwise lose its last real character.
        return text[:-1] if text.endswith("\n") else text


# Build the movie-title extractor: element 1 of each pair in movie_examples is
# the post text, element 0 is the expected title. No task description is used.
cohereMovieExtractor = cohereExtractor(
    examples=[pair[1] for pair in movie_examples],
    example_labels=[pair[0] for pair in movie_examples],
    labels=[],
    task_desciption="",
    example_prompt="extract the movie title from the post:",
)

# Uncomment to inspect the full prompt:
# print(cohereMovieExtractor.make_prompt('<input text here>'))

In [8]:
# This is what the prompt looks like: the labelled examples separated by
# '---' lines, followed by the placeholder input with an empty label.
print(cohereMovieExtractor.make_prompt('<input text here>'))

Deadpool 2 | Official HD Deadpool's "Wet on Wet" Teaser | 2018
extract the movie title from the post:Deadpool 2
---
Jordan Peele Just Became the First Black Writer-Director With a $100M Movie Debut
extract the movie title from the post:none
---
Joker Officially Rated “R”
extract the movie title from the post:Joker
---
Ryan Reynolds’ 'Free Guy' Receives July 3, 2020 Release Date - About a bank teller stuck in his routine that discovers he’s an NPC character in brutal open world game.
extract the movie title from the post:Free Guy
---
James Cameron congratulates Kevin Feige and Marvel!
extract the movie title from the post:none
---
The Cast of Guardians of the Galaxy release statement on James Gunn
extract the movie title from the post:Guardians of the Galaxy
---
<input text here>
extract the movie title from the post:


In [9]:
num_posts = 10

# Query window: 2021-01-01 up to 2022-01-01. The bounds are timezone-aware UTC
# datetimes because .timestamp() on a naive datetime is interpreted in the
# machine's local timezone, which would shift the window between machines.
window_start = datetime.datetime(2021, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)
window_end = datetime.datetime(2022, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)

# Request the posts from the "movies" subreddit, sorted by score (descending).
movies_list = get_post_titles(
    size=num_posts,
    after=str(int(window_start.timestamp())),
    before=str(int(window_end.timestamp())),
    subreddit="movies",
    sort_type="score",
    sort="desc",
)

# Show the list
movies_list

['Hayao Miyazaki Got So Bored with Retirement He Started Directing Again ‘in Order to Live’',
 "First poster for Pixar's Luca",
 'New images from Space Jam: A New Legacy',
 'Official Poster for "Sonic the Hedgehog 2"',
 'Ng Man Tat, legendary HK actor and frequent collborator of Stephen Chow (Shaolin Soccer, God of Gambler) died at 70',
 'Zack Snyder’s Justice League has officially been Rated R for for violence and some language',
 'HBOMax and Disney+ NEED to improve their apps if they want to compete with Netflix.',
 'I want a sequel to Rat Race where John Cleese’s character dies and invites everyone from the first film to his funeral, BUT, he’s secretly set up a Rat Maze to trap them all in. A sort of post-mortem revenge on them for donating all his wealth to charity.',
 "'Trainspotting' at 25: How an Indie Film About Heroin Became a Feel-Good Classic",
 '‘Avatar: The Last Airbender’ Franchise To Expand With Launch Of Nickelodeon’s Avatar Studios, Animated Theatrical Film To Start Pr

In [10]:


# Extract a title for every post. `results` must stay the same length as
# `movies_list`, since both are used as columns of one DataFrame afterwards.
results = []
for text in tqdm(movies_list):
    try:
        extracted_text = cohereMovieExtractor.extract(text)
    except Exception as e:
        print('ERROR: ', e)
        # Keep a placeholder so a failed call neither shortens the list nor
        # shifts later results onto the wrong post.
        extracted_text = None
    results.append(extracted_text)

100%|██████████| 10/10 [00:16<00:00,  1.61s/it]


In [11]:

# Show each post title next to the title extracted from it.
pd.DataFrame(data={'text': movies_list, 'extracted_text': results})

Unnamed: 0,text,extracted_text
0,Hayao Miyazaki Got So Bored with Retirement He Started Directing Again ‘in Order to Live’,none
1,First poster for Pixar's Luca,Luca
2,New images from Space Jam: A New Legacy,Space Jam: A New Legacy
3,"Official Poster for ""Sonic the Hedgehog 2""",Sonic the Hedgehog 2
4,"Ng Man Tat, legendary HK actor and frequent collborator of Stephen Chow (Shaolin Soccer, God of Gambler) died at 70",none
5,Zack Snyder’s Justice League has officially been Rated R for for violence and some language,Justice League
6,HBOMax and Disney+ NEED to improve their apps if they want to compete with Netflix.,none
7,"I want a sequel to Rat Race where John Cleese’s character dies and invites everyone from the first film to his funeral, BUT, he’s secretly set up a Rat Maze to trap them all in. A sort of post-mortem revenge on them for donating all his wealth to charity.",Rat Race
8,'Trainspotting' at 25: How an Indie Film About Heroin Became a Feel-Good Classic,Trainspotting
9,"‘Avatar: The Last Airbender’ Franchise To Expand With Launch Of Nickelodeon’s Avatar Studios, Animated Theatrical Film To Start Production Later This Year",none


In [12]:
# Load the labelled test set from the cohere-ai/notebooks repository; the
# first CSV column is used as the DataFrame index.
test_df = pd.read_csv('https://raw.githubusercontent.com/cohere-ai/notebooks/main/notebooks/data/movie_extraction_test_set_100.csv',index_col=0)
test_df


Unnamed: 0,text,label
0,Disney's streaming service loses some movies due to old licensing deals,none
1,"Hi, I’m Sam Raimi, producer of THE GRUDGE which hits theaters tonight. Ask Me Anything!",The Grudge
2,'Parasite' Named Best Picture by Australia's AACTA Awards,Parasite
3,Danny Trejo To Star In Vampire Spaghetti Western ‘Death Rider in the House of Vampires’,Death Rider in the House of Vampires
4,I really wish the 'realistic' CGI animal trend would end.,none
...,...,...
95,Hair Love | Oscar Winning Short Film (Full),Hair Love
96,First image of Jason Alexander in Christian film industry satire 'Faith Based',Faith Based
97,"'Borderlands' Movie in the Works From Eli Roth, Lionsgate",Borderlands
98,"Taika Waititi putting his Oscar ""away"" after winning best adapted screenplay for JOJO RABBIT",Jojo Rabbit


In [13]:
from concurrent.futures import ThreadPoolExecutor

# Run the model to extract the entities, with up to 8 requests in flight.
# executor.map yields results in input order, so they line up with the rows
# of test_df.
extracted = []
with ThreadPoolExecutor(max_workers=8) as executor:
    extracted.extend(
        str(raw).strip()
        for raw in executor.map(cohereMovieExtractor.extract, test_df['text'])
    )
# Save results
test_df['extracted_text'] = extracted

In [14]:
# Preview the first rows, now including the extracted_text column.
test_df.head()

Unnamed: 0,text,label,extracted_text
0,Disney's streaming service loses some movies due to old licensing deals,none,none
1,"Hi, I’m Sam Raimi, producer of THE GRUDGE which hits theaters tonight. Ask Me Anything!",The Grudge,none
2,'Parasite' Named Best Picture by Australia's AACTA Awards,Parasite,Parasite
3,Danny Trejo To Star In Vampire Spaghetti Western ‘Death Rider in the House of Vampires’,Death Rider in the House of Vampires,Death Rider
4,I really wish the 'realistic' CGI animal trend would end.,none,none


In [15]:
# Compare the label to the extracted text (case-insensitive exact match)
test_df['correct'] = (test_df['label'].str.lower() == test_df['extracted_text'].str.lower()).astype(int)

# Print the accuracy. One decimal place avoids float artefacts such as
# 56.99999999999999 that appear when the raw product is printed.
print(f'Classification accuracy {test_df["correct"].mean() * 100:.1f}%')

Classification accuracy 67.0%


In [16]:
# Inspect the rows where the extracted text did not match the label.
test_df[test_df['correct']==0]

Unnamed: 0,text,label,extracted_text,correct
1,"Hi, I’m Sam Raimi, producer of THE GRUDGE which hits theaters tonight. Ask Me Anything!",The Grudge,none,0
3,Danny Trejo To Star In Vampire Spaghetti Western ‘Death Rider in the House of Vampires’,Death Rider in the House of Vampires,Death Rider,0
6,De Niro recreating a scene from Goodfellas to test Irishman deaging (3:30 in),Goodfellas,none,0
8,Laura Dern loves both of her awards movies — Marriage Story and Little Women — equally,Marriage Story,none,0
9,A Lot of the Sound Effects in Ad Astra Were Just Tommy Lee Jones's Voice,Ad Astra,none,0
11,'Cats' Visual Woes Began Early On In Production,Cats,none,0
13,Jake Gyllenhaal to Produce &amp; Star in Movie Musical Adaptation of Fun Home,Fun Home,none,0
19,Movies Like Knives Out,Knives Out,none,0
22,"Please do yourself a favor and consider watching 'The Big Year' (2011, Jack Black, Owen Wilson, Steve Martin)",The Big Year,none,0
23,the Flintstones movie (1994) might have some of best production design in film.,The Flintstones,none,0


In [17]:
from sklearn.metrics import classification_report
from sklearn.exceptions import UndefinedMetricWarning
import warnings
# Each distinct title is its own class, so many classes have no predicted or
# no true samples and scikit-learn warns that precision/recall are undefined
# for them. Silence only that warning category instead of every warning.
# NOTE(review): if the blanket filter was also hiding other warnings on
# purpose, add those categories explicitly.
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

# Per-title precision/recall/F1, comparing lower-cased labels and extractions.
print(classification_report(test_df['label'].str.lower(), test_df['extracted_text'].str.lower()))

                                      precision    recall  f1-score   support

                                1917       1.00      1.00      1.00         2
               2001: a space odyssey       1.00      1.00      1.00         1
                            ad astra       0.00      0.00      0.00         1
     alice doesn't live here anymore       0.00      0.00      0.00         1
                       austin powers       1.00      1.00      1.00         1
                  back to the future       0.00      0.00      0.00         0
               back to the future ii       0.00      0.00      0.00         1
                        blood simple       0.00      0.00      0.00         1
                   bohemian rhapsody       0.00      0.00      0.00         1
                         borderlands       1.00      1.00      1.00         1
                     brief encounter       1.00      1.00      1.00         1
                                cats       0.00      0.00      