In [1]:
from skllm.config import SKLLMConfig
from time import sleep
import pandas as pd
import utils_config

In [2]:
# Read the project settings from the local JSON file, then register the
# OpenAI organization ID and API key with SKLLMConfig.
config = utils_config.load_config('./config.json')
SKLLMConfig.set_openai_org(config.OPENAI_ORG_ID)
SKLLMConfig.set_openai_key(config.OPENAI_KEY)

## Use GPT-3.5 to build the model

In [3]:
from skllm import ZeroShotGPTClassifier
from skllm.datasets import get_classification_dataset

# Fetch the demo classification dataset (X: phrases, y: their true labels)
X, y = get_classification_dataset()

# Build the zero-shot classifier on top of the gpt-3.5-turbo model
model = ZeroShotGPTClassifier(openai_model="gpt-3.5-turbo")

# Fit the classifier on the demo phrases and labels
model.fit(X, y)

# Predict each phrase with its own request. The pause is presumably there to
# stay under the OpenAI rate limit (TODO confirm), so it is only needed
# between two requests -- not after the final one.
REQUEST_PAUSE_SECONDS = 20
labels = []
for i, x in enumerate(X):
    if i > 0:
        sleep(REQUEST_PAUSE_SECONDS)
    # predict() is given a one-element batch; keep its first (only) result
    labels.append(model.predict([x])[0])

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [00:01<00:00,  1.03s/it]
100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
100%|██████████| 1/1 [00:00<00:00,  1.41it/s]
100%|██████████| 1/1 [00:00<00:00,  1.66it/s]
100%|██████████| 1/1 [00:01<00:00,  1.15s/it]
100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
100%|██████████| 1/1 [00:01<00:00,  1.08s/it]
100%|██████████| 1/1 [00:00<00:00,  1.16it/s]
100%|██████████| 1/1 [00:00<00:00,  1.12it/s]
100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
100%|██████████| 1/1 [00:00<00:00,  1.72it/s]
100%|██████████| 1/1 [00:00<00:00,  1.01it/s]
100%|██████████| 1/1 [00:01<00:00,  1.01s/it]
100%|██████████| 1/1 [00:00<00:00,  1.28it/s]
100%|██████████| 1/1 [00:01<00:00,  1.23s/it]
100%|██████████| 1/1 [00:00<00:00,  1.15it/s]
100%|██████████| 1/1 [00:00<00:00,  1.02it/s]
100%|██████████| 1/1 [00:00<00:00,  1.54it/s]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
100%|██████████| 1/1 [00:00<00:00,  1.33it/s]
100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
100%|██████████| 1/1 [00:00<00:00,

In [4]:
# Show every phrase next to its true label and the model's prediction
pd.DataFrame(
    {
        'phrase': X,
        'true_label': y,
        'predicted': labels,
    }
)

Unnamed: 0,phrase,true_label,predicted
0,I was absolutely blown away by the performance...,positive,positive
1,The special effects in 'Star Battles: Nebula C...,positive,positive
2,'The Lost Symphony' was a masterclass in chara...,positive,positive
3,I was pleasantly surprised by 'Love in the Tim...,positive,positive
4,I went into 'Marble Street' with low expectati...,positive,positive
5,'The Great Plains' is a touching portrayal of ...,positive,positive
6,The screenwriting in 'Under the Willow Tree' w...,positive,positive
7,'Nightshade' is a brilliant take on the superh...,positive,positive
8,The cinematography in 'Awakening' was nothing ...,positive,positive
9,'Eternal Embers' was a cinematic delight. The ...,positive,positive


## Apply the classifier to the Rotten Tomatoes reviews database

In [5]:
# Load the raw critic-reviews table and the movies table from ../Data/Raw
reviews = pd.read_csv('../Data/Raw/rotten_tomatoes_critic_reviews.csv')
movies = pd.read_csv('../Data/Raw/rotten_tomatoes_movies.csv')

In [6]:
# Shrink the reviews table and persist the result.
# NOTE: this overwrites `reviews`; afterwards the link column is gone, so the
# cell cannot be re-run without first re-running the cell that loads the CSVs.
reviews = (
    reviews[['rotten_tomatoes_link', 'review_content', 'review_score']]
    # discard rows with a missing link, review text, or score
    .dropna()
    # keep only the first remaining review for each movie link
    .drop_duplicates(subset='rotten_tomatoes_link', keep='first', ignore_index=True)
    # attach the movie columns by link, then keep just title, text and score
    .merge(movies, on='rotten_tomatoes_link')
    [['movie_title', 'review_content', 'review_score']]
)
reviews.to_csv('../Data/Filtered/rotten_tomatoes_reviews.csv')

In [7]:
# Sample three reviews for classification. The fixed seed makes the same rows
# come back on every run, so the table shown below is reproducible.
sample_3 = reviews.sample(3, random_state=42)

# Predict: one request per review, keeping the single label each call returns
labels = [model.predict([x])[0] for x in sample_3['review_content']]

# Print the results
sample_3['sentiment'] = labels
sample_3

100%|██████████| 1/1 [00:01<00:00,  1.05s/it]
100%|██████████| 1/1 [00:00<00:00,  1.42it/s]
100%|██████████| 1/1 [00:00<00:00,  1.36it/s]


Unnamed: 0,movie_title,review_content,review_score,sentiment
4622,Civil Brand,Muckraking mess of a movie.,2.5/5,negative
1901,2016: Obama's America,Not interviewed by the filmmakers are Obama's ...,2.5/5,neutral
7322,Guilty by Suspicion,It teaches a lesson we are always in danger of...,3.5/4,positive
