In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Lift pandas' display truncation: show full cell contents and every row.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

In [3]:
# Script-level data, read from a local Feather file.
vectorised_data_df = pd.read_feather('vectorised_data_df.feather') 
# Review-level data, read from a local HDF5 file; only the first 20 columns are kept.
reviews_df = pd.read_hdf('causal_review_df_new_independent2.5.h5').iloc[:, : 20]

In [4]:
# Row count of the script-level frame before matching against reviews.
len(vectorised_data_df)

863

In [5]:
# Build {script title -> DataFrame of that title's reviews}.
#
# Titles are matched against reviews_df['movie_title'] verbatim first. Script
# titles written as "Name, The" fall back to the "The Name" spelling. Titles
# with no reviews under either spelling are left out of the dict.

# Review columns kept for every matched title (same order for both spellings).
REVIEW_COLUMNS = [
    'individual_meta_score',
    'text',
    'universal communicability',
    'a sheer personal preference based on desire or inclination',
    'positivity in sentiment',
    'a sheer personal dispreference based on an absence of desire and inclination',
    'allows for no one to disagree',
]
ARTICLE_SUFFIX = ', The'

# Group once instead of re-scanning the whole review frame for every title.
grouped_reviews = reviews_df.groupby('movie_title')
reviewed_titles = set(grouped_reviews.groups)

title_reviews_dict = {}
for title in vectorised_data_df['title'].tolist():
    lookup_title = title
    if (
        lookup_title not in reviewed_titles
        and isinstance(title, str)
        and title.endswith(ARTICLE_SUFFIX)
    ):
        # "Name, The" -> "The Name"
        lookup_title = 'The ' + title[:-len(ARTICLE_SUFFIX)]
    if lookup_title in reviewed_titles:
        # .copy() gives each title an independent frame, so columns added to it
        # later do not write into (or warn about) a slice of reviews_df.
        title_reviews_dict[title] = grouped_reviews.get_group(lookup_title)[REVIEW_COLUMNS].copy()


In [6]:
# Keep only the scripts that have a matched review frame. The explicit .copy()
# makes script_review_df an independent frame, so adding the 'reviews' column
# below no longer raises SettingWithCopyWarning.
script_review_df = vectorised_data_df[vectorised_data_df['title'].isin(list(title_reviews_dict))].copy()
# Each cell of 'reviews' holds the per-title review DataFrame.
script_review_df['reviews'] = script_review_df['title'].map(title_reviews_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  script_review_df['reviews'] = script_review_df['title'].map(title_reviews_dict)


In [7]:
# Number of scripts whose title was matched to a non-empty set of reviews.
len(script_review_df)

776

In [8]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import gc

# Release anything left over from a previous run before loading the model.
gc.collect()
torch.cuda.empty_cache()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
on_gpu = device.type == 'cuda'

# Initialize tokenizer and model.
# Half precision and FlashAttention-2 are GPU-only choices, so the CPU fallback
# uses float32 with the default ("eager") attention. On GPU the weights are
# loaded directly as float16: loading as bfloat16 and then calling .half()
# ends in the same dtype but discards mantissa bits on the way.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained(
    'distilbert-base-uncased',
    torch_dtype=torch.float16 if on_gpu else torch.float32,
    attn_implementation="flash_attention_2" if on_gpu else "eager",
).to(device)

model.eval()  # Inference only; returns the model, so the cell still displays it

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertFlashAttention2(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2):

In [9]:
def encode(text, model, tokenizer, device):
    """Embed `text` as the model's hidden state at the first token position.

    Args:
        text: Input passed straight to `tokenizer` (truncated to 512 tokens).
        model: Callable taking the tokenizer's tensors as keyword arguments and
            returning an object with a `last_hidden_state` attribute.
        tokenizer: Callable returning a mapping of name -> tensor.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        `last_hidden_state[:, 0, :]` with all size-1 dimensions squeezed out,
        computed without gradient tracking.
    """
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    with torch.no_grad():
        hidden_states = model(**on_device).last_hidden_state
    # Position 0 is the first token ([CLS]) of every sequence.
    first_token_embedding = hidden_states[:, 0, :]
    return first_token_embedding.squeeze()

In [10]:
def encode_reviews(row, model, tokenizer, device):
    """Embed every review text of one script row.

    Each entry of `row['reviews']['text']` is passed through `encode`; the
    resulting vectors are stored, as NumPy arrays, in a new 'review_vector'
    column written directly onto the frame held in `row['reviews']`.

    Args:
        row: Row whose 'reviews' entry is a DataFrame with a 'text' column.
        model, tokenizer, device: Forwarded unchanged to `encode`.

    Returns:
        The same `row`, with its reviews frame extended in place.
    """
    reviews = row['reviews']
    embeddings = []
    for review_text in reviews['text']:
        embedding = encode(review_text, model, tokenizer, device)
        # Keep results on the CPU so GPU memory only ever holds one review.
        embeddings.append(embedding.cpu().numpy())
        torch.cuda.empty_cache()  # Hand cached, unused GPU blocks back
    reviews['review_vector'] = embeddings
    return row

In [11]:
def vector_projection(a, b):
    """Return the projection of vector `a` onto vector `b`.

    Computed as ``(a·b / b·b) * b`` with NumPy dot products.
    """
    scale = np.dot(a, b) / np.dot(b, b)
    return scale * b

In [12]:
# Register tqdm's progress_* methods on pandas objects (progress_apply is used below).
tqdm.pandas()

In [13]:
# Embed the reviews of every script, row by row, with a progress bar.
# encode_reviews adds a 'review_vector' column to each row's reviews frame.
vectorised_script_review_df = script_review_df.progress_apply(lambda x: encode_reviews(x, model, tokenizer, device), axis=1)

  0%|          | 0/776 [00:00<?, ?it/s]

In [14]:
# For every script, build a nested list indexed [line][review]. Each entry is
# the length of the projection of the unit script-line vector onto the unit
# review vector.
script_lines_projected_vectors_list = []
for _, data in tqdm(vectorised_script_review_df.iterrows(), total=len(vectorised_script_review_df)):
    # Review vectors do not depend on the script line: normalise them once per
    # script instead of once per (line, review) pair.
    normalised_review_vectors = [
        review_vector / np.linalg.norm(review_vector)
        for review_vector in data['reviews']['review_vector']
    ]
    line_projected_vectors_list = []
    # zip() keeps the original pairing (and truncation) of lines and vectors.
    for _line, line_vector in zip(data['split_script'], data['script_vectors']):
        # Likewise, normalise each line vector once rather than once per review.
        normalised_line_vector = line_vector / np.linalg.norm(line_vector)
        projected_vectors_list = [
            np.linalg.norm(vector_projection(normalised_line_vector, normalised_review_vector))
            for normalised_review_vector in normalised_review_vectors
        ]
        line_projected_vectors_list.append(projected_vectors_list)
    script_lines_projected_vectors_list.append(line_projected_vectors_list)
            

0it [00:00, ?it/s]

In [15]:
# Attach the per-script nested lists (one inner list per script line, one
# magnitude per review); they were built in the frame's row order.
vectorised_script_review_df['projected_magnitudes'] = script_lines_projected_vectors_list

In [16]:
# List the titles that remain after matching scripts to reviews.
vectorised_script_review_df['title']

0                             10 Things I Hate About You
1                                                     12
4                                       12 Years a Slave
5                                              127 Hours
6                             1492: Conquest of Paradise
8                                               17 Again
10                                 2001: A Space Odyssey
11                                                  2012
13                                    30 Minutes or Less
14                                                    42
15                                         44 Inch Chest
16                                               48 Hrs.
19                                                   8MM
20                                                     9
21                                        A Few Good Men
22                                   A Most Violent Year
23                                  A Prayer Before Dawn
24                             

In [20]:
# Pick one title to inspect. squeeze(0) collapses a single matching row into a
# Series -- assumes exactly one row is titled "Boyhood" (TODO confirm).
row = vectorised_script_review_df[vectorised_script_review_df['title'] == "Boyhood"].squeeze(0)

In [21]:
# Show the selected title's review frame, including the added 'review_vector' column.
row['reviews']

Unnamed: 0,individual_meta_score,text,universal communicability,a sheer personal preference based on desire or inclination,positivity in sentiment,a sheer personal dispreference based on an absence of desire and inclination,allows for no one to disagree,review_vector
32185,100,"There’s not a great theme, a great performance or even a great scene in Boyhood. But I think it might be a great picture.",0.053572,0.676268,0.14192,0.644752,0.005038,"[0.00279, -0.1608, 0.0283, -0.0665, -0.04617, -0.3237, 0.1119, 0.4316, -0.1204, -0.159, -0.01386, -0.06616, 0.1385, 0.3542, -0.011635, 0.1641, 0.000988, 0.06586, 0.2037, -0.1976, -0.1897, -0.06604, -0.0763, 0.0891, -0.1207, -0.2399, 0.1758, -0.06216, 0.2964, -0.01297, -0.04712, 0.08246, -0.358, -0.2517, -0.002987, -0.2147, 0.02623, -0.1981, 0.02966, -0.1804, 0.08606, 0.03528, 0.0215, -0.0769, -0.129, -0.03006, -2.355, 0.000529, 0.004738, -0.303, 0.2443, 0.05728, 0.3564, 0.3188, 0.2316, 0.2216, -0.2361, 0.4624, -0.0319, 0.11993, 0.1157, -0.0059, -0.02411, -0.11224, -0.0564, 0.0436, 0.02313, 0.1407, -0.1506, 0.2322, -0.1703, -0.1544, 0.2208, -0.3127, 0.05902, 0.04092, 0.1068, 0.1953, -0.1704, 0.0777, 0.06235, 0.3352, 0.16, 0.1697, -0.05487, 0.0923, -0.06046, -0.12427, 0.2834, 0.3892, -0.413, -0.1597, 0.0481, 0.2439, 0.514, -0.3413, 0.02121, -0.08154, 0.1975, 0.322, ...]"
32186,100,"The greatest movies, the ones that stick with us, are those that hold up a mirror to the human condition and reflect something back at us that we too often manage to overlook. Boyhood is one of those movies, and with it Linklater proves he is among the best practitioners of that art.",0.839547,0.083272,0.003778,0.005,0.34522,"[0.1609, 0.05338, -0.3076, -0.09766, -0.00289, -0.2467, 0.136, 0.2734, -0.10974, -0.2651, 0.04956, -0.075, 0.04434, 0.3257, 0.1598, 0.1052, 0.02744, 0.03342, 0.1117, -0.108, -0.355, -0.1312, 0.2212, 0.2052, -0.1902, -0.05362, -0.12256, -0.0883, 0.0898, 0.0953, 0.1797, 0.1833, -0.2698, -0.4216, -0.01662, -0.323, 0.1556, -0.114, 0.1744, 0.1788, -0.01263, 0.2727, 0.11304, -0.2415, -0.1476, -0.1993, -2.846, 0.04416, -0.2717, -0.12164, 0.1847, 0.2014, 0.108, 0.3142, 0.3977, 0.5664, -0.405, 0.391, -0.06335, 0.09296, 0.3188, 0.08673, -0.12067, -0.06696, -0.1786, 0.204, -0.1307, 0.1425, -0.1426, 0.466, -0.2169, -0.1732, 0.4236, -0.11816, 0.1648, 0.01481, 0.0405, 0.0434, -0.2008, -0.0578, 0.127, 0.3345, 0.1849, 0.3408, 0.01196, 0.235, 0.1142, 0.009, 0.1716, 0.3545, -0.348, -0.1637, 0.1139, 0.302, 0.605, -0.5513, 0.1534, -0.10095, -0.0693, 0.3691, ...]"
32187,100,The film would be incalculably different if the lead role had been divided between two or three young actors for a conventional shoot. But Linklater’s patience allows us to see a thoughtful personality being formed both on and off the screen.,0.394183,0.283687,0.35771,0.014962,0.17123,"[0.06824, -0.10034, -0.1742, 0.0049, 0.18, -0.003136, 0.2164, 0.03992, 0.04623, -0.1621, 0.1947, 0.13, 0.1467, 0.3042, 0.02832, 0.18, 0.0737, 0.09937, 0.1967, -0.286, -0.113, -0.2302, 0.11505, 0.2522, -0.1553, -0.1471, 0.0816, 0.01694, 0.1339, -0.00828, 0.1373, -0.02121, -0.365, -0.256, 0.01868, -0.224, 0.0284, 0.0674, 0.04007, -0.0984, 0.04074, 0.2534, 0.0591, -0.2156, -0.1329, -0.1533, -2.607, -0.0012045, -0.1545, -0.3066, 0.2207, 0.2217, 0.4197, 0.2388, 0.2754, 0.251, -0.2888, 0.3752, 0.007065, 0.194, 0.0414, 0.0199, 0.05862, -0.0655, -0.01811, 0.1162, -0.11475, -0.01152, -0.0534, 0.3962, -0.1936, -0.1699, 0.08527, -0.1565, -0.0625, -0.1884, 0.0382, 0.2974, -0.1926, -0.02069, 0.2063, 0.2064, -0.01486, 0.1293, -0.0948, 0.1167, 0.199, -0.0418, 0.378, 0.486, -0.4314, -0.1725, -0.07465, 0.0989, 0.5005, -0.3318, 0.2322, 0.0452, 0.1521, 0.3845, ...]"
32188,100,"A home movie of a fictional home life, an epic assembled from vignettes, Boyhood shimmers with unforced reality. It shows how an ordinary life can be reflected in an extraordinary movie.",0.981731,0.017478,0.52491,0.001098,0.203928,"[-0.06384, -0.2491, 0.0379, 0.00503, 0.1364, -0.2676, 0.1777, 0.31, 0.1231, -0.1985, 0.013756, -0.2097, -0.03574, 0.5957, 0.02629, 0.1633, 0.1693, 0.1333, 0.1862, -0.3638, -0.0784, -0.2998, 0.1718, 0.11646, -0.1742, 0.0954, 0.0893, 0.1619, 0.09937, 0.03323, -0.01413, -0.0881, -0.3945, -0.565, 0.2012, -0.1364, 0.05746, -0.195, -0.1737, -0.0999, 0.0754, 0.1705, 0.0766, -0.2974, 0.00676, -0.4104, -2.658, -0.04013, -0.278, -0.1927, 0.325, 0.12, 0.4165, 0.2502, 0.3003, 0.5884, -0.293, 0.2766, 0.07587, 0.1598, 0.2463, 0.01622, -0.1242, -0.1724, -0.2235, 0.06335, -0.0958, 0.2091, -0.05862, 0.3723, -0.33, -0.03217, 0.1127, -0.0592, -0.002213, -0.1616, -0.06805, 0.308, -0.3423, 0.02095, 0.11896, 0.3892, 0.4429, 0.2097, -0.1694, 0.307, -0.2013, -0.057, 0.1022, 0.2803, -0.6016, -0.153, -0.02965, 0.4111, 0.5044, -0.4998, -0.0962, -0.05612, 0.2451, 0.548, ...]"
32189,100,"I'm as reluctant to stop writing about this movie as I was to stop watching it: At 166 minutes, it flies by, and you don't want to leave that world. But one thing is certain: This isn't the last word. People will be writing about this film for years - and looking at it to discover the lost history of our time.",0.990971,0.141836,0.237057,0.000793,0.005324,"[0.1951, 0.01323, -0.091, -0.1984, 0.02406, -0.186, 0.1764, 0.4456, -0.11334, -0.3347, -0.04434, 0.036, 0.03204, 0.4536, 0.09717, 0.2493, -0.06445, 0.0701, 0.08936, 0.0611, -0.2355, 0.0632, 0.114, 0.1508, -0.1392, -0.1824, 0.0601, -0.075, 0.1198, -0.01642, 0.00616, 0.03488, -0.3623, -0.188, 0.1437, -0.2603, 0.03995, -0.2184, -0.04947, 0.0796, -0.0819, 0.3218, -0.1647, -0.1754, -0.05994, -0.04944, -2.754, -0.0207, -0.08624, -0.1648, 0.388, 0.08044, 0.2788, 0.3484, 0.3909, 0.3906, -0.3062, 0.3762, -0.09125, 0.144, 0.3325, 0.01389, -0.04196, -0.0521, 0.0523, 0.1469, -0.02681, 0.12317, -0.183, 0.323, -0.266, -0.2566, 0.1378, -0.2318, -0.007557, -0.1727, -0.04877, 0.2554, -0.122, 0.10126, 0.0836, 0.5117, 0.287, -0.0253, -0.02637, -0.04236, -0.2299, 0.05408, 0.3745, 0.2274, -0.4285, -0.1647, -0.1197, 0.3418, 0.58, -0.546, 0.1151, -0.0455, -0.0401, 0.4365, ...]"
32190,100,"Is it dumb to say, ""Wow?""...I don't care. Wow.",0.76716,0.714563,0.676503,0.029542,0.032034,"[0.02426, 0.002285, 0.04825, -0.2277, -0.06714, -0.2698, 0.1832, 0.4412, -0.0877, -0.1548, -0.05804, -0.18, -0.0923, 0.2335, 0.1942, 0.4253, 0.02072, 0.2417, -0.0957, -0.0903, 0.05652, -0.1602, 0.01883, -0.05966, -0.02948, -0.0715, 0.004288, -0.1489, 0.06003, -0.0576, 0.1387, 0.1244, -0.1746, -0.1282, -0.1016, -0.05225, -0.1442, -0.0876, 0.04047, -0.004368, -0.1112, 0.00528, -0.001403, -0.1742, -0.2605, -0.2837, -2.545, -0.3184, -0.2361, -0.1428, 0.298, 0.039, 0.03041, 0.3457, 0.1901, 0.2294, -0.156, 0.2205, 0.09644, -0.02122, 0.4358, 0.04675, -0.1808, -0.1294, -0.0575, 0.0919, 0.01318, 0.1451, -0.2184, 0.4285, -0.2067, -0.1914, 0.2905, -0.1472, -0.0111, 0.01576, 0.0774, 0.2357, -0.05133, 0.2455, -0.1396, 0.3865, 0.383, 0.05682, 0.001579, 0.1157, -0.2612, 0.0994, 0.1644, 0.3586, -0.2273, -0.11084, -0.01153, 0.1218, 0.4893, -0.4475, -0.01176, -0.1055, 0.07886, 0.2311, ...]"
32191,100,"This bold movie may sound like a stunt, but it’s so much more than that. Linklater is an effortless, genial auteur, and his passions are woven through “Dazed and Confused,” “School of Rock” and the “Before Sunrise” trilogy. Here, his mellow groove becomes an everyday rhythm.",0.917725,0.037076,0.712483,0.000579,0.022742,"[0.0551, -0.0609, -0.0593, -0.1131, 0.2307, -0.2783, 0.1257, 0.0844, 0.05927, -0.3257, -0.009544, 0.02075, 0.0535, 0.3762, 0.1963, 0.3252, 0.02556, 0.0715, 0.1515, -0.2281, -0.06934, -0.0793, 0.2944, 0.08405, -0.128, 0.01516, -0.1902, -0.12445, -0.0796, 0.07605, 0.1196, 0.1265, -0.1735, -0.355, 0.1953, -0.1719, 0.02292, -0.06866, 0.1626, -0.078, 0.0591, 0.05408, 0.0071, -0.1846, -0.0266, -0.2424, -2.432, -0.1447, -0.2905, -0.1886, 0.326, 0.0491, 0.1263, 0.1077, 0.2399, 0.1329, -0.3765, 0.397, 0.08606, 0.1713, 0.1661, 0.0402, -0.1825, 0.00422, -0.1536, 0.03998, -0.1124, 0.0485, -0.1965, 0.4385, -0.2032, -0.06122, 0.3123, -0.0991, 0.0849, -0.003614, 0.0337, 0.1979, -0.2815, 0.2189, 0.138, 0.3027, 0.31, 0.1581, -0.0725, 0.1643, 0.1815, 0.1273, 0.2744, 0.4211, -0.3694, -0.3008, 0.06076, 0.3381, 0.4585, -0.3887, 0.1469, -0.0987, 0.2162, 0.3691, ...]"
32192,100,"In its own quiet way, it’s a world of marvels.",0.870321,0.055327,0.992889,0.00028,0.002026,"[-0.05145, -0.03995, -0.0409, -0.05997, -0.071, -0.2296, 0.1211, 0.551, -0.2208, -0.2983, -0.06122, -0.2302, -0.02145, 0.4546, 0.0986, 0.1488, -0.085, 0.236, 0.2937, -0.092, -0.1021, -0.1355, 0.0338, 0.0762, -0.0345, -0.1038, -0.1383, -0.0439, 0.1246, 0.2458, 0.02779, 0.05014, -0.1918, -0.2522, 0.05206, -0.1155, -0.04007, -0.1387, -0.04236, 0.1063, -0.06064, 0.04608, 0.1353, 0.05185, -0.1335, -0.0757, -2.4, 0.096, -0.1619, -0.007072, 0.1425, -0.05002, 0.2025, 0.2269, 0.3076, 0.3013, -0.3206, 0.6016, 0.1247, 0.0894, 0.1704, 0.05707, -0.07043, 0.005245, -0.1204, 0.2137, -0.003967, 0.005302, -0.06085, 0.2766, -0.1971, -0.09125, 0.1625, -0.1442, -0.0697, -0.0839, -0.1709, 0.1952, -0.00704, 0.213, 0.0901, 0.3804, 0.3228, 0.0721, -0.0323, 0.099, -0.342, -0.1317, 0.2146, 0.275, -0.3367, -0.1572, 0.2216, 0.2961, 0.321, -0.4697, 0.03336, -0.1545, 0.1008, 0.2744, ...]"
32193,100,"The revelation is Arquette. While the focus is on Coltrane and how he grew up onscreen, it's Arquette that's at the center of this incredible journey. She puts herself out there year after year, getting knocked down and getting up stronger. Her final scenes have the power and heartbreak every parent knows -- it's all about holding a child's hand, then letting it go.",0.996242,0.45427,0.007661,0.001955,0.961254,"[-0.12067, -0.3508, -0.2184, -0.1716, 0.2046, 0.00486, 0.3071, 0.1655, -0.03165, -0.2974, 0.185, -0.1032, 0.2211, 0.3113, -0.001915, 0.2732, 0.2449, 0.0755, 0.0826, -0.4253, -0.09973, -0.0936, 0.2411, 0.248, -0.1164, 0.10394, 0.01061, -0.09375, 0.2869, 0.0443, -0.0953, -0.1085, -0.4097, -0.4272, 0.2146, -0.3906, -0.01205, 0.04092, -0.0006742, -0.0769, -0.1307, -0.04398, -0.0465, -0.3555, 0.006557, -0.1956, -2.963, 0.0673, 0.01643, -0.2153, 0.1238, 0.0751, 0.498, 0.2335, 0.3467, 0.1338, -0.3772, 0.47, -0.00908, 0.0838, 0.137, 0.06055, -0.1857, -0.08484, 0.0914, -0.04462, -0.0728, 0.1609, -0.1414, 0.1096, -0.309, -0.3313, 0.1362, -0.3398, -0.093, -0.0222, -0.01828, 0.09357, -0.1571, 0.2225, 0.2668, 0.3083, 0.2179, 0.1375, 0.0641, 0.1005, 0.0737, 0.05728, 0.1643, 0.497, -0.572, -0.1753, -0.06555, 0.278, 0.6, -0.3232, 0.355, -0.1024, 0.2108, 0.4397, ...]"
32194,100,"The good news is you’re feeling stuff, you know? And you’ve got to hold on to that. You get older, and you don’t feel as much, your skin gets tough.” This remarkable, wonderful movie helps you remember.",0.929835,0.130413,0.991741,0.00106,0.040104,"[0.1372, -0.0977, 0.03427, -0.0658, 0.1984, -0.238, 0.1406, 0.4304, -0.2317, -0.2642, 0.11914, -0.04218, 0.01125, 0.4321, -0.04483, 0.2229, 0.02017, 0.1284, 0.03342, -0.11346, -0.2627, -0.1042, 0.1094, 0.2317, -0.1682, -0.09076, -0.1017, -0.0593, 0.11505, -0.1001, -0.0752, -0.04782, -0.3582, -0.2476, 0.10254, 0.019, -0.04886, -0.1338, 0.079, -0.09106, -0.08234, 0.1787, -0.11053, -0.1411, -0.06094, -0.0707, -2.3, -0.12006, -0.01489, -0.1583, 0.4387, 0.05984, 0.1554, 0.0815, 0.1597, 0.2145, -0.3018, 0.4888, -0.07587, 0.1517, 0.1278, -0.0702, -0.244, -0.11664, -0.11206, -0.01481, -0.013794, 0.1166, -0.11487, 0.2278, -0.1466, -0.2644, 0.2128, -0.2478, 0.084, 0.02979, -0.04617, 0.1344, -0.06274, 0.1344, 0.0364, 0.318, 0.1622, 0.04962, 0.0488, 0.02292, -0.186, -0.1526, 0.2174, 0.376, -0.3289, -0.0774, 0.00742, 0.331, 0.48, -0.2646, 0.12366, -0.03665, 0.3096, 0.285, ...]"


In [22]:
# Map each script line to its per-review projection magnitudes. If a line
# occurs more than once, the last occurrence wins.
line_review_magnitudes_dict = dict(zip(row['split_script'], row['projected_magnitudes']))

In [24]:
# For each review, print its score and text followed by the three script lines
# with the largest projection magnitude for that review.
for review_position, (_, review_row) in enumerate(row['reviews'].iterrows()):
    print(review_row['individual_meta_score'], review_row['text'])
    line_magnitude_pairs = [
        (script_line, line_magnitudes[review_position])
        for script_line, line_magnitudes in line_review_magnitudes_dict.items()
    ]
    # Stable descending sort on the magnitude component.
    ranked_pairs = sorted(line_magnitude_pairs, key=lambda pair: pair[1], reverse=True)
    print(ranked_pairs[:3])

100 There’s not a great theme, a great performance or even a great scene in Boyhood. But I think it might be a great picture.
[("\n                      DAD\r\n          I know, I know, right? It's gonna be\r\n          uh, fifteen years till I have an\r\n          empty nest. But hey, I'd love to\r\n          pitch in, help with this, if I could.\r", 0.955), ("\n            MASON\r\nShe wasn't a silly girl, though.   I\r\nmean, she's a serious person. I\r\nreally thought we were --\r", 0.954), ("\n                       TEACHER\r\n           Well, it's time to finish it.\r", 0.9536)]
100 The greatest movies, the ones that stick with us, are those that hold up a mirror to the human condition and reflect something back at us that we too often manage to overlook. Boyhood is one of those movies, and with it Linklater proves he is among the best practitioners of that art.
[('\n                      MOM (O.S.)\r\n          His cognitive series, his interviews\r\n          with orphans, Etho

In [65]:
from sklearn.metrics.pairwise import cosine_similarity

In [210]:
# Reference embeddings for the two concepts every script is compared against.
art_vector = encode('art', model, tokenizer, device).cpu()
entertainment_vector = encode('entertainment', model, tokenizer, device).cpu()
# The (1, dim) query rows are loop-invariant, so build them once.
art_query = art_vector.reshape(1, -1)
entertainment_query = entertainment_vector.reshape(1, -1)

# One record per (title, review): the review's own columns plus the cosine
# similarity of each reference embedding to a review-weighted sum of the
# script's line vectors.
result_list = []
for _, data in tqdm(vectorised_script_review_df.iterrows(), total=len(vectorised_script_review_df)):
    # 'projected_magnitudes' is indexed [line][review]; after the transpose each
    # row holds one review's weights over all script lines.
    projected_magnitudes_scripts = np.array(data['projected_magnitudes']).T
    for projection_magnitudes, (_, review) in zip(projected_magnitudes_scripts, data['reviews'].iterrows()):
        script_weighted_sum = np.dot(projection_magnitudes, data['script_vectors']).reshape(1, -1)
        first_comparison = cosine_similarity(art_query, script_weighted_sum).item()
        second_comparison = cosine_similarity(entertainment_query, script_weighted_sum).item()
        result_list.append({
            **review.to_dict(),
            'title': data['title'],
            'first_comparison': first_comparison,
            'second_comparison': second_comparison,
        })
    

0it [00:00, ?it/s]

In [211]:
# One row per (title, review) record accumulated in result_list.
result_df = pd.DataFrame(result_list)

In [213]:
# Per-title describe() statistics for the communicability score and both cosine comparisons.
grouped_result_df = result_df.groupby('title')[['universal communicability', 'first_comparison', 'second_comparison']].describe()

In [220]:
# Per-title means, laid out as positional columns: 0 = title (used as index),
# 1 = mean 'universal communicability', 2 = mean 'first_comparison'.
mean_communicability = grouped_result_df['universal communicability']['mean']
mean_first_comparison = grouped_result_df['first_comparison']['mean']
corr_records = list(zip(grouped_result_df.index, mean_communicability, mean_first_comparison))
corr_df = pd.DataFrame(corr_records).set_index(0)

In [223]:
import pingouin as pg

  return warn(


In [224]:
# Correlate per-title mean 'universal communicability' (column 1) with mean
# 'first_comparison' (column 2); the output below reports a Pearson test.
pg.corr(corr_df[1], corr_df[2])

Unnamed: 0,n,r,CI95%,p-val,BF10,power
pearson,776,0.013844,"[-0.06, 0.08]",0.700208,0.048,0.067137
