## Separating Transcription into Background Context and Actual Review

In [1]:
import re
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ayannair/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the transcript from a file
file_name = '/Users/ayannair/Documents/projects/fantanosize/backend/transcript.txt'
with open(file_name, 'r') as file:
    text = file.read()

# Split the text into sentences
sentences = nltk.sent_tokenize(text)

# Define keywords for each topic
keywords = {
    'lyrics': ['lyrics', 'words', 'writing', 'verses', 'chorus', 'hook', 'poetry', 'lines', 'storytelling', 'themes', 'message', 'narrative', 'bars', 'line'],
    'production': ['beat', 'melody', 'harmony', 'rhythm', 'production', 'sound', 'instrumentation', 'arrangement', 'synths', 'bass', 'drums', 'guitar', 'keys', 'mix', 'mastering', 'sonically'],
    'features': ['feature', 'collaboration', 'guest', 'featuring', 'appearance', 'cameo', 'contribution'],
    'vocals': ['vocals', 'singing', 'rap', 'voice', 'delivery', 'performance', 'flow'],
    'concept': ['concept', 'theme', 'cohesion', 'consistency', 'flow', 'structure', 'production quality', 'about'],
}

topic_sentences = {topic: '' for topic in keywords}

for sentence in sentences:
    for topic, words in keywords.items():
        if any(word in sentence.lower() for word in words):
            topic_sentences[topic] += sentence + ' '


# Print sentences about each topic
for topic, sent in topic_sentences.items():
    print(f"\n{topic.capitalize()} Sentences:")
    print(sent)

# Find the last sentence with the word "feeling"
target_index = None
for i, sentence in enumerate(sentences):
    if 'feeling a' in sentence.lower() or 'feeling' in sentence.lower() or 'strong' in sentence.lower() or 'light' in sentence.lower() or 'decent' in sentence.lower() or 'not good' in sentence.lower():
        target_index = i

# Check if we found a sentence with "feeling"
if target_index is not None:
    # Ensure we have enough sentences before
    start_index = max(target_index - 5, 0)
    end_index = target_index+1

    # Extract the segment
    review_seg = ' '.join(sentences[start_index:end_index])
    print("Review Segment:")
    print(review_seg)
else:
    print("No sentence containing 'feeling a' was found.")


Lyrics Sentences:
There are some bops in the mix, but some of the themes addressed on this record are heavy. As she avoids agreements, business dealings, and traditional rules, looking to box her in, limit her capacity, the track has a chill vibe, thoughtful bars, and beautiful vocal traps. It's also pretty interesting that a lot of the group and choral vocals on this LP, as they are singing, the lyrics they deliver have often a religious or spiritual angle to them. But there are similar themes of control and autonomy on these songs Silhouette, where Sims voices once again distaste and unhappiness with being minimized in her own music, her creative process, her output, made to feel not human, but like a representation of a human or a human shape, a Silhouette. I feel much the same way about the instrumental arrangements and messaging on the song X, the themes and low-key melancholy vocal chops on the song Hard on Fire 2. Because Sims did spend a fair amount of time on that album, defi

## BERT Model Sentiment Analysis

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import json
import torch

In [4]:
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [5]:
# Define weights for each sentiment type
weights = {
    'neg': 0.1,
    'neu': 0.2,
    'pos': 0.7
}

## Lyrics Analysis

In [6]:
# Tokenize the input text
encoded_text = tokenizer(topic_sentences["lyrics"], return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
lyrics_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(lyrics_scores_dict)

# Compute the combined sentiment score
combined_score = (lyrics_scores_dict['roberta_neg']*weights['neg'] + lyrics_scores_dict['roberta_neu']*weights['neu'] + lyrics_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
lyrics_normalized_score = combined_score/0.7*100

print(lyrics_normalized_score)

{'roberta_neg': 0.035030607, 'roberta_neu': 0.3502545, 'roberta_pos': 0.61471486}
71.97919493275029


## Production Analysis

In [7]:
# Tokenize the input text
encoded_text = tokenizer(topic_sentences["production"], return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
prod_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(prod_scores_dict)

# Compute the combined sentiment score
combined_score = (prod_scores_dict['roberta_neg']*weights['neg'] + prod_scores_dict['roberta_neu']*weights['neu'] + prod_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
prod_normalized_score = combined_score/0.7*100

print(prod_normalized_score)

{'roberta_neg': 0.039478056, 'roberta_neu': 0.32296997, 'roberta_pos': 0.63755196}
73.5468820801803


## Features Analysis

In [8]:
# Tokenize the input text
encoded_text = tokenizer(topic_sentences["features"], return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
feat_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(feat_scores_dict)

# Compute the combined sentiment score
combined_score = (feat_scores_dict['roberta_neg']*weights['neg'] + feat_scores_dict['roberta_neu']*weights['neu'] + feat_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
feat_normalized_score = combined_score/0.7*100

print(feat_normalized_score)

{'roberta_neg': 0.22720535, 'roberta_neu': 0.5859094, 'roberta_pos': 0.18688528}
38.67458828857966


## Vocals Analysis

In [9]:
# Tokenize the input text
encoded_text = tokenizer(topic_sentences["vocals"], return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
vocals_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(vocals_scores_dict)

# Compute the combined sentiment score
combined_score = (vocals_scores_dict['roberta_neg']*weights['neg'] + vocals_scores_dict['roberta_neu']*weights['neu'] + vocals_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
vocals_normalized_score = combined_score/0.7*100

print(vocals_normalized_score)

{'roberta_neg': 0.016838856, 'roberta_neu': 0.19196156, 'roberta_pos': 0.79119956}
84.84512748462812


## Concept Analysis

In [10]:
# Tokenize the input text
encoded_text = tokenizer(topic_sentences["concept"], return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
concept_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(concept_scores_dict)

# Compute the combined sentiment score
combined_score = (concept_scores_dict['roberta_neg']*weights['neg'] + concept_scores_dict['roberta_neu']*weights['neu'] + concept_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
concept_normalized_score = combined_score/0.7*100

print(concept_normalized_score)

{'roberta_neg': 0.06255698, 'roberta_neu': 0.49183345, 'roberta_pos': 0.44560954}
59.50700938701631


## Overall Analysis

In [11]:
# Tokenize the input text
encoded_text = tokenizer(review_seg, return_tensors='pt', truncation=True, padding=True, max_length=512)

# Ensure no issues with input dimensions
input_ids = encoded_text['input_ids']
attention_mask = encoded_text['attention_mask']

# Perform sentiment analysis
with torch.no_grad():
    output = model(input_ids=input_ids, attention_mask=attention_mask)
    scores = output.logits[0].detach().numpy()
    scores = softmax(scores)
    
# Extract numerical values
review_scores_dict = {
    'roberta_neg': scores[0],
    'roberta_neu': scores[1],
    'roberta_pos': scores[2]
}

print(review_scores_dict)

# Compute the combined sentiment score
combined_score = (review_scores_dict['roberta_neg']*weights['neg'] + review_scores_dict['roberta_neu']*weights['neu'] + review_scores_dict['roberta_pos']*weights['pos'])

# Normalize the score
review_normalized_score = combined_score/0.7*100

print(review_normalized_score)

{'roberta_neg': 0.042553872, 'roberta_neu': 0.14424068, 'roberta_pos': 0.8132055}
86.04962272303443


In [12]:
scores_dict = {
    'lyrics_score': lyrics_normalized_score,
    'production_score': prod_normalized_score,
    'features_score': feat_normalized_score,
    'vocals_score': vocals_normalized_score,
    'concept_score': concept_normalized_score,
    'overall_score' : review_normalized_score
}

json_output = json.dumps(scores_dict, indent=4)

print(json_output)

results_fp = '/Users/ayannair/Documents/projects/fantanosize/backend/results.json'
with open(results_fp, 'w') as json_file:
    json.dump(scores_dict, json_file, indent=4)

{
    "lyrics_score": 71.97919493275029,
    "production_score": 73.5468820801803,
    "features_score": 38.67458828857966,
    "vocals_score": 84.84512748462812,
    "concept_score": 59.50700938701631,
    "overall_score": 86.04962272303443
}
