In [1]:
# --- Standard library ---
import re
from time import time

# --- Third-party ---
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.language import Language
from tqdm.notebook import tqdm

# Enables `progress_apply` on pandas objects (used when scoring sentiment).
tqdm.pandas()

# Shared NLP resources, created once for the whole notebook.
pipeline = spacy.load('en_core_web_sm')
sid = SentimentIntensityAnalyzer()

# Number of topics
k = 10

KeyboardInterrupt: 

# Scraped Data

In [None]:
# Load the five scraped datasets from CSV files in the current working directory.
# NOTE(review): each frame is expected to carry a 'textDisplay' column, which
# get_cleaned_data reads — confirm against the scraper's output.
df_Netflix_trailers = pd.read_csv('df_Netflix_trailers.csv')
df_Netflix_highlights = pd.read_csv('df_Netflix_highlights.csv')
df_Formula1_trailers = pd.read_csv('df_Formula1_trailers.csv')
df_Formula1_highlights = pd.read_csv('df_Formula1_highlights.csv')
df_Miscellaneous = pd.read_csv('df_Miscellaneous.csv')

In [None]:
# Bare expression: renders df_Netflix_trailers as this cell's output for a visual check.
df_Netflix_trailers

In [None]:
# Bare expression: renders df_Netflix_highlights as this cell's output for a visual check.
df_Netflix_highlights

In [None]:
# Bare expression: renders df_Formula1_trailers as this cell's output for a visual check.
df_Formula1_trailers

In [None]:
# Bare expression: renders df_Formula1_highlights as this cell's output for a visual check.
df_Formula1_highlights

In [None]:
# Bare expression: renders df_Miscellaneous as this cell's output for a visual check.
df_Miscellaneous

# Cleaning

In [None]:
# Raw regex that appears to target e-mail addresses (local part, '@', then a
# domain name or a bracketed IP literal). The character classes are lower-case
# only, so it would need re.IGNORECASE to match upper-case input.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm
# whether it is still needed.
email_re = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""

# Ordered (pattern, replacement) pairs, applied one after another with re.sub.
# Order matters: markup is handled before the punctuation pass (which would
# otherwise break tags apart by replacing '/'), and whitespace is collapsed last
# so that no earlier substitution can leave double spaces behind.
replace = [
    (r"<a[^>]*>(.*?)</a>", r"\1"),    # Unwrap HTML anchors, keeping only the link text
    # HTML line breaks -> space. This used to be the character class "[<br>]+",
    # which deleted every 'b' and 'r' in the text, not just <br> tags.
    (r"<br\s*/?>", " "),
    (r"(?<=\d),(?=\d)", ""),          # Remove commas between digits (1,234 -> 1234)
    (r"\d+", "number"),               # Map each run of digits to the token "number"
    (r"[\t\n\r\*\.\@\,\-\/]", " "),   # Punctuation and other junk -> space
    (r"[<>]+", " "),                  # Stray angle brackets (the old class removed these too)
    (r"[#&]+", " "),                  # Runs of '#' and '&' -> space
    (r"\s+", " "),                    # Collapse runs of whitespace; keep this entry last
]
    
@Language.component("preprocesser")
def ng20_preprocess(doc):
    """Reduce a parsed document to a single string of lower-cased lemmas.

    Tokens flagged as stop words or punctuation are dropped. Every remaining
    token is replaced by its lemma, lower-cased and stripped; lemmas that end
    up empty are discarded. The survivors are joined with single spaces.

    NOTE(review): this returns a ``str`` rather than the ``Doc`` it received,
    and ``get_cleaned_data`` relies on that (it joins the pipeline results as
    strings). spaCy documents custom components as returning a Doc — confirm
    that the installed spaCy version tolerates a string here.
    """
    lemmas = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        lemma = token.lemma_.lower().strip()
        if lemma:
            lemmas.append(lemma)
    return " ".join(lemmas)

# add_pipe raises if a component with this name is already in the pipeline,
# so guard the call to keep the cell safe to re-run on a live kernel.
if "preprocesser" not in pipeline.pipe_names:
    pipeline.add_pipe("preprocesser")

In [None]:
def get_cleaned_data(data):
    """Clean the 'textDisplay' entries of ``data`` and run them through the spaCy pipeline.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide an iterable of raw texts under the key ``'textDisplay'``.

    Returns
    -------
    docs : list
        One pipeline result per input entry, in input order. The code below
        joins them as strings, so the pipeline is expected to yield ``str``.
    vocab_size : int
        Number of distinct whitespace-separated tokens across all ``docs``.
    """
    train_sentences = []
    for raw in data['textDisplay']:
        if isinstance(raw, str):
            text = raw
        else:
            # Missing values come back from read_csv as NaN (a float) and would
            # make re.sub raise TypeError. Map them to "" instead of dropping
            # them so the result stays aligned with the input rows.
            text = "" if pd.isna(raw) else str(raw)
        for pattern, replacement in replace:
            text = re.sub(pattern, replacement, text)
        train_sentences.append(text)

    docs = [pipeline(sent) for sent in tqdm(train_sentences)]

    # split() with no argument ignores empty strings; split(" ") counted ""
    # as a vocabulary item whenever a document was empty.
    vocab_size = len(set(" ".join(docs).split()))

    return docs, vocab_size

# Topic Modeling

In [None]:
def Analysis(docs,vocab_size):
    """Vectorise ``docs`` with TF-IDF and return the dense matrix plus its terms.

    The vectoriser keeps at most ``vocab_size`` features, ignores terms that
    occur in more than 95% of the documents, and removes scikit-learn's
    built-in English stop words.

    Returns
    -------
    X : dense array produced by ``toarray()`` on the fitted TF-IDF matrix.
    feature_names : the vectoriser's ``get_feature_names_out()`` result,
        aligned with the columns of ``X``.
    """
    vectorizer = TfidfVectorizer(
        max_features=vocab_size,
        max_df=0.95,
        stop_words='english',
    )
    tfidf_matrix = vectorizer.fit_transform(docs)
    terms = vectorizer.get_feature_names_out()
    return tfidf_matrix.toarray(), terms

In [None]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Draw one horizontal bar chart per topic showing its highest-weighted terms.

    Parameters
    ----------
    model : fitted topic model
        Must expose ``components_``, iterated as one row of term weights per topic.
    feature_names : sequence
        Maps a column index of ``components_`` to the term it stands for.
    n_top_words : int
        Number of terms shown in each panel.
    title : str
        Super-title placed above the whole figure.
    """
    n_cols = 5
    # Size the grid from the model itself instead of the notebook-level ``k``,
    # so there is always a panel for every fitted topic.
    n_topics = len(model.components_)
    # Ceiling division. The previous ``k // 5 + k % 5`` over-allocated rows
    # whenever k % 5 > 1 (e.g. k = 12 gave 4 rows instead of 3).
    n_rows = max(1, (n_topics + n_cols - 1) // n_cols)

    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(5 * n_cols, 4 * n_rows),
        sharex=True,
        squeeze=False,  # always a 2-D array, whatever the grid shape
    )
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        # Indices of the n_top_words largest weights, largest first.
        top_features_ind = topic.argsort()[::-1][:n_top_words]
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f'Topic {topic_idx + 1}',
                     fontdict={'fontsize': 30})
        ax.invert_yaxis()  # heaviest term at the top of the panel
        ax.tick_params(axis='both', which='major', labelsize=20)
        for side in 'top right left'.split():
            ax.spines[side].set_visible(False)

    # Hide the panels left over when n_topics is not a multiple of n_cols.
    for unused_ax in axes[n_topics:]:
        unused_ax.set_visible(False)

    fig.suptitle(title, fontsize=40)
    fig.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

# Sentiment Analysis

In [None]:
def get_sentiment(sent):
    """Label ``sent`` as ``'pos'``, ``'neg'`` or ``'neu'`` using VADER.

    The label is derived from the ``'compound'`` entry of
    ``sid.polarity_scores(sent)`` with the conventional VADER cut-offs:
    ``>= 0.05`` is positive, ``<= -0.05`` is negative, anything in between is
    neutral.

    The previous implementation returned the key with the largest value among
    all four scores. That let the aggregate ``'compound'`` score compete with
    the three class proportions, so ``'compound'`` itself could be returned as
    a label (including for empty text, where all scores tie at 0).
    """
    compound = sid.polarity_scores(sent)['compound']
    if compound >= 0.05:
        return 'pos'
    if compound <= -0.05:
        return 'neg'
    return 'neu'

# Analysis for df_Netflix_trailers

In [None]:
def get_topic_plot(data, n_top_words=10, title='LDA'):
    """Clean ``data``, label sentiment, fit an LDA topic model and plot its topics.

    Side effects: adds (or overwrites) the columns ``'Clean_text'`` and
    ``'Sentiment'`` on ``data`` in place, and shows a figure.

    Parameters
    ----------
    data : DataFrame
        Must contain a ``'textDisplay'`` column (read by ``get_cleaned_data``).
    n_top_words : int, default 10
        Number of terms shown per topic.
    title : str, default 'LDA'
        Figure title passed through to ``plot_top_words``.
    """
    docs, vocab_size = get_cleaned_data(data)
    data['Clean_text'] = docs

    # NOTE(review): sentiment is scored on the cleaned text, from which stop
    # words and punctuation have been removed. spaCy's English stop-word list
    # includes negations such as "not", which VADER uses — consider scoring the
    # raw 'textDisplay' text instead.
    data['Sentiment'] = data['Clean_text'].progress_apply(get_sentiment)

    X, feature_names = Analysis(docs, vocab_size)

    # Use the notebook-level topic count ``k`` rather than a second hard-coded
    # 10, so the model and the plot layout share one setting.
    lda = LatentDirichletAllocation(n_components=k, random_state=42)
    lda.fit(X)
    plot_top_words(lda, feature_names, n_top_words, title)

In [None]:
# Adds 'Clean_text' and 'Sentiment' columns to the frame in place, fits LDA and plots the topics.
get_topic_plot(df_Netflix_trailers)
# Persist the augmented frame under a separate file name, leaving the raw CSV untouched.
df_Netflix_trailers.to_csv('df_Netflix_trailers_cleaned.csv',index=False)

# df_Netflix_highlights

In [None]:
# Adds 'Clean_text' and 'Sentiment' columns to the frame in place, fits LDA and plots the topics.
get_topic_plot(df_Netflix_highlights)
# Persist the augmented frame under a separate file name, leaving the raw CSV untouched.
df_Netflix_highlights.to_csv('df_Netflix_highlights_cleaned.csv',index=False)

# df_Formula1_trailers

In [None]:
# Adds 'Clean_text' and 'Sentiment' columns to the frame in place, fits LDA and plots the topics.
get_topic_plot(df_Formula1_trailers)
# Persist the augmented frame under a separate file name, leaving the raw CSV untouched.
df_Formula1_trailers.to_csv('df_Formula1_trailers_cleaned.csv',index=False)

# df_Formula1_highlights

In [None]:
# Adds 'Clean_text' and 'Sentiment' columns to the frame in place, fits LDA and plots the topics.
get_topic_plot(df_Formula1_highlights)
# Write to the "_cleaned" file like the other datasets. The previous target,
# 'df_Formula1_highlights.csv', is the raw input file and was being overwritten.
df_Formula1_highlights.to_csv('df_Formula1_highlights_cleaned.csv',index=False)

# df_Miscellaneous

In [None]:
# Adds 'Clean_text' and 'Sentiment' columns to the frame in place, fits LDA and plots the topics.
get_topic_plot(df_Miscellaneous)
# Persist the augmented frame under a separate file name, leaving the raw CSV untouched.
df_Miscellaneous.to_csv('df_Miscellaneous_cleaned.csv',index=False)