# Preliminaries

In [None]:
import pandas as pd

In [None]:
# For regular expressions:
import re

In [None]:
# For web scraping:
import requests
from bs4 import BeautifulSoup

In [None]:
# Scikit-learn package for unsupervised learning:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [None]:
# For further cleaning
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer 

## `nltk` package for textual analysis

In [None]:
# Import and download popular text corpora
import nltk
nltk.download('popular')

In [None]:
# For sentiment analysis:
import nltk.sentiment
nltk.download('vader_lexicon')

In [None]:
# For text cleaning:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer 
import string

---

# Textual analysis with some Friends

The file [Friends_Transcript.txt](https://raw.githubusercontent.com/ahmedgc/2024_insead_python_workshop/main/Friends_Transcript.txt) contains text data: the script for the pilot episode of the TV series "Friends".

<img src="https://m.media-amazon.com/images/M/MV5BNDVkYjU0MzctMWRmZi00NTkxLTgwZWEtOWVhYjZlYjllYmU4XkEyXkFqcGdeQXVyNTA4NzY1MzY@._V1_FMjpg_UX1000_.jpg" 
alt="Friends" width="50%"/>

## Reading data from the text file

We begin by inspecting the first 10 lines of this text file:

In [None]:
line_number = 1
# Explicit encoding so the transcript is decoded the same way on every platform.
with open("Friends_Transcript.txt", "r", encoding="utf-8") as f:
    for line in f:
        if line_number > 10:
            break  # the preview is complete; no need to read the rest of the file
        print(line)
        line_number = line_number + 1

Let's read in all the lines in the file, and gather them in a list of strings:

In [None]:
# Read every line in a single pass. Growing the list with `all_lines + [line]`
# inside a loop copies the whole list on each iteration (quadratic time).
with open("Friends_Transcript.txt", "r", encoding="utf-8") as f:
    all_lines = f.readlines()

In [None]:
len(all_lines)

In [None]:
all_lines[0:20]

In the show, there are 6 recurring characters (Chandler, Joey, Monica, Phoebe, Rachel, Ross). Let's extract only the lines that pertain to dialog spoken by one of these main characters:

In [None]:
# Keep only lines spoken by one of the six main characters.
# str.startswith accepts a tuple and matches if any prefix matches.
main_dialog_lines = [
    line for line in all_lines
    if line.startswith(('Chandler:', 'Joey:', 'Monica:', 'Phoebe:', 'Rachel:', 'Ross:'))
]

In [None]:
len(main_dialog_lines)

In [None]:
main_dialog_lines[0:20]

But we can see that there is still text that is situational, and not part of a dialog: it's the text in between parentheses () and square brackets []. So we need to eliminate it. 

This uses a **regular expression** to detect any text that is in between parentheses () and replace it with an empty string.

In [None]:
# Raw string: in a normal string literal "\(" is an invalid escape sequence,
# which recent Python versions warn about.
main_dialog_lines = [re.sub(r"\(.*?\)", "", line) for line in main_dialog_lines]
main_dialog_lines[0:20]

This uses another regular expression to detect any text that is in between square brackets [] and replace it with an empty string.

In [None]:
# Raw string: in a normal string literal "\[" is an invalid escape sequence,
# which recent Python versions warn about.
main_dialog_lines = [re.sub(r"\[.*?\]", "", line) for line in main_dialog_lines]
main_dialog_lines[0:20]

Let's separate out the subject doing the talking from what they say. This will help us identify speech later.

So instead of a list of strings, where each string is a line, we want a list of *tuples*, split into the person talking and the words being said.

Here are some examples of what we intend:

In [None]:
'Chandler: Happy birthday, pal! \n'

In [None]:
'Chandler: Happy birthday, pal! \n'.split(':', maxsplit=1)

In [None]:
tuple('Chandler: Happy birthday, pal! \n'.split(':', maxsplit=1))

In [None]:
main_dialog_tupes = [tuple(line.split(':', maxsplit=1)) for line in main_dialog_lines]
main_dialog_tupes[0:20]

In [None]:
# Eliminate leading and trailing whitespace (including the newline) in the second element of each tuple
main_dialog_tupes = [(subject, speech.strip()) for (subject, speech) in main_dialog_tupes]
main_dialog_tupes

## Quiz: "crazy"

Using the data above (`main_dialog_lines` or `main_dialog_tupes`), count how many times each of the main characters says the word "crazy" (or "Crazy").

*Hint*: you may want to use the function `.count()`


In [None]:
main_dialog_tupes[:10]

In [None]:
# TODO: solution using main_dialog_tupes
# One (speaker, number of "crazy" occurrences) pair per line of dialog, case-insensitive
all_counts = [(who, said.lower().count('crazy')) for (who, said) in main_dialog_tupes]

def extract_count(subject_name):
    """Return the total number of "crazy" occurrences over all lines spoken by subject_name."""
    return sum(n for (speaker, n) in all_counts if speaker == subject_name)

for name in ['Chandler', 'Joey', 'Monica', 'Phoebe', 'Rachel', 'Ross']:
    print(name + ' says it ' + str(extract_count(name)) + ' times')


## Using Pandas Data Frames with text

Pandas dataframes are always useful, so let's convert from a list of tuples to a dataframe of columns:

In [None]:
main_dialog_df = pd.DataFrame(main_dialog_tupes, columns =['Subject', 'Speech'])
main_dialog_df

In [None]:
main_dialog_df['Speech']

In [None]:
main_dialog_df['Speech'].str.lower()

In [None]:
main_dialog_df['Speech'].str.lower().str.count('crazy')

In [None]:
main_dialog_df['crazy_counts'] = main_dialog_df['Speech'].str.lower().str.count('crazy')

In [None]:
main_dialog_df.groupby('Subject')['crazy_counts'].sum()

**Quiz** Why does the next cell give a different answer?

In [None]:
main_dialog_df['crazy_mentioned'] = main_dialog_df['Speech'].str.lower().str.contains('crazy')
main_dialog_df.groupby('Subject')['crazy_mentioned'].sum()

## Sentiment analysis using VADER

VADER is one methodology developed in an [academic paper](https://ojs.aaai.org/index.php/ICWSM/article/view/14550) to measure the sentiment of text. There's an implementation of this tool in the `nltk` Python package, which we will use now.

In [None]:
# Initialize an object that will perform sentiment analysis for us
analyzer = nltk.sentiment.vader.SentimentIntensityAnalyzer()

In [None]:
analyzer.polarity_scores('I hate Mondays')

In [None]:
analyzer.polarity_scores('I love Fridays')

In [None]:
analyzer.polarity_scores('Learning Python is interesting')

In [None]:
analyzer.polarity_scores('Learning Python is interesting')['compound']

In [None]:
main_dialog_df['Speech'].values

In [None]:
main_dialog_df['vader_sentiment'] = [analyzer.polarity_scores(speech)['compound'] for speech in main_dialog_df['Speech'].values] 
main_dialog_df

In [None]:
main_dialog_df.groupby('Subject')['vader_sentiment'].mean().sort_values(ascending=False)

In [None]:
main_dialog_df.hist(column='vader_sentiment', by='Subject', sharex=True, sharey=True)

## Length

Let's count how many words are used in each sentence:

In [None]:
main_dialog_df['Speech'].str.split()

In [None]:
main_dialog_df['word_count'] = [len(words) for words in main_dialog_df['Speech'].str.split()]
main_dialog_df

In [None]:
main_dialog_df.groupby('Subject')['word_count'].mean().sort_values(ascending=False)

In [None]:
main_dialog_df.groupby('Subject')['word_count'].max().sort_values(ascending=False)

In [None]:
main_dialog_df.hist(column='word_count', by='Subject', sharex=True, sharey=True)

Which characters get the most airtime in total?

In [None]:
main_dialog_df.groupby('Subject')['Speech'].count().sort_values(ascending=False)

In [None]:
main_dialog_df.groupby('Subject')['word_count'].sum().sort_values(ascending=False)

---

# Unsupervised learning at the United Nations General Assembly

## Loading and very basic cleaning

In [None]:
united_nations_df = pd.read_csv('un-general-debates-blueprint.csv')

In [None]:
united_nations_df.shape

In [None]:
united_nations_df.head()

In [None]:
united_nations_df.loc[0,'text']

In [None]:
united_nations_df.tail()

In [None]:
# Remove all digits using a regular expression.
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
united_nations_df['text'] = united_nations_df['text'].str.replace(r'\d', '', regex=True)

In [None]:
united_nations_df.loc[0,'text']

In [None]:
# As part of the processing, it's common to exclude common words that are uninformative ("stop words").
# We'll use a predefined list in the nltk package.
# We'll pass this to the next step.
# Keep it as a list: scikit-learn's vectorizers validate their parameters and
# accept 'english', a list, or None for stop_words; a set is rejected.
stop_words = list(stopwords.words('english'))
stop_words

## NMF on word counts

We'll use the `scikit-learn` [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) object to split each speech into words (tokens), remove stop-words, set words to lowercase, and extract the count of each word that appears:

In [None]:
count_text_vectorizer = CountVectorizer(stop_words=stop_words)
count_text_vectors = count_text_vectorizer.fit_transform(united_nations_df['text'])
count_text_vectors.shape

In [None]:
print(count_text_vectors[0,:])

In [None]:
count_text_vectorizer.get_feature_names_out()[0:50]

In [None]:
# Fit an NMF model with 5 topics to vectors of word counts
nmf_text_model = NMF(n_components=5, random_state=42)
W_text_matrix = nmf_text_model.fit_transform(count_text_vectors)
H_text_matrix = nmf_text_model.components_

In [None]:
# Here's a function to display the most important words in each topic:
def display_topics(model, features, no_top_words=10):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object with a `components_` attribute
        Each row of `components_` holds one topic's weight for every word.
    features : sequence of str
        Word for each column of `components_` (e.g. `get_feature_names_out()`).
    no_top_words : int, default 10
        Maximum number of words printed per topic. If the vocabulary is
        smaller than this, all words are printed.

    Each word is printed with its weight as a percentage of the topic's total weight.
    """
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        # Indices sorted by descending weight; slicing (rather than indexing up to
        # no_top_words) avoids an IndexError when there are fewer words than requested.
        largest = word_vector.argsort()[::-1][:no_top_words]
        print("\nTopic %02d" % topic)
        for idx in largest:
            print(" %s (%2.2f)" % (features[idx],
            word_vector[idx]*100.0/total))

In [None]:
# Interpreting topics from the fitted model based on the most important words in each:
display_topics(nmf_text_model, count_text_vectorizer.get_feature_names_out())

The above topics don't look very distinct to my eyes, so this didn't do a great job. We can:
* Improve our processing
* Try a different number of topics
* Try a different machine learning model


Let's work on our processing. Instead of using every word, let's use the [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) object to do some extra processing steps:
* Ignore very rare words that appear in fewer than 5 speeches (`min_df=5`): these introduce noise.
* Ignore very common words that appear in more than 70% of all speeches (`max_df=0.7`): these also introduce noise.

In [None]:
count_text_vectorizer = CountVectorizer(stop_words=stop_words, min_df=5, max_df=0.7)
count_text_vectors = count_text_vectorizer.fit_transform(united_nations_df['text'])
count_text_vectors.shape


In [None]:
# Fit an NMF model with 5 topics to vectors of word counts
nmf_text_model = NMF(n_components=5, random_state=42)
W_text_matrix = nmf_text_model.fit_transform(count_text_vectors)
H_text_matrix = nmf_text_model.components_

In [None]:
# Display topics from the fitted model:
display_topics(nmf_text_model, count_text_vectorizer.get_feature_names_out())

The above topics look more interpretable to me.


## NMF on TF-IDF scores

Let's try out another form of pre-processing. Instead of simple word counts, which are also called term frequencies $TF$, let's use TF-IDF scores.

A Term Frequency-Inverse Document Frequency (TF-IDF) score is another way (besides word count) of measuring the importance of a term $t$ in a document $d$:
$$ \text{TF-IDF}(t,d) = \text{TF}(t,d) * \text{IDF}(t) $$

The `scikit-learn` [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html) object computes the IDF as follows:

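`idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1`, where `n` is the total number of documents and `df(t)` is the number of documents that contain the term `t`. This is the default (`smooth_idf=True`), which is what we use below; with `smooth_idf=False` the formula is `idf(t) = log [ n / df(t) ] + 1`.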

As you can see, this score normalizes the term frequency by how common or rare it is across all documents. The IDF is the logarithmically scaled inverse fraction of the documents that contain the word (obtained by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient).


In [None]:
# Create vectors of TF-IDF scores
tfidf_text_vectorizer = TfidfVectorizer(stop_words=stop_words, min_df=5, max_df=0.7)
tfidf_text_vectors = tfidf_text_vectorizer.fit_transform(united_nations_df['text'])
tfidf_text_vectors.shape

In [None]:
print(tfidf_text_vectors[0,:])

### With 5 topics

In [None]:
# Fit an NMF model with 5 topics to vectors of TF-IDF scores
nmf_text_model = NMF(n_components=5, random_state=42)
W_text_matrix = nmf_text_model.fit_transform(tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

In [None]:
# Display topics from the fitted model:
display_topics(nmf_text_model, tfidf_text_vectorizer.get_feature_names_out())

### Understanding the NMF model

Here's a conceptual illustration of how this works. We began by creating a Document-Term Matrix (DTM), which counts how many times certain words appear in each document. Under the hood, this is a matrix. And matrices can be factorized into products of other matrices, under certain conditions.

NMF stands for Non-Negative Matrix Factorization. The unsupervised machine learning model we used above performs NMF on the DTM to extract a list of topics:

<img src="https://miro.medium.com/v2/resize:fit:1400/1*Cdk8UXkHqkLxfPEFTNEU4A.jpeg" width="75%" />




There are other ways to estimate the topics in text, with their own strengths & weaknesses. The NMF model:
* is relatively simple
* is very fast
* can perform well: an NMF algorithm won the famous [Netflix prize](https://en.wikipedia.org/wiki/Netflix_Prize)



Having said that, probabilistic methods seem to be favored nowadays, especially Latent Dirichlet Allocation (LDA). However, these are too slow to demonstrate in the workshop, so you'll do this as an exercise yourself (later). I will shortly show you another model (SVD), but for now let's explore the NMF results further...


### Using the weights/loadings

In [None]:
W_text_matrix.shape

In [None]:
weights_df = pd.DataFrame(W_text_matrix)
weights_df.columns = [('weight_on_topic_' + str(c)) for c in weights_df.columns.values]
weights_df

In [None]:
united_nations_df.shape

In [None]:
united_nations_df.head()

In [None]:
# Concatenating DataFrames side-by-side/horizontally can be dangerous, but we know that they refer to the same subjects...
year_country_weights_df = pd.concat([
      united_nations_df[['year', 'country_name']]
    , weights_df
], axis=1)

In [None]:
year_country_weights_df.shape

In [None]:
year_country_weights_df.head(10)

In [None]:
# For each year + country, there is 1 speech only.
# .size() counts the rows in each (year, country) group, so the maximum is 1 only
# if no pair occurs twice. (.value_counts() would count identical rows, and two
# speeches with different topic weights would each be counted once.)
year_country_weights_df.groupby(['year', 'country_name']).size().max()

In [None]:
# Summarize at a country level
year_country_weights_df.groupby('country_name').mean()

In [None]:
# Re-running this line from above to remind us what the fitted model topics are:
display_topics(nmf_text_model, tfidf_text_vectorizer.get_feature_names_out())

In [None]:
year_country_weights_df.drop(columns='year').groupby('country_name').mean().sort_values('weight_on_topic_0', ascending=False)

In [None]:
year_country_weights_df.drop(columns='year').groupby('country_name').mean().sort_values('weight_on_topic_1', ascending=False)

In [None]:
year_country_weights_df.drop(columns='year').groupby('country_name').mean().sort_values('weight_on_topic_2', ascending=False)

In [None]:
year_country_weights_df.drop(columns='year').groupby('country_name').mean().sort_values('weight_on_topic_3', ascending=False)

In [None]:
year_country_weights_df.drop(columns='year').groupby('country_name').mean().sort_values('weight_on_topic_4', ascending=False)

In [None]:
year_country_weights_df.drop(columns=['year', 'country_name']).corr()

How shall we interpret the high negative correlations between weights on topic_0 and topic_1?

### With 10 topics

In [None]:
# Fit an NMF model with 10 topics to vectors of TF-IDF scores
nmf_text_model = NMF(n_components=10, random_state=42)
W_text_matrix = nmf_text_model.fit_transform(tfidf_text_vectors)
H_text_matrix = nmf_text_model.components_

In [None]:
# Display topics from the fitted model:
display_topics(nmf_text_model, tfidf_text_vectorizer.get_feature_names_out())

## SVD on TF-IDF scores

In [None]:
svd_text_model = TruncatedSVD(n_components = 5, random_state=42)
W_svd_text_matrix = svd_text_model.fit_transform(tfidf_text_vectors)
H_svd_text_matrix = svd_text_model.components_

In [None]:
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is the
# replacement and is what the other cells in this notebook use.
display_topics(svd_text_model, tfidf_text_vectorizer.get_feature_names_out())

Note the negative values: this means that the words reject the topic. 

In this case, it's harder to interpret the topics because there are so many of these.

### LDA on word counts - try this later

LDA (Latent Dirichlet Allocation) is a popular probabilistic model, and estimating it requires some simulation. Therefore it takes a longer time to estimate. 

I'll leave it for you to **try out later**.

#### Running LDA using the scikit-learn package

Here's the code, commented-out. Simply uncomment and run yourself after class.

In [None]:
#lda_text_model = LatentDirichletAllocation(n_components = 5, random_state=42)
#W_lda_text_matrix = lda_text_model.fit_transform(count_text_vectors)
#H_lda_text_matrix = lda_text_model.components_
    
#display_topics(lda_text_model, count_text_vectorizer.get_feature_names_out())

#### Running LDA using the gensim package

**Quiz for later**: a popular implementation of LDA is in the `gensim` package. As an exercise, import that package yourself, and use it to fit an LDA model to the same United Nations speeches dataset.

In [None]:
# TODO

## More cleaning

Here's a recap of the cleaning we have applied in our analysis above:
* Removing digits
* Splitting into words
* Lowercasing all words
* Filtering out stop-words
* Filtering out very rare terms
* Filtering out very common terms

And we also saw a preprocessing step of scaling to TF-IDF scores.

There may be other forms of cleaning and processing that are appropriate for your specific example.

Two forms of cleaning that are popular are "stemming" and "lemmatization".

In [None]:
# Reload (dirty) United Nations speech data
united_nations_df = pd.read_csv('un-general-debates-blueprint.csv')


In [None]:
text = united_nations_df.loc[0,'text']
text

### Some standard steps

In [None]:
# Make lowercase
text = text.lower()
text

In [None]:
# Remove line breaks, replacing with spaces
text = re.sub(r'\n', ' ', text)
text

In [None]:
# Remove punctuation
translator = str.maketrans('', '', string.punctuation)
text = text.translate(translator)
text


In [None]:
# Split into a list of words at runs of whitespace
text = text.split()
text

In [None]:
# Remove stop words
stop_words = nltk.corpus.stopwords.words("english")
# You could add more:
#stop_words = stop_words + ['hi', 'im']
# Membership tests on a set are constant-time; on a list each test scans the whole list.
stop_word_set = set(stop_words)
text = [word for word in text if word not in stop_word_set]
text

In [None]:
# More sophisticated removal of digits
text = [re.sub(r'\w*\d\w*', '', w) for w in text]
text

In [None]:
# Exclude any empty strings remaining
text = [w for w in text if w != '']
text


### Stemming

Stemmers remove morphological affixes from words, leaving only the word stem.

In [None]:
# Stemmer object to be used below
stemmer = PorterStemmer()

In [None]:
print(stemmer.stem("things"))
print(stemmer.stem("geese"))

print(stemmer.stem("running"))
print(stemmer.stem("ran"))


In [None]:
text_stemmed = [stemmer.stem(w) for w in text]
text_stemmed

### Lemmatization

The goal of lemmatization is to convert a word to its dictionary form ("lemma"), which may involve analyzing the entire sentence with more sophisticated techniques in order to determine the part-of-speech. We'll do something simple:

In [None]:
# Lemmatizer object to be used below
lemmatizer = WordNetLemmatizer()

In [None]:
print(lemmatizer.lemmatize("things"))
print(lemmatizer.lemmatize("geese"))

print(lemmatizer.lemmatize("running"))
print(lemmatizer.lemmatize("ran"))

In [None]:
text_lemmatized = [lemmatizer.lemmatize(w) for w in text]
text_lemmatized

---

# Web-scraping BoxOfficeMojo, with a little help from ChatGPT

What are the top-grossing movies of all time? 

BoxOfficeMojo calculated the numbers, and displays them here: https://www.boxofficemojo.com/chart/top_lifetime_gross/


ChatGPT is often very useful for web-scraping. I asked it to write the first draft of code to scrape the above page:

> Write Python code to extract the tables from this web page: https://www.boxofficemojo.com/chart/top_lifetime_gross/

Here's what ChatGPT replied to me:

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"

# Sending a request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding the table
table = soup.find('table')

# Extracting the table headers
headers = [header.text.strip() for header in table.find_all('th')]

# Extracting the rows
rows = []
for row in table.find_all('tr')[1:]:  # Skipping the header row
    cells = [cell.text.strip() for cell in row.find_all('td')]
    if cells:  # Avoid empty rows
        rows.append(cells)

# Creating a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Displaying the DataFrame
df.head()


The suggested code did a fantastic job! But what's going on? 

Load up https://www.boxofficemojo.com/chart/top_lifetime_gross/ in your favorite browser (Chrome, Safari, ...) Then right-click on the page and View Source. 

Alternatively, if you are running Chrome like I am, copy-paste this into your address bar:

`view-source:https://www.boxofficemojo.com/chart/top_lifetime_gross/`

We'll look at it together and review the HTML. It should look something like this:

In [None]:
response.content

The variable `soup` contains a parsed HTML document. HTML is a hierarchical way of storing data on (every) website, including BoxOfficeMojo:

In [None]:
soup

The Python variable called `table` contains the contents of the one and only HTML `<table>` element on the page. (It's possible to have more than one table per page, so you'd need to extract the one you want).

The hierarchy looks something like this:

* `<table>`
  * `<tr>` (for table rows)
    * `<td>` for cells or `<th>` for header cells
        * Content of the cells, which may itself be yet another HTML tag
   
And there are other HTML elements in there, such as `<a>` for links, which have an `href` "attribute". Other elements can have an "id" attribute, which might be useful for web-parsing too.

In [None]:
table

Let's wrap this up into a function:

In [None]:
def scrape_boxoffice_mojo_table_text(url):
    """Download a page and return the text of its first HTML table as a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has `<td>` cells; the column names are the
        stripped text of the table's `<th>` cells. All values are strings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no `<table>` element.
    """
    # Sending a request to fetch the content of the page
    response = requests.get(url)
    # Fail here with the HTTP status rather than later while parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the (first) table
    table = soup.find('table')
    if table is None:
        raise ValueError("No <table> element found at " + url)

    # Extracting the table headers
    headers = [header.text.strip() for header in table.find_all('th')]

    # Extracting the rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skipping the header row
        cells = [cell.text.strip() for cell in row.find_all('td')]
        if cells:  # Avoid empty rows
            rows.append(cells)

    # Creating a DataFrame
    return pd.DataFrame(rows, columns=headers)

Running the function:

In [None]:
scrape_boxoffice_mojo_table_text("https://www.boxofficemojo.com/chart/top_lifetime_gross/")

That page only shows the first 200 rows. 

The next 200 are displayed on another page:
https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200

So we can run our function there: 


In [None]:
scrape_boxoffice_mojo_table_text("https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200")

There seems to be a pattern:

https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=0

https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=200

https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=400

https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=600

https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=800

Let's put these together in a list comprehension:

In [None]:
# ignore_index=True renumbers the rows 0..N-1; otherwise each page keeps its own
# 0..199 labels and the combined frame has duplicate index values.
domestic_lifetime_gross_df = pd.concat([
    scrape_boxoffice_mojo_table_text("https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=" + str(offset)) 
    for offset in [0, 200, 400, 600, 800]
], ignore_index=True)

In [None]:
len(domestic_lifetime_gross_df)

In [None]:
domestic_lifetime_gross_df

These ranks are since the release date, but we should really normalize by the number of years and see the gross per year.

In [None]:
domestic_lifetime_gross_df.dtypes

In [None]:
domestic_lifetime_gross_df['Year'] = domestic_lifetime_gross_df['Year'].astype(int)
domestic_lifetime_gross_df['Year']

In [None]:
# regex=False makes '$' a literal character whatever the pandas version's default is
# (as a regular expression, '$' would mean "end of string").
domestic_lifetime_gross_df['Lifetime Gross'] = domestic_lifetime_gross_df['Lifetime Gross'].str.replace('$', '', regex=False)
domestic_lifetime_gross_df['Lifetime Gross'] = domestic_lifetime_gross_df['Lifetime Gross'].str.replace(',', '', regex=False)
domestic_lifetime_gross_df['Lifetime Gross']

In [None]:
domestic_lifetime_gross_df['Lifetime Gross'] = pd.to_numeric(domestic_lifetime_gross_df['Lifetime Gross'])
domestic_lifetime_gross_df['Lifetime Gross']

In [None]:
# Use the current calendar year instead of a hard-coded one, and count a film
# released this year as 1 year old: a value of 0 would make the per-year
# division further down produce infinite values.
current_year = pd.Timestamp.now().year
domestic_lifetime_gross_df['years_since_release'] = (current_year - domestic_lifetime_gross_df['Year']).clip(lower=1)
domestic_lifetime_gross_df['years_since_release']

In [None]:
domestic_lifetime_gross_df.plot(x='years_since_release', y='Lifetime Gross', kind='scatter')

In [None]:
domestic_lifetime_gross_df['annualized_lifetime_gross'] = domestic_lifetime_gross_df['Lifetime Gross'] / domestic_lifetime_gross_df['years_since_release']
domestic_lifetime_gross_df['annualized_lifetime_gross']

In [None]:
domestic_lifetime_gross_df.sort_values('annualized_lifetime_gross', ascending=False)

Let's make an important modification to the code that ChatGPT suggested to us:

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL of the page
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"

# Sending a request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding the table
table = soup.find('table')

# Extracting the table headers
headers = [header.text.strip() for header in table.find_all('th')]

# Extracting the rows
rows = []
for row in table.find_all('tr')[1:]:  # Skipping the header row
    cells = []
    for cell in row.find_all('td'):

        # MODIFIED:
        if cell.find('a'):
            # Keeping the <a> tag as a string
            link = cell.find('a')
            cells.append(str(link))
        else:
            cells.append(cell.text.strip())
            
    if cells:  # Avoid empty rows
        rows.append(cells)

# Creating a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Displaying the DataFrame
df.head()


Now, the code checks inside table cells (`<td>...</td>`) and if the cell contains an HTML link (`<a>`) it returns the actual HTML tag instead of the innermost text content.

In [None]:
# URL of the page
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"

# Sending a request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding the table
table = soup.find('table')

# Extracting the table headers
headers = [header.text.strip() for header in table.find_all('th')]

# Extracting the rows
rows = []
for row in table.find_all('tr')[1:]:  # Skipping the header row
    cells = []
    for cell in row.find_all('td'):
        link = cell.find('a')
        if link:
            
            # MODIFIED:
            # Extracting only the href attribute
            cells.append(link.get('href'))
            
        else:
            cells.append(cell.text.strip())
    if cells:  # Avoid empty rows
        rows.append(cells)

# Creating a DataFrame
df = pd.DataFrame(rows, columns=headers)

# Displaying the DataFrame
df.head()

You can see that we have now extracted the contents of the `href` attribute within the `<a>` link tags, where they exist. Otherwise, the text is extracted.

The title identifiers seem to be valuable. If you click on "Star Wars: Episode VII - The Force Awakens" in the first row, it takes you to this page:

https://www.boxofficemojo.com/title/tt2488496/?ref_=bo_cso_table_1

And you can see that we have this data in the first row of our scraped table now:

In [None]:
# Row labels start at 0, so label 0 is the first row of the table
df.loc[0, 'Title']

We could use this to construct a URL ourselves:


In [None]:
# Row labels start at 0, so label 0 is the first row of the table
df.loc[0, 'Title'].split('/')

In [None]:
# Row labels start at 0, so label 0 is the first row of the table
df.loc[0, 'Title'].split('/')[2]

In [None]:
# Row labels start at 0, so label 0 is the first row of the table
'https://www.boxofficemojo.com/title/' + df.loc[0, 'Title'].split('/')[2]

We'll have to change the code to run on the entire column, rather than a single cell:

In [None]:
df['Title'].str.split('/').str[2]

In [None]:
'https://www.boxofficemojo.com/title/' + df['Title'].str.split('/').str[2]

Let's combine this with the previous code block:

In [None]:
# URL of the page
url = "https://www.boxofficemojo.com/chart/top_lifetime_gross/"

# Sending a request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding the table
table = soup.find('table')

# Extracting the table headers
headers = [header.text.strip() for header in table.find_all('th')]

# Extracting the rows
rows = []
for row in table.find_all('tr')[1:]:  # Skipping the header row
    cells = []
    for cell in row.find_all('td'):
        link = cell.find('a')
        if link:
            # Extracting only the href attribute
            cells.append(link.get('href'))
        else:
            cells.append(cell.text.strip())
    if cells:  # Avoid empty rows
        rows.append(cells)

# Creating a DataFrame
df = pd.DataFrame(rows, columns=headers)

# NEW STEPS: 
df['title_id'] = df['Title'].str.split('/').str[2]
df['title_page_url'] = 'https://www.boxofficemojo.com/title/' + df['title_id']

# Displaying the DataFrame
df.head()

Really, all we care about is the `title_id` and `title_page_url`, so just return these. Plus wrap it all up into a function:

In [None]:
def scrape_title_ids(url):
    """Scrape the title identifiers from the first table on a BoxOfficeMojo chart page.

    For every table cell that contains a link, the link's `href` is kept instead
    of the cell text. The identifier is the third '/'-separated piece of the
    `Title` column's href.

    Parameters
    ----------
    url : str
        Address of the chart page to scrape.

    Returns
    -------
    pandas.DataFrame
        Two columns: `title_id` and `title_page_url`, one row per table row.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page contains no `<table>` element.
    """
    # Sending a request to fetch the content of the page
    response = requests.get(url)
    # Fail here with the HTTP status rather than later while parsing an error page
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the (first) table
    table = soup.find('table')
    if table is None:
        raise ValueError("No <table> element found at " + url)

    # Extracting the table headers
    headers = [header.text.strip() for header in table.find_all('th')]

    # Extracting the rows
    rows = []
    for row in table.find_all('tr')[1:]:  # Skipping the header row
        cells = []
        for cell in row.find_all('td'):
            link = cell.find('a')
            if link:
                # Extracting only the href attribute
                cells.append(link.get('href'))
            else:
                cells.append(cell.text.strip())
        if cells:  # Avoid empty rows
            rows.append(cells)

    # Creating a DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Extract title information...
    df['title_id'] = df['Title'].str.split('/').str[2]
    df['title_page_url'] = 'https://www.boxofficemojo.com/title/' + df['title_id']

    # ... and drop everything else:
    return df[['title_id', 'title_page_url']]


Now let's run it on the entire 1000 movie list, spread over 5 pages:

In [None]:
# ignore_index=True renumbers the rows 0..N-1; otherwise each page keeps its own
# 0..199 labels and the combined frame has duplicate index values.
domestic_title_identifiers_df = pd.concat([
    scrape_title_ids("https://www.boxofficemojo.com/chart/top_lifetime_gross/?offset=" + str(offset)) 
    for offset in [0, 200, 400, 600, 800]
], ignore_index=True)

In [None]:
domestic_title_identifiers_df

We have 1000 title IDs now, in the same order as the original data frame.

So what? Well, we can use identifiers such as these to extract more information, including linking to other pages.

Remember, we already have code to scrape the first table that appears on a webpage. Let's run it on the first link in our table (https://www.boxofficemojo.com/title/tt4154796):

In [None]:
scrape_boxoffice_mojo_table_text('https://www.boxofficemojo.com/title/tt4154796')

So it pulls out some useful information. But there's other information on different parts of that page.

Let's focus on the "MPAA", which in this case is "PG-13". It's not technically part of a table (although it looks like that to our eyes). Let's ask ChatGPT for help:

> Extract the value associated with the "MPAA" on the website https://www.boxofficemojo.com/title/tt4154796/

This is the code ChatGPT suggested:

In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the page
url = "https://www.boxofficemojo.com/title/tt4154796/"

# Sending a request to fetch the content of the page
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Finding the MPAA rating
mpaa_label = soup.find(text="MPAA")
if mpaa_label:
    mpaa_value = mpaa_label.find_next('span').text.strip()
    print(f"MPAA Rating: {mpaa_value}")
else:
    print("MPAA Rating not found")


That's very useful! Let's simplify a little, and update the function call based on the warning above:

In [None]:
url = "https://www.boxofficemojo.com/title/tt4154796/"

response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

mpaa_label = soup.find(string="MPAA")
if mpaa_label:
    mpaa_value = mpaa_label.find_next('span').text.strip()
else:
    mpaa_value = None
    
mpaa_value

Let's wrap this up into a function:

In [None]:
def get_mpaa_rating(title_page_url):
    """Return the MPAA rating shown on a BoxOfficeMojo title page.

    The rating is taken from the first `<span>` that follows the text "MPAA".

    Parameters
    ----------
    title_page_url : str
        Address of the title page.

    Returns
    -------
    str or None
        The stripped rating text, or None when the page has no "MPAA" label
        or no `<span>` after it.
    """
    response = requests.get(title_page_url)
    soup = BeautifulSoup(response.content, 'html.parser')

    mpaa_label = soup.find(string="MPAA")
    if mpaa_label is None:
        return None

    # find_next returns None when nothing matches; guard before reading .text
    value_span = mpaa_label.find_next('span')
    if value_span is None:
        return None
    return value_span.text.strip()

In [None]:
get_mpaa_rating("https://www.boxofficemojo.com/title/tt4154796/")

In [None]:
domestic_title_identifiers_df

In [None]:
domestic_title_identifiers_df['title_page_url'].values[0:20]

In [None]:
top20_mpaa_ratings = [ get_mpaa_rating(title_page_url) for title_page_url in domestic_title_identifiers_df['title_page_url'].values[0:20] ]
top20_mpaa_ratings


In [None]:
# .copy() makes an independent frame, so adding a column neither triggers
# SettingWithCopyWarning nor risks touching domestic_lifetime_gross_df.
top20_domestic_lifetime_gross_df = domestic_lifetime_gross_df.head(20).copy()
top20_domestic_lifetime_gross_df['mpaa_rating'] = top20_mpaa_ratings
top20_domestic_lifetime_gross_df

## Quiz, for later

* Repeat what I have done for another characteristic of each movie, such as the `Budget` characteristic.
* Modify your code (and mine) to extract the MPAA rating and the Budget at the same time, i.e. from the same `soup` variable. This avoids having to download and parse each title page once per attribute. It's best to return them as a dataframe.
* If you have time, run this for every one of the 1000 movies in the original `domestic_lifetime_gross_df` dataframe. 
* Combine all these 1000 results into a single dataframe.
* Finally, combine your 1000-row dataframe with the original 1000-row `domestic_lifetime_gross_df` dataframe. 



In [None]:
# TODO