In [None]:
#This notebook analyzes the movie ratings that my friends and I have given on the Letterboxd app.
#Each exported file contains one person's movie ratings plus a link to each movie's landing page.
#I then scrape the cast list from each movie's page and give every cast member the rating the movie received.
#The goal of this fun exercise is to find which actors we 'like' best in aggregate, measured by the average rating of the movies they appear in.

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
#load each person's Letterboxd export and tag every row with the critic it belongs to
cam = pd.read_csv('ratings.csv').assign(Critic='Cam')
emmett = pd.read_csv('EmmettLB.csv').assign(Critic='Emmett')
sam = pd.read_csv('SamLB.csv').assign(Critic='Sam')

#preview my own ratings
cam.head()

In [None]:
#stack everyone's ratings into one frame and normalise each rating to two decimals
df = pd.concat((cam, emmett, sam))
df['Rating'] = df['Rating'].map(lambda value: float(format(value, '.2f')))

#mean rating per critic, shown to two decimals
(
    df.groupby('Critic')['Rating']
    .mean()
    .map(lambda value: float(format(value, '.2f')))
    .reset_index()
)

In [None]:
#show how each critic's average rating compares to 3 (designated midpoint rating)
#'Away' is the ratio mean/midpoint: above 1 means the critic rates above the midpoint on average, below 1 means below it
MIDPOINT_RATING = 3
df1 = df.groupby('Critic')['Rating'].mean().reset_index()

#keep the ratio at full precision: it is used as the divisor when ratings are adjusted,
#so rounding it here would carry a rounding error into every adjusted rating
df1['Away'] = df1['Rating'] / MIDPOINT_RATING
df1 = df1[['Critic', 'Away']]

#round for display only; df1 itself keeps the exact values
df1.round(2).head()

In [None]:
#adjust ratings: divide each rating by its critic's 'Away' factor,
#then keep only the movie columns with the adjusted value stored as 'Rating'
df = (
    df.merge(df1, how='left', on='Critic')
    .assign(**{'New Rating': lambda frame: frame['Rating'] / frame['Away']})
    .loc[:, ['Date', 'Name', 'Year', 'Letterboxd URI', 'New Rating']]
    .rename(columns={'New Rating': 'Rating'})
)

In [None]:
#group movies by mean adjusted rating
df=df.groupby(['Name','Letterboxd URI'])['Rating'].mean().reset_index()

#build master data frame: one row per (movie, cast member), carrying that movie's mean rating
columns = ['Movie', 'Actor', 'Rating']
REQUEST_TIMEOUT = 30 #seconds; keeps one unresponsive page from hanging the whole run
records = [] #(movie, actor, rating) tuples collected across all movies
skipped = [] #(movie, reason) for pages that could not be fetched or had no cast list

#pull cast list from each movie, and assign scores
for movie, uri, rating in zip(df['Name'], df['Letterboxd URI'], df['Rating']):

    # fetch the movie page; a failed request skips this movie instead of aborting the run
    try:
        r = requests.get(uri, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException as err:
        skipped.append((movie, str(err)))
        continue

    # each cast member is an <a> tag inside the cast-list container
    soup = bs(r.content, 'lxml')
    cast_soup = soup.select_one('[class="cast-list text-sluglist"]')
    if cast_soup is None:
        # no cast list on this page: skip it explicitly, otherwise the previous
        # movie's cast would be credited with this movie's rating
        skipped.append((movie, 'no cast list found'))
        continue

    records.extend(
        (movie, a.get_text().strip(), rating) for a in cast_soup.find_all("a")
    )

#build the frame once from the collected rows; this avoids re-copying the frame on
#every iteration and gives 'Rating' a numeric dtype
new_df = pd.DataFrame(records, columns=columns)

if skipped:
    print(f"Skipped {len(skipped)} movie(s): {[name for name, _ in skipped]}")

new_df #each cast member from each movie in list with their rating for that movie

In [None]:
#group by actor, show mean rating, list of movies they are in, and number of movies
MIN_MOVIE_COUNT = 10 #only rank actors who appear in more than this many rated movies

#a dict can hold only one aggregation per column, so 'Movie' is aggregated once (unique)
#and the movie count is derived from the length of that unique list below
master = (
    new_df.astype({'Rating': 'float64'}) #make sure the mean is taken over numbers, not objects
    .groupby('Actor')
    .agg({'Rating': 'mean', 'Movie': 'unique'})
    .reset_index()
)
master['Rating'] = master['Rating'].round(2)
master['Movie Count'] = master['Movie'].str.len() #number of distinct movies per actor

#various queries
master[master['Movie Count'] > MIN_MOVIE_COUNT].sort_values(by='Rating', ascending=False).head() #willem dafoe is the winner
#master.sort_values(by='Rating', ascending=False).head(25)
#master.sort_values(by='Rating', ascending=True).head(25)
#master[master['Actor'].str.startswith('Brad Pitt')]