In [None]:
import re
from collections import Counter
from datetime import datetime
from urllib.parse import urljoin
from urllib.request import urlopen

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from pandas import ExcelWriter

In [None]:
# Create the list of films Gal Gadot participated in, scraped from Wikipedia.
# The table presents the following fields per film: year, title, role and director(s).
# As a by-product, `movies` collects the href of each film's own article
# (taken from the first of the first two cells of a row that contains a link).
# using functions find, find_all and get_text


def _expand_rowspans(rows_of_cells):
    """Return the text of a table as a list of rows, repeating row-spanning cells.

    rows_of_cells is a list of rows, each a list of <td> tags. A cell carrying
    rowspan=N is physically present only in its first row; its text is copied
    into the same *logical* column of the N-1 following rows. Tracking the
    logical column (rather than the cell's position among the <td> tags of its
    own row) keeps the columns aligned when a row that starts a new span is
    itself missing cells because of an earlier span.
    """
    grid = []
    carried = {}  # logical column -> (text, number of further rows to fill)

    def take_carried(column):
        # Consume one row of a pending span and return its text.
        text, rows_left = carried.pop(column)
        if rows_left > 1:
            carried[column] = (text, rows_left - 1)
        return text

    for cells in rows_of_cells:
        physical_cells = iter(cells)
        row_text = []
        column = 0
        while True:
            if column in carried:
                row_text.append(take_carried(column))
            else:
                cell = next(physical_cells, None)
                if cell is None:
                    break
                text = cell.get_text()
                span = int(cell.get("rowspan", 1))
                if span > 1:
                    carried[column] = (text, span - 1)
                row_text.append(text)
            column += 1
        # Spans located past the last physical cell of a short row are appended
        # in column order so they are still consumed for this row.
        for late_column in sorted(carried):
            if late_column > column:
                row_text.append(take_carried(late_column))
        grid.append(row_text)
    return grid


wiki = "https://en.wikipedia.org/wiki/Gal_Gadot"
with urlopen(wiki) as page:  # the response is closed as soon as it has been parsed
    soup = BeautifulSoup(page, "lxml")

filmography = soup.find('table', class_='wikitable sortable')
if filmography is None:
    raise ValueError("no table with class 'wikitable sortable' found in " + wiki)

table_rows = filmography.find_all('tr')
first = table_rows[0]
allRows = table_rows[1:]

headers = [header.get_text() for header in first.find_all('th')]
cells_per_row = [row.find_all('td') for row in allRows]
results = _expand_rowspans(cells_per_row)

# The film link sits in the first cell when the year cell is covered by a
# rowspan from an earlier row, otherwise in the second cell.
movies = []
for cells in cells_per_row:
    for cell in cells[:2]:
        link = cell.find('a', href=True)
        if link is not None:
            movies.append(link['href'])
            break

# create dataframe from lists
# (header texts keep their trailing newline, hence the 'Notes\n' column name;
# errors='ignore' keeps the cell working if the table has no such column)
df_films_Gal_Gadot = (
    pd.DataFrame(data=results, columns=headers)
    .drop(columns=['Notes\n'], errors='ignore')
    .replace('\n', '', regex=True)
)
df_films_Gal_Gadot

In [None]:
# trying to find the year of birth by search span tag with class 'bday'
# or search td tag after the th tag with text 'Born'
# return unknown otherwise
# using functions find, findall, findNext and get_text
def find_year_of_birth(page_actor):
    """Return the actor's year of birth as an int, or the string "Unknown".

    page_actor is the parsed actor page (an object offering BeautifulSoup's
    ``find``). Two sources are tried in order:

    1. the text of the ``<span class="bday">`` element, parsed as YYYY-MM-DD;
    2. the ``<td>`` following the ``<th scope="row">`` whose text matches
       "Born"; the largest number found in that cell is returned.
    """
    if page_actor is None:
        return "Unknown"

    bday = page_actor.find('span', {'class': 'bday'})
    if bday is not None:
        try:
            return datetime.strptime(bday.get_text(), '%Y-%m-%d').year
        except ValueError:
            # Not a full ISO date: fall through to the "Born" row.
            pass

    born_header = page_actor.find('th', {'scope': 'row'}, text=re.compile("Born"))
    born_cell = born_header.findNext('td') if born_header is not None else None
    if born_cell is not None:
        numbers = re.findall(r'\d+', born_cell.get_text())
        if numbers:
            return max(map(int, numbers))

    return "Unknown"

# trying to find the place of birth by search span tag with class 'birthplace' split by ','
# or search a tag after span tag with class 'noprint ForceAgeToShow'
# return unknown otherwise
# using functions find, find_next_siblings and get_text
def find_place_of_birth(page_actor):
    """Return the actor's country of birth as a string, or "Unknown".

    page_actor is the parsed actor page (an object offering BeautifulSoup's
    ``find``). Two sources are tried in order:

    1. the last comma-separated part of the ``<span class="birthplace">`` text;
    2. the text of the second ``<a>`` sibling following the
       ``<span class="noprint ForceAgeToShow">`` element.

    Surrounding whitespace is stripped, so "Petah Tikva, Israel" yields
    "Israel" rather than " Israel".
    """
    if page_actor is None:
        return "Unknown"

    birthplace = page_actor.find('span', {'class': 'birthplace'})
    if birthplace is not None:
        return birthplace.get_text().split(',')[-1].strip()

    age_marker = page_actor.find('span', {'class': 'noprint ForceAgeToShow'})
    if age_marker is not None:
        following_links = age_marker.find_next_siblings('a')
        if len(following_links) > 1:
            return following_links[1].get_text().strip()

    return "Unknown"

# trying to find the number of awards by search td tag with class 'yes table-yes2' after table after span tag with id 'Awards_and_nominations'
# or search td tag with class 'yes table-yes2' after the second table after span tag with id 'Awards_and_nominations'
# or search the all awards of actor in the page 'list of awards and nominations' td tag with class 'yes table-yes2' and text 'Won'
# return unknown otherwise
# using functions find, find_all, findAll and findNext
def find_number_of_awards(page_actor):
    """Return the number of awards won by the actor, or the string "Unknown".

    page_actor is the parsed actor page (an object offering BeautifulSoup's
    ``find``). Up to three counts of ``<td class="yes table-yes2">`` cells are
    gathered and the largest one is returned:

    1. in the first table after the span with id 'Awards_and_nominations';
    2. in the table following that first table;
    3. on the separate "List of awards and nominations received by ..." page
       (only cells whose text matches "Won"), when the actor page links to it.

    "Unknown" is returned only when none of the three sources is available;
    a source that exists but holds no such cell contributes a count of 0.
    """
    if page_actor is None:
        return "Unknown"

    won_cell = {'class': 'yes table-yes2'}
    counts = []

    heading = page_actor.find('span', {'id': 'Awards_and_nominations'})
    first_table = heading.findNext('table') if heading is not None else None
    if first_table is not None:
        counts.append(len(first_table.find_all('td', won_cell)))
        second_table = first_table.findNext('table')
        if second_table is not None:
            counts.append(len(second_table.find_all('td', won_cell)))

    list_link = page_actor.find('a', {'title': re.compile('List of awards and nominations received by')})
    if list_link is not None:
        try:
            # urljoin copes with the leading '/' of Wikipedia hrefs instead of
            # producing 'https://en.wikipedia.org//wiki/...'.
            awards_url = urljoin("https://en.wikipedia.org/", list_link['href'])
            with urlopen(awards_url) as response:
                page_awards = BeautifulSoup(response, "lxml")
            counts.append(len(page_awards.findAll('td', won_cell, text=re.compile("Won"))))
        except Exception:
            # Best effort: an unreachable or malformed awards page must not
            # discard the counts already taken from the actor page.
            pass

    return max(counts) if counts else "Unknown"
        
# list of all the actors who played with Gal Gadot on the same movie (any movie).
# the table presents the following fields per co-actor\actress: name, year of birth,
# country of birth and number of awards that he\she got.
# using functions find, find_all and findNext
WIKIPEDIA_ROOT = "https://en.wikipedia.org/"
# When the first cast list holds a single entry, this many further <ul> lists
# are merged in as well (the same number of lists the cell always looked at).
MAX_EXTRA_CAST_LISTS = 12


def _find_cast_items(film_page):
    """Return the <li> entries of a film article's cast list (possibly empty).

    The first <ul> after the span with id 'Cast' is used; if the article has
    no such section, the span with id 'Voice_cast' is tried instead. When the
    'Cast' list holds exactly one entry, the entries of up to
    MAX_EXTRA_CAST_LISTS following <ul> lists are added, stopping quietly when
    the page runs out of lists.
    """
    for section_id in ('Cast', 'Voice_cast'):
        heading = film_page.find('span', {'id': section_id})
        cast_list = heading.findNext('ul') if heading is not None else None
        if cast_list is None:
            continue
        items = list(cast_list.find_all('li'))
        if section_id == 'Cast' and len(items) == 1:
            extra_list = cast_list
            for _ in range(MAX_EXTRA_CAST_LISTS):
                extra_list = extra_list.findNext('ul')
                if extra_list is None:
                    break
                items.extend(extra_list.find_all('li'))
        return items
    return []


actors = []
birth_years = []
birth_places = []
numbers_of_awards = []
actors_counter = Counter()
skipped_actors = []  # names whose article could not be downloaded or parsed
known_actors = set()

for url in movies:
    with urlopen(urljoin(WIKIPEDIA_ROOT, url)) as response:
        page = BeautifulSoup(response, "lxml")
    cast = _find_cast_items(page)

    credited_in_film = set()  # an actor counts once per film, however often listed
    for actor in cast:
        link = actor.find('a', href=True)
        name = link.get('title') if link is not None else None
        if not name or name == "Gal Gadot" or name in credited_in_film:
            continue
        credited_in_film.add(name)

        if name in known_actors:
            # Details were scraped for an earlier film; only the count changes.
            actors_counter[name] += 1
            continue

        try:
            with urlopen(urljoin(WIKIPEDIA_ROOT, link['href'])) as response:
                page_actor = BeautifulSoup(response, "lxml")
        except Exception:
            # Best effort: e.g. links to articles that do not exist.
            skipped_actors.append(name)
            continue

        # Compute every field before appending so the four lists stay aligned.
        birth_year = find_year_of_birth(page_actor)
        birth_place = find_place_of_birth(page_actor)
        awards = find_number_of_awards(page_actor)

        known_actors.add(name)
        actors_counter[name] += 1
        actors.append(name)
        birth_years.append(birth_year)
        birth_places.append(birth_place)
        numbers_of_awards.append(awards)

# create dataframe from lists
df_actors = pd.DataFrame({
    'Name': actors,
    'Year of Birth': birth_years,
    'Country of Birth': birth_places,
    'Number of Awards': numbers_of_awards,
})
df_actors

In [None]:
# table which presents the number of joint movies for each co-actor\actress with Gal Gadot
# create dataframe from counter
# Built from the (name, count) pairs so that both columns exist even when the
# counter is empty.
df_joint_movies = pd.DataFrame(
    list(actors_counter.items()),
    columns=['Name', 'Number of Joint Movies'],
)
df_joint_movies

In [None]:
# histogram which presents the distribution of joint movies
# (number of co-actors per number of joint movies)
# create histogram from counter
data = pd.Series(actors_counter, dtype='int64')
# index: number of joint movies, values: how many co-actors share that number
joint_movie_distribution = data.value_counts().sort_index()

plt.bar(joint_movie_distribution.index, joint_movie_distribution.values)
if not joint_movie_distribution.empty:
    # one tick per whole number of movies; skipped when there is nothing to
    # plot because min()/max() are undefined for an empty distribution
    plt.xticks(np.arange(joint_movie_distribution.index.min(),
                         joint_movie_distribution.index.max() + 1, 1.0))
plt.xlabel('number of joint movies')
plt.ylabel('number of co-actors')
plt.title('Distribution of Joint Movies')
plt.grid(True)
plt.show()