###### Importing libraries

In [1]:
import os
import re
import shutil
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display

###### Extracting all actor links from Wikipedia

In [None]:
# Wikipedia list pages to crawl. The position of a page in this list is stored
# with every link found on it (0 = actors page, 1 = actresses page).
urls = ["https://en.wikipedia.org/wiki/List_of_Indian_film_actors", "https://en.wikipedia.org/wiki/List_of_Indian_film_actresses"]

base = "https://en.wikipedia.org"
actor_urls = []  # list of (absolute_url, page_index) tuples

for i, url in enumerate(urls):
    source = requests.get(url).text
    soup = BeautifulSoup(source, 'lxml')

    divs = soup.find_all('div', class_="div-col columns column-width")
    for div in divs:
        # The original passed recursive="False" -- a non-empty string, hence
        # truthy -- so the search has always been recursive. find_all() makes
        # that explicit, and href=True skips anchors that carry no href
        # attribute (indexing those with ['href'] would raise KeyError).
        for link in div.find_all("a", href=True):
            actor_urls.append((base + link['href'], i))

print("Total {} actors got.".format(len(actor_urls)))
display(actor_urls[:5])

###### Download images from Google

In [None]:
search_url = "https://www.google.com/search?hl=en&site=imghp&tbm=isch&tbs=isz:l&q="

def google_image(name, fn):
    """Save the first Google Images thumbnail found for ``name``.

    The result page is scanned for the first URL starting with
    ``http://t1.gstatic.com``; that URL is downloaded to ``./Images/<fn>.jpg``.

    Parameters
    ----------
    name : str
        Free-text search query.
    fn : int or str
        Base name (without extension) of the output file.

    Returns
    -------
    None
        If no thumbnail URL is present in the page, a notice is printed and
        no file is written.
    """
    # One request is enough: the original fetched the page twice and threw
    # the first response away. quote_plus keeps characters such as '&', '#'
    # or newlines in ``name`` from truncating or corrupting the query string.
    page = requests.get(search_url + quote_plus(name)).text

    start = page.find("http://t1.gstatic.com")
    if start == -1:
        # str.find() signals "not found" with -1; carrying on would build a
        # garbage URL from the tail/head of the page and crash in requests.
        print("No thumbnail found for {!r}; skipping image {}.".format(name, fn))
        return

    # The URL runs up to the closing double quote, capped at 1000 characters.
    # Slicing (unlike per-character indexing) is safe near the end of the page.
    end = page.find('"', start, start + 1000)
    if end == -1:
        end = start + 1000
    result_url = page[start:end]

    # Context managers close both the HTTP connection and the output file,
    # even when the copy fails part-way.
    with requests.get(result_url, stream=True) as response:
        with open("./Images/{}.jpg".format(fn), 'wb') as file:
            # Have urllib3 undo any gzip/deflate transfer encoding.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, file)

###### Download images from Wikipedia only

In [None]:
def download_image(url, fn):
    """Stream the resource at ``url`` into ``./Images/<fn>.jpg``.

    Parameters
    ----------
    url : str
        Direct URL of the image to fetch.
    fn : int or str
        Base name (without extension) of the output file.

    Notes
    -----
    The HTTP status is not checked; whatever body the server returns is
    written to the file.
    """
    # Context managers guarantee that the connection and the file handle are
    # released; the original left the file open (and possibly unflushed).
    with requests.get(url, stream=True) as response:
        with open("./Images/{}.jpg".format(fn), 'wb') as file:
            # Have urllib3 undo any gzip/deflate transfer encoding.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, file)

###### Get actor age

In [None]:
def get_age(query):
    """Look up ``"<query> age"`` on Google and return the answer snippet.

    Parameters
    ----------
    query : str
        Name to search for.

    Returns
    -------
    str
        Text of the first ``div.BNeawe`` nested inside a ``div.BNeawe`` when
        it is shorter than 50 characters; ``"Unavailable"`` when the request
        does not return HTTP 200, the element is missing, or the text is
        50 characters or longer.
    """
    query += " age"
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"

    # NOTE(review): "User.Agent" is not the standard "User-Agent" header name,
    # so requests still sends its default User-Agent. The selectors below were
    # presumably tuned against that response -- confirm before renaming.
    headers = {"User.Agent": "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"}
    response = requests.get(URL, headers=headers)

    if response.status_code != 200:
        return "Unavailable"

    soup = BeautifulSoup(response.text, "lxml")

    # find() returns None when nothing matches; chaining .find() directly on
    # the result (as the original did) raised AttributeError in that case.
    outer = soup.find("div", class_="BNeawe")
    div = outer.find("div", class_="BNeawe") if outer is not None else None

    if div is not None and len(div.text) < 50:
        return div.text
    return "Unavailable"

###### Get actor heights

In [None]:
def get_height(query):
    """Look up ``"<query> height"`` on Google and return the answer snippet.

    Three page layouts are tried in order:

    1. a ``div`` with class ``"BNeawe iBp4i AP7Wnd"``;
    2. a ``table`` with class ``"LnMnt"`` containing a row whose first cell
       reads ``HEIGHT``;
    3. a ``div.BNeawe`` nested inside a ``div.BNeawe``.

    Parameters
    ----------
    query : str
        Name to search for.

    Returns
    -------
    str
        The extracted text, or ``"Unavailable"`` when the request does not
        return HTTP 200 or none of the layouts yields a value.
    """
    query += " height"
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"

    # NOTE(review): "User.Agent" is not the standard "User-Agent" header name,
    # so requests still sends its default User-Agent. The selectors below were
    # presumably tuned against that response -- confirm before renaming.
    headers = {"User.Agent": "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"}
    response = requests.get(URL, headers=headers)

    if response.status_code != 200:
        return "Unavailable"

    soup = BeautifulSoup(response.text, "lxml")

    # Layout 1: dedicated answer box.
    div = soup.find('div', class_="BNeawe iBp4i AP7Wnd")
    if div is not None:
        txt = div.text
        # Longer snippets are cut at the first '.'.
        return txt if len(txt) < 10 else txt.split('.')[0]

    # Layout 2: key/value table.
    table = soup.find("table", class_="LnMnt")
    if table is not None:
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            # Rows with fewer than two cells cannot hold a label/value pair.
            if len(tds) >= 2 and tds[0].text == 'HEIGHT':
                parts = tds[1].text.split('-')
                if len(parts) > 2:
                    return parts[2][1:7]
                break
        # The original fell off the end here and returned None when the table
        # had no usable HEIGHT row; report it like every other miss instead.
        return "Unavailable"

    # Layout 3: generic nested snippet. Guard the outer lookup, because
    # chaining .find() on a None result raised AttributeError.
    outer = soup.find('div', class_="BNeawe")
    div = outer.find('div', class_="BNeawe") if outer is not None else None
    if div is None:
        return "Unavailable"

    txt = div.text
    return txt.split(".")[0] if len(txt) > 10 else txt

###### Check if actor is dead or alive

In [None]:
def is_alive(query):
    """Search Google for ``"is <query> alive"`` and guess the answer.

    Parameters
    ----------
    query : str
        Name to search for.

    Returns
    -------
    str
        ``"No"`` when the result page contains a text node that is exactly
        ``"Deceased"`` or ``"Cause of death"``; ``"Yes"`` otherwise;
        ``"Unavailable"`` when the request does not return HTTP 200.
    """
    query = "is "+ query + " alive"
    query = query.replace(' ', '+')
    URL = f"https://google.com/search?q={query}"

    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 7.0; SM-G930V Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.125 Mobile Safari/537.36"}
    response = requests.get(URL, headers=headers)

    if response.status_code != 200:
        return "Unavailable"

    # Parse once; the original built the same soup a second time right after
    # the status check.
    soup = BeautifulSoup(response.text, "lxml")

    # `string=` is the current name of the argument formerly spelled `text=`.
    deceased = soup.find(string="Deceased")
    cause = soup.find(string="Cause of death")

    return "No" if (deceased or cause) else "Yes"

#is_alive("Mahesh Babu")

###### Get all information about each actor

In [None]:
# Column-oriented accumulator: every key must receive exactly one value per
# actor, otherwise pd.DataFrame(data) fails with unequal column lengths.
data = {'name':[], 'image':[], 'gender':[], 'age':[], 'height':[], 'is_alive':[], 'description':[]}

# download_image/google_image write into ./Images; create it up front so a
# fresh checkout survives Restart & Run All.
os.makedirs("./Images", exist_ok=True)

for i, url in enumerate(actor_urls, 1):
    actor_url, gender = url

    source = requests.get(actor_url).text
    soup = BeautifulSoup(source, 'lxml')

    name = soup.find('h1', id="firstHeading").text

    # Description = first paragraph longer than 3 characters, with bracketed
    # reference markers such as "[1]" removed. Default to "NaN" so that a page
    # without any such paragraph still contributes one entry (the original
    # appended nothing in that case and desynchronised the columns).
    description = "NaN"
    content = soup.find('div', class_="mw-parser-output")
    if content is not None:
        for para in content.find_all('p'):
            txt = para.text
            if len(txt) > 3:
                description = re.sub(r'\[.*?\]', '', txt).strip()
                break

    # Prefer the infobox picture; fall back to a Google image search.
    box = soup.find('table', class_="infobox")
    if box is not None:
        img = box.find("a", class_="image")
        if img is not None:
            img = "https:"+img.find_all("img")[0]['src']
            download_image(img, i)
        else:
            # Refine the search with the text of the first infobox row when
            # there is one (find() returns None for an infobox without rows).
            first_row = box.find('tr')
            if first_row is not None:
                google_image(name+' '+first_row.text, i)
            else:
                google_image(name, i)
    else:
        google_image(name, i)

    age = get_age(name)
    height = get_height(name)
    alive = is_alive(name)

    data["name"].append(name)
    data["image"].append(str(i)+'.jpg')
    data["age"].append(age)
    data["height"].append(height)
    data["is_alive"].append(alive)
    # gender is the index of the list page the link came from (0 or 1).
    data["gender"].append("Male" if gender == 0 else "Female")
    data["description"].append(description)

    print("{}/{} Actors downloaded. [{}, {}, {}]".format(i, len(actor_urls), name, age, height))

df = pd.DataFrame(data)
df.to_csv('./data.csv', encoding='utf-8', index=False)

###### Check the data downloaded

In [2]:
# Reload the scraped table from disk so the cells below can run without
# repeating the scrape, then preview it and report the row count.
df = pd.read_csv('./data.csv')
display(df.head())
print("Total actors got - {}".format(len(df)))

Unnamed: 0,name,image,gender,age,height,is_alive,description
0,A. K. Hangal,1.jpg,Male,98 years,1.63 m,No,Avtar Kishan Hangal (1 February 1914 – 26 Augu...
1,Aadhi Pinisetty,2.jpg,Male,37 years,1.72 m,Yes,Aadhi Pinisetty is an Indian film actor who ap...
2,Aadi (actor),3.jpg,Male,30 years,1.68 m,Yes,Aadi Saikumar (born Aditya Pudipeddi) is an In...
3,Aamir Khan,4.jpg,Male,55 years,1.63 m,Yes,Mohammed Aamir Hussain Khan (pronounced ; born...
4,Ashish Chaudhary,5.jpg,Male,41 years,1.78 m,Yes,Ashish Chaudhary (born 21 July 1978) is a Bol...


Total actors got - 1732


###### Some simple analysis

In [3]:
# Count missing values. Comparing and summing yields 0 when nothing is
# "Unavailable"; value_counts()['Unavailable'] raised KeyError in that case.
ages = (df['age'] == 'Unavailable').sum()
heights = (df['height'] == 'Unavailable').sum()
des = df['description'].isnull().sum()

# Boolean masks reused for the gender / alive breakdown below.
is_male = df['gender'] == "Male"
is_female = df['gender'] == "Female"
alive = df['is_alive'] == 'Yes'

print("Total actors: {}".format(df.shape[0]))
male, female = is_male.sum(), is_female.sum()
print("Total male actors: {}".format(male))
print("Total female actors: {}\n".format(female))
print("Actors alive : {} ({}/{} Male, {}/{}, Female)\n".format(alive.sum(),
                                                            (alive & is_male).sum(),
                                                            male,
                                                            (alive & is_female).sum(),
                                                            female))
print('Number of ages unavailable: {}.\nNumber of heights unavailable: {}.\nNumber of descriptions unavailable: {}'.format(ages, heights, des))

Total actors: 1732
Total male actors: 805
Total female actors: 927

Actors alive : 1472 (631/805 Male, 841/927, Female)

Number of ages unavailable: 102.
Number of heights unavailable: 186.
Number of descriptions unavailable: 2
