In [1]:
import requests
import re
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import pandas as pd
import json
import project_utils as pu

In [2]:
# Load the celebrity list; later cells iterate over its `Celebrity` column.
data = pd.read_csv(filepath_or_buffer='celebrities.csv')

data

Unnamed: 0,Celebrity
0,Cristiano Ronaldo
1,Ariana Grande
2,Dwayne Johnson
3,Selena Gomez
4,Kylie Jenner
5,Lionel Messi
6,Beyoncé
7,Neymar
8,Justin Bieber
9,Taylor Swift


Collecting the HTMLs for each social media search, which we'll use to find each social media URL.

In [3]:
# Map each celebrity name to whatever pu.get_social_media_search_html_list
# returns for it (displayed in the next cell as a list of responses).
social_media_search_html_dict = {
    celeb_name: pu.get_social_media_search_html_list(celeb_name)
    for celeb_name in data['Celebrity']
}

Checking the outputs: a `<Response [200]>` means the request succeeded, so we're good to go.

In [4]:
# Display the collected search responses per celebrity so their status codes can be inspected.
social_media_search_html_dict

{'Cristiano Ronaldo': [<Response [200]>, <Response [200]>],
 'Ariana Grande': [<Response [200]>, <Response [200]>],
 'Dwayne Johnson': [<Response [200]>, <Response [200]>],
 'Selena Gomez': [<Response [200]>, <Response [200]>],
 'Kylie Jenner': [<Response [200]>, <Response [200]>],
 'Lionel Messi': [<Response [200]>, <Response [200]>],
 'Beyoncé': [<Response [200]>, <Response [200]>],
 'Neymar': [<Response [200]>, <Response [200]>],
 'Justin Bieber': [<Response [200]>, <Response [200]>],
 'Taylor Swift': [<Response [200]>, <Response [200]>]}

Creating a new DataFrame with all the celebrities and the URLs for their social media pages.

In [5]:
# Build one URL frame per celebrity, then concatenate them in a single call.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration; pd.concat over a list avoids both problems.
# NOTE(review): assumes pu.get_social_media_url returns a one-row DataFrame
# whose columns are the site host names plus 'Celebrity' -- confirm in project_utils.
social_url_frames = [
    pu.get_social_media_url(celeb, social_media_search_html_dict[celeb])
    for celeb in data.Celebrity
]

# Rename the host-name columns to friendly site names without mutating in place.
social_data = pd.concat(social_url_frames, ignore_index=True).rename(
    columns={'pt-br.facebook.com': 'Facebook', 'www.instagram.com': 'Instagram'}
)

social_data

Unnamed: 0,Facebook,Instagram,Celebrity
0,https://pt-br.facebook.com/Cristiano/,https://www.instagram.com/cristiano/,Cristiano Ronaldo
1,https://pt-br.facebook.com/arianagrande,https://www.instagram.com/arianagrande/,Ariana Grande
2,https://pt-br.facebook.com/DwayneJohnson/,https://www.instagram.com/therock/,Dwayne Johnson
3,https://pt-br.facebook.com/Selena,https://www.instagram.com/selenagomez/,Selena Gomez
4,https://pt-br.facebook.com/KylieJenner,https://www.instagram.com/kyliejenner/,Kylie Jenner
5,https://pt-br.facebook.com/leomessi,https://www.instagram.com/leomessi/,Lionel Messi
6,https://pt-br.facebook.com/beyonce/,https://www.instagram.com/beyonce/,Beyoncé
7,https://pt-br.facebook.com/neymarjr/,https://www.instagram.com/neymarjr/,Neymar
8,https://pt-br.facebook.com/JustinBieber,https://www.instagram.com/justinbieber/,Justin Bieber
9,https://pt-br.facebook.com/TaylorSwift/,https://www.instagram.com/taylorswift/,Taylor Swift


In [6]:
# Key the URL table by celebrity name; the fetch loop below reads the name
# back from each row via `row.name`.
social_data = social_data.set_index('Celebrity')

social_data.head()

Unnamed: 0_level_0,Facebook,Instagram
Celebrity,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristiano Ronaldo,https://pt-br.facebook.com/Cristiano/,https://www.instagram.com/cristiano/
Ariana Grande,https://pt-br.facebook.com/arianagrande,https://www.instagram.com/arianagrande/
Dwayne Johnson,https://pt-br.facebook.com/DwayneJohnson/,https://www.instagram.com/therock/
Selena Gomez,https://pt-br.facebook.com/Selena,https://www.instagram.com/selenagomez/
Kylie Jenner,https://pt-br.facebook.com/KylieJenner,https://www.instagram.com/kyliejenner/


Getting the HTMLs for each social media URL.

In [7]:
# Upper bound (seconds) on each HTTP request so one stalled connection cannot
# hang the whole notebook; requests waits indefinitely when no timeout is given.
REQUEST_TIMEOUT_SECONDS = 30

# One entry per celebrity: [name, Facebook response, Instagram response].
social_media_html_list = []

for celeb_name, urls in social_data[['Facebook', 'Instagram']].iterrows():
    # iterrows() yields (index label, row); the index is the celebrity name.
    celeb_list = [celeb_name]

    # Series.iteritems() was removed in pandas 2.0; .items() is the equivalent.
    for _site, url in urls.items():
        celeb_list.append(requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS))

    social_media_html_list.append(celeb_list)

Checking the outputs for each response. Reminder: `<Response [200]>` means the request succeeded.

In [8]:
# Display each [celebrity, Facebook response, Instagram response] entry to inspect the status codes.
social_media_html_list

[['Cristiano Ronaldo', <Response [200]>, <Response [200]>],
 ['Ariana Grande', <Response [200]>, <Response [200]>],
 ['Dwayne Johnson', <Response [200]>, <Response [200]>],
 ['Selena Gomez', <Response [200]>, <Response [200]>],
 ['Kylie Jenner', <Response [200]>, <Response [200]>],
 ['Lionel Messi', <Response [200]>, <Response [200]>],
 ['Beyoncé', <Response [200]>, <Response [200]>],
 ['Neymar', <Response [200]>, <Response [200]>],
 ['Justin Bieber', <Response [200]>, <Response [200]>],
 ['Taylor Swift', <Response [200]>, <Response [200]>]]

Transforming `social_media_html_list` into a DataFrame.

In [9]:
# Tabulate the fetched responses, one row per celebrity, indexed by name.
social_media_html_df = (
    pd.DataFrame(social_media_html_list, columns=['Celebridade', 'Facebook', 'Instagram'])
    .set_index('Celebridade')
)

social_media_html_df.head()

Unnamed: 0_level_0,Facebook,Instagram
Celebridade,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristiano Ronaldo,<Response [200]>,<Response [200]>
Ariana Grande,<Response [200]>,<Response [200]>
Dwayne Johnson,<Response [200]>,<Response [200]>
Selena Gomez,<Response [200]>,<Response [200]>
Kylie Jenner,<Response [200]>,<Response [200]>


Getting the number of Facebook and Instagram Followers for each person in the dataset.

In [10]:
# One record per celebrity: [name, Facebook followers, Instagram followers].
# The counts come from the project helpers applied to each stored response.
followers_data = [
    [
        celeb_name,
        pu.get_facebook_followers(responses['Facebook']),
        pu.get_instagram_followers(responses['Instagram']),
    ]
    for celeb_name, responses in social_media_html_df.iterrows()
]

Creating a DataFrame and showing the data gathered.

In [11]:
# Turn the follower records into a table indexed by celebrity name.
followers_data_df = (
    pd.DataFrame(followers_data, columns=['Celebridade', 'Facebook_Followers', 'Instagram_Followers'])
    .set_index('Celebridade')
)

followers_data_df

Unnamed: 0_level_0,Facebook_Followers,Instagram_Followers
Celebridade,Unnamed: 1_level_1,Unnamed: 2_level_1
Cristiano Ronaldo,126387064,218487096
Ariana Grande,34702119,185512670
Dwayne Johnson,57871577,183138316
Selena Gomez,68225527,176647334
Kylie Jenner,23110546,175619689
Lionel Messi,93789690,150835477
Beyoncé,57859571,146162958
Neymar,59187694,138806255
Justin Bieber,77998674,135486800
Taylor Swift,69519072,132396069
