In [1]:
#data dependencies
import pandas as pd
import numpy as np
from pandas import DataFrame, Series, read_html
#web scraping
import requests
from bs4 import BeautifulSoup

In [2]:
#URL of the Twitter thread the ranking was copied from; kept for provenance
import webbrowser as wb
thread = 'https://twitter.com/SimpPilgrim/status/1277654193549930496'
#uncomment the next line to open the thread in the default web browser
#wb.open(thread)

I copy-pasted the thread manually into Notepad. While importing, I noticed that Simp double-counted some ranks and listed some names more than once. Morgan Lee and Julia Roca are both rank 167, while Claudia Marie and Canela Skin are both rank 182. For accuracy, we will use pandas' automatic indexing as the correct ranking system, and assume the proper ranking is the order in which the entries were originally listed. For the stars who appear more than once, only their first (highest-placed) entry will be kept; their later, lower-ranked entries will be dropped. For documentation, Simp's original numbering will be kept in a column titled "Simp's Rank". The duplicated rows are collected in `name_duplicates` and `rank_duplicates` in the cell below.

In [3]:
#load the hand-copied ranking and label its two columns
df = pd.read_csv('thelist.csv')
df.columns = ["Simp's Rank", 'Name']
#rows whose name / rank repeats an earlier row (the first occurrence is not flagged)
name_duplicates = df.loc[df.duplicated(subset="Name")]
rank_duplicates = df.loc[df.duplicated(subset="Simp's Rank")]
df

Unnamed: 0,Simp's Rank,Name
0,1,Mia Malkova
1,2,Lana Rhoades
2,3,Abella Danger
3,4,Nikki Benz
4,5,Angela White
...,...,...
357,366,Syren De Mer
358,367,Jada Fire
359,368,Diamond Kitty
360,369,Rebecca Moore


In [4]:
#keep only the first row for every name; later repeats are removed from df itself
df.drop_duplicates(subset="Name", keep='first', inplace=True)
df

Unnamed: 0,Simp's Rank,Name
0,1,Mia Malkova
1,2,Lana Rhoades
2,3,Abella Danger
3,4,Nikki Benz
4,5,Angela White
...,...,...
357,366,Syren De Mer
358,367,Jada Fire
359,368,Diamond Kitty
360,369,Rebecca Moore


In [5]:
#renumber the ranking 1..N in listed order now that duplicates are dropped
df['Rank'] = range(1, len(df) + 1)
#empty placeholder columns for the data we will later scrape for
for placeholder in ('Ethnicity', 'Nationality', 'Measurements', 'Bra Size',
                    'Boobs', 'Boobpedia URL', 'Underscore'):
    df[placeholder] = np.nan
#the corrected rank becomes the row index
df.set_index(keys='Rank', inplace=True)

In [6]:
#build each page URL as <base>/<Name with spaces turned into underscores>
#surrounding whitespace is stripped first instead of slicing off the first
#character, so a name that has no leading space is not truncated and a
#trailing space does not become a trailing underscore
#NOTE(review): presumably the names in thelist.csv carry a leading space from
#the manual copy-paste - confirm against the file
df['Underscore'] = df['Name'].str.strip().str.replace(' ', '_', regex=False)
df['Boobpedia URL'] = 'http://boobpedia.com/boobs/' + df['Underscore']
df

Unnamed: 0_level_0,Simp's Rank,Name,Ethnicity,Nationality,Measurements,Bra Size,Boobs,Boobpedia URL,Underscore
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,1,Mia Malkova,,,,,,http://boobpedia.com/boobs/Mia_Malkova,Mia_Malkova
2,2,Lana Rhoades,,,,,,http://boobpedia.com/boobs/Lana_Rhoades,Lana_Rhoades
3,3,Abella Danger,,,,,,http://boobpedia.com/boobs/Abella_Danger,Abella_Danger
4,4,Nikki Benz,,,,,,http://boobpedia.com/boobs/Nikki_Benz,Nikki_Benz
5,5,Angela White,,,,,,http://boobpedia.com/boobs/Angela_White,Angela_White
...,...,...,...,...,...,...,...,...,...
345,366,Syren De Mer,,,,,,http://boobpedia.com/boobs/Syren_De_Mer,Syren_De_Mer
346,367,Jada Fire,,,,,,http://boobpedia.com/boobs/Jada_Fire,Jada_Fire
347,368,Diamond Kitty,,,,,,http://boobpedia.com/boobs/Diamond_Kitty,Diamond_Kitty
348,369,Rebecca Moore,,,,,,http://boobpedia.com/boobs/Rebecca_Moore,Rebecca_Moore


In [7]:
#remove the intermediate 'Underscore' column and Simp's original numbering
df.drop(columns=['Underscore', "Simp's Rank"], inplace=True)
df

Unnamed: 0_level_0,Name,Ethnicity,Nationality,Measurements,Bra Size,Boobs,Boobpedia URL
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Mia Malkova,,,,,,http://boobpedia.com/boobs/Mia_Malkova
2,Lana Rhoades,,,,,,http://boobpedia.com/boobs/Lana_Rhoades
3,Abella Danger,,,,,,http://boobpedia.com/boobs/Abella_Danger
4,Nikki Benz,,,,,,http://boobpedia.com/boobs/Nikki_Benz
5,Angela White,,,,,,http://boobpedia.com/boobs/Angela_White
...,...,...,...,...,...,...,...
345,Syren De Mer,,,,,,http://boobpedia.com/boobs/Syren_De_Mer
346,Jada Fire,,,,,,http://boobpedia.com/boobs/Jada_Fire
347,Diamond Kitty,,,,,,http://boobpedia.com/boobs/Diamond_Kitty
348,Rebecca Moore,,,,,,http://boobpedia.com/boobs/Rebecca_Moore


In [8]:
#we're going to get the response codes for every link to test if every star is actually on boobpedia
#the ones who aren't will be dropped
#a timeout is passed so one stalled connection cannot hang the notebook forever
#(requests waits indefinitely by default); a timed-out request raises instead
REQUEST_TIMEOUT_SECONDS = 10
response_codes = []
#one Session reuses the connection to the host across all requests
with requests.Session() as session:
    for url in df['Boobpedia URL']:
        response_codes.append(session.get(url, timeout=REQUEST_TIMEOUT_SECONDS))
#NOTE(review): this column holds full Response objects (shown as <Response [200]>),
#not integers; response.status_code is probably what was intended - confirm
#before changing, since later steps may rely on the current contents
df['Response Codes'] = response_codes
df

Unnamed: 0_level_0,Name,Ethnicity,Nationality,Measurements,Bra Size,Boobs,Boobpedia URL,Response Codes
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Mia Malkova,,,,,,http://boobpedia.com/boobs/Mia_Malkova,<Response [200]>
2,Lana Rhoades,,,,,,http://boobpedia.com/boobs/Lana_Rhoades,<Response [200]>
3,Abella Danger,,,,,,http://boobpedia.com/boobs/Abella_Danger,<Response [200]>
4,Nikki Benz,,,,,,http://boobpedia.com/boobs/Nikki_Benz,<Response [200]>
5,Angela White,,,,,,http://boobpedia.com/boobs/Angela_White,<Response [200]>
...,...,...,...,...,...,...,...,...
345,Syren De Mer,,,,,,http://boobpedia.com/boobs/Syren_De_Mer,<Response [200]>
346,Jada Fire,,,,,,http://boobpedia.com/boobs/Jada_Fire,<Response [200]>
347,Diamond Kitty,,,,,,http://boobpedia.com/boobs/Diamond_Kitty,<Response [200]>
348,Rebecca Moore,,,,,,http://boobpedia.com/boobs/Rebecca_Moore,<Response [200]>


In [9]:
#persist the table, including the 'Response Codes' column, next to the notebook
df.to_csv(path_or_buf='responsecodelist.csv')

In [9]:
# def scraper_boobs():
#     boob_list = []
#     for i in df['Boobpedia URL']:
#         table = pd.io.html.read_html(i)
#         table = table[0]
#         boob_list.append(table.iloc[12,1])
#     return boob_list
# df.Boobs = scraper_boobs()
# df

In [None]:
# for i in df['Boobpedia URL']:
#     table = pd.io.html.read_html(i)
#     table = table[0]
#     df.Ethnicity = table.iloc[7,1]
#     df.Nationality = table.iloc[8,1]
#     df.Measurements = table.iloc[10,1]
#     df['Bra Size'] = table.iloc[11,1].split(' ')[0]
#     df.Boobs = table.iloc[12,1]
#     break
# df