In [195]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re

In [304]:
def principal_period(s):
    i = (s+s).find(s, 1, -1)
    return None if i == -1 else s[:i]

In [196]:
url = 'https://www.bjjheroes.com/a-z-bjj-fighters-list'
data = requests.get(url)

with open('bjj_fighters.html', 'w+') as f:
    f.write(data.text)

In [197]:
with open('bjj_fighters.html') as f:
    page = f.read()

In [354]:
soup = BeautifulSoup(page, "html.parser")

In [199]:
fighters_html = soup.find_all(id="tablepress-8")

In [262]:
fighters_html

[<table class="tablepress tablepress-id-8" id="tablepress-8">
 <thead>
 <tr class="row-1 odd">
 <th class="column-1">First Name</th><th class="column-2">Last Name</th><th class="column-3">Nickname</th><th class="column-4">Team</th>
 </tr>
 </thead>
 <tbody class="row-hover">
 <tr class="row-2 even">
 <td class="column-1"><a href="/?p=8141">Aarae</a> </td><td class="column-2"><a href="/?p=8141">Alexander</a></td><td class="column-3"></td><td class="column-4">Team Lloyd Irvin</td>
 </tr>
 <tr class="row-3 odd">
 <td class="column-1"><a href="/?p=9246">Aaron</a> </td><td class="column-2"><a href="/?p=9246">Johnson</a> </td><td class="column-3"><a href="/?p=9246">Tex</a> </td><td class="column-4">Unity JJ</td>
 </tr>
 <tr class="row-4 even">
 <td class="column-1"><a href="/?p=8494">Abdurakhman</a> </td><td class="column-2"><a href="/?p=8494">Bilarov</a> </td><td class="column-3"></td><td class="column-4">Team Nogueira</td>
 </tr>
 <tr class="row-5 odd">
 <td class="column-1"><a href="/?p=3

### Teams names and number of fighters

In [246]:
fighters = pd.read_html(str(fighters_html))[0]

In [266]:
fighters_count = len(soup.find_all("td", class_="column-1"))
fighters_id_columns = ['Id', 'First Name', 'Last Name', 'Nickname', 'Team']
fighters_id_data = [] 

for i in range(fighters_count):
    fighter_id = ((str(soup.find_all("td", class_="column-1")[i]).split('"')[3]).split('=')[1])
    fighter_first_name = (soup.find_all("td", class_="column-1")[i].getText())
    fighter_last_name = (soup.find_all("td", class_="column-2")[i].getText())
    fighter_nickname = (soup.find_all("td", class_="column-3")[i].getText())
    fighter_team = (soup.find_all("td", class_="column-4")[i].getText())
    
    fighters_id_data.append([fighter_id,
                       fighter_first_name,
                       fighter_last_name,
                       fighter_nickname,
                       fighter_team])
    
    

In [274]:
fighters_id_data[0]

['8141', 'Aarae ', 'Alexander', '', 'Team Lloyd Irvin']

In [283]:
fighters_df = pd.DataFrame(data=fighters_id_data,
                           columns=fighters_id_columns)
fighters_df.set_index('Id', inplace=True)

fighters_df.Id

AttributeError: 'DataFrame' object has no attribute 'Id'

In [281]:
teams = fighters_df.groupby(['Team']).count().sort_values(by=['First Name'], ascending=False).drop(columns=['Last Name', 'Nickname']).rename(columns={'First Name': 'Fighter count'})

In [282]:
teams.head(10)

Unnamed: 0_level_0,Fighter count
Team,Unnamed: 1_level_1
,273
Alliance,88
Checkmat,80
Gracie Barra,77
GF Team,48
Atos,46
Nova Uniao,37
Gracie Humaita,32
Carlson Gracie,20
Brazil 021,14


## Getting grapplers page links

In [9]:
fighters_links = []

for link in soup.find_all('a'):
    if "p=" in str(link.get('href')):
        fighters_links.append(link.get('href'))
        

In [10]:
# HTML COUNT WAS HIGHER THAN THE COUNT ON THE LIST, SO I DECIDED TO TAKE A LOOK AND SEE IF THERE WERE DUPLICATE RECORDS ON THE HTML TABLE
# THE SAME LINK COULD ONLY APPEAR AT MOST 3 TIMES (FIRST NAME, LAST NAME AND NICKNAME)

link_counts = pd.DataFrame(fighters_links)

link_counts['Count'] = 1
link_counts.rename(columns={0: 'Links'}, inplace=True)
link_counts.groupby('Links').count().sort_values(by='Count', ascending=False).head(10)

Unnamed: 0_level_0,Count
Links,Unnamed: 1_level_1
/?p=714,5
/?p=4943,5
/?p=4912,5
/?p=548,4
/?p=1377,4
/?p=690,4
/?p=4687,4
/?p=9344,4
/?p=2956,3
/?p=12542,3


In [11]:
# TRANSFORMING TO A SET SO I COULD CLEAR ALL DUPLICATE RECORDS
fighters_links = list(set(fighters_links))

In [18]:
# GETTING ALL THE FIGHTER PAGES

url_start = 'https://www.bjjheroes.com/bjj-fighters{}'
for link in fighters_links:
    url = url_start.format(link)
    data = requests.get(url)
    
    with open("fighters/{}.html".format(link[4:]), "w+", encoding='utf-8') as f:
        f.write(data.text)

In [357]:
soup.find("table", {"class": "table table-striped sort_table"}).find_all('a')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [329]:
fighters_list = []
df_list = []
for id in fighters_links:
    with open("fighters/{}.html".format(id[4:]), encoding="utf-8") as f:
        page = f.read()

    soup = BeautifulSoup(page, "html.parser")   

    # GATHER FIGHTER DATA
    try:
        fighter_name = soup.find('strong', text=re.compile("Full Name")).parent.getText().split(": ",1)[1]
    except:
        fighter_name = ""
    try:
        fighter_weight_class = soup.find('strong', text=re.compile("Weight Division")).parent.getText().split(": ",1)[1]
    except:
        fighter_weight_class = ""  
    try:
        fighter_team = soup.find('strong', text=re.compile("Team/Association")).parent.getText().split(": ",1)[1]
    except:
        fighter_team = ""

    fighters_list.append([id[4:], fighter_name, fighter_weight_class, fighter_team])

    try:
        fighter_record = pd.read_html(str(soup.find("table", {"class": "table table-striped sort_table"})))[0]
    #     fighter_record['Opponent'] = principal_period(fighter_record['Opponent'])
        fighter_record['Fighter Id'] = id[4:]
        df_list.append(fighter_record)
    except:
        data = []
        fighter_record = pd.DataFrame(data)

In [406]:
matches_df = pd.concat(df_list)

x,y=0,0

teste = matches_df.Opponent.str.split().dropna()
for i in range(len(teste.values)):
    if len(teste.values[i]) >= 3:
        x += 1
    else:
        y += 1
        
print(x)
print(y)    

18936
15090


In [294]:
lista_lutadores = pd.DataFrame(fighters_list)

lista_lutadores.rename(columns={0:'Id',1:'Full Name', 2:'Weight Division', 3:'Team'}, inplace=True)
lista_lutadores.set_index('Id', inplace=True)

lista_lutadores

Unnamed: 0_level_0,Full Name,Weight Division,Team
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
8223,Jena Rae Bishop,"Peso Pena (64,00 kg / 141.5 lbs)",Alliance
10596,Cleber Luciano Costa,"Peso Leve (76,00 kg / 168.0 lbs)",Cleber BJJ
3546,Mario Claudio Tallarico,Peso Pena (70kg/154lbs),Gracie Jiu Jitsu
2402,Christian Graugart,Peso Medio (82kg/181lbs),BJJ Globetrotters
650,Josef Manuel Junior,: Competed as “Pena” – Featherweight in the be...,Nova Uniao (Robson Moura Nova Uniao – RMNU)
...,...,...,...
10155,Malachi James C. Edmond,"Peso Pluma (64,00 kg / 141.5 lbs)",Team Lloyd Irvin
5869,Jorge Pereira,Meio Pesado (88kg/194lbs),Jorge Pereira Jiu Jitsu
3286,John Crouch,,John Crouch Jiu Jitsu/MMA Lab
1785,Marcus Norat,Peso Galo (57kg/126lbs),Norfight/Gracie Academy
