### Necessary Imports

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Use Beautiful Soup to pull all contents of the web-page

In [2]:
# Download the Wikipedia records page and parse it with the built-in HTML parser.
page = requests.get('https://en.wikipedia.org/wiki/European_Cup_and_UEFA_Champions_League_records_and_statistics')
# Fail here on an HTTP error status instead of parsing an error page and
# hitting confusing index errors in the table-scraping cells below.
page.raise_for_status()
soup = BeautifulSoup(page.content,'html.parser')

### Find the necessary table on the page. First let us consider the finalist data.

In [3]:
# Every <table> element on the page, in document order; later cells pick
# tables out of this list by position.
tables = soup.find_all("table")

### Build Table Header

In [4]:
# Column names come from the first <tr> of the first table on the page.
table_header = tables[0].find_all("tr")
header_cells = table_header[0].contents
header_list = []
# Only odd child positions are read (the even ones are presumably the
# whitespace text nodes between cells -- confirm against the page markup).
for position in range(1, len(header_cells), 2):
    # The first header cell keeps its label in child 1; the others in child 0.
    label_slot = 1 if position == 1 else 0
    header_list.append(header_cells[position].contents[label_slot].strip())

In [5]:
# Extra column for the league looked up per club. Guarded so that re-running
# this cell does not append a second 'League' column, which would break the
# row assignments that supply exactly one league value per club.
if 'League' not in header_list:
    header_list.append('League')

### Create an empty data frame with all the columns

In [6]:
# Empty frame whose columns are the scraped header labels plus 'League';
# rows are added one club at a time by a later cell.
cl_final_record = pd.DataFrame(columns=header_list)

### Function to pull League information

In [7]:
def get_league(temp_link):
    """Return the league shown in a club's Wikipedia infobox.

    The club page is downloaded and the table whose class attribute is
    "infobox vcard" is located. Its first 12 rows are scanned for one whose
    first cell begins with the text 'League'; the `title` attribute of the
    first child of that row's second cell is returned.

    temp_link: site-relative link to the club article (callers pass the
        href scraped from the records table).

    Returns the league title, or 'Unavailable' when the page has no such
    infobox, when no League row is found in the scanned rows, or when the
    League row does not have the expected shape.
    """
    # Strip leading slashes so a '/wiki/...' href does not yield '//wiki/...'.
    club_link = 'https://en.wikipedia.org/' + temp_link.lstrip('/')
    club_page = requests.get(club_link)
    club_soup = BeautifulSoup(club_page.content,'html.parser')
    info_table = club_soup.find('table',attrs={"class":"infobox vcard"})
    if info_table is None:
        # No infobox on the page: report it the same way as a missing row.
        return 'Unavailable'
    for info_row in info_table.find_all('tr')[:12]:
        try:
            if info_row.contents[0].contents[0] == 'League':
                return info_row.contents[1].contents[0].get('title')
        except (IndexError, AttributeError):
            # Row without the expected cell/child layout: skip it and keep
            # looking rather than abandoning the whole scan.
            continue
    # Scanned every available row (at most 12) without finding a League row.
    return 'Unavailable'

### Consume all the row element columns and populate the data frame for the finalists

In [8]:
# One table row per club; the first <tr> is the header and is skipped.
rows = tables[0].find_all("tr")
for row_number, row in enumerate(rows[1:]):
    cells = row.contents
    # The club link is read from child 2 of the first data cell.
    club_anchor = cells[1].contents[2]
    club_name = club_anchor.contents[0]
    temp_link = club_anchor.get('href')
    # One extra HTTP request per club to read its league from the club page.
    league_name = get_league(temp_link)
    titles = cells[3].contents[0].rstrip()
    runners = cells[5].contents[0].rstrip()
    # Season labels are the link texts inside the "won" / "runner-up" cells.
    win_list = [anchor.contents[0] for anchor in cells[7].find_all("a")]
    runner_list = [anchor.contents[0] for anchor in cells[9].find_all("a")]
    cl_final_record.loc[row_number] = [club_name,titles,runners,win_list,runner_list,league_name]

### Find the data for the semi-finalists and build a separate data frame

In [9]:
# Header labels of the table at position 4 on the page (used as the
# semi-finalist table by the cells below).
semi_table_hdr = tables[4].find_all("tr")
semi_hdr_cells = semi_table_hdr[0].contents
# Only odd child positions are read; each label is child 0 of its cell.
semi_hdr_list = [
    semi_hdr_cells[position].contents[0].strip()
    for position in range(1, len(semi_hdr_cells), 2)
]

In [10]:
# Extra column for the league looked up per club. Guarded so that re-running
# this cell does not append a second 'League' column.
if 'League' not in semi_hdr_list:
    semi_hdr_list.append('League')

### Consume all the row element columns and populate the data frame for the semi-finalists

In [11]:
# Semi-finalist frame: one row per club in the table at position 4.
cl_semi_record = pd.DataFrame(columns=semi_hdr_list)
rows = tables[4].find_all("tr")
for row_number, row in enumerate(rows[1:]):  # rows[0] is the header row
    cells = row.contents
    club_anchor = cells[1].contents[2]
    club_name = club_anchor.contents[0]
    temp_link = club_anchor.get('href')
    # One extra HTTP request per club to read its league from the club page.
    league_name = get_league(temp_link)
    number_of_semis = cells[3].contents[0].rstrip()
    # Season labels are the link texts inside the years cell.
    years = [anchor.contents[0] for anchor in cells[5].find_all("a")]
    cl_semi_record.loc[row_number] = [club_name,number_of_semis,years,league_name]

### Using the finalists data frame, Build a dictionary to house data for winners in each season

In [12]:
# season -> [winner club] and season -> [winner's league]; later cells append
# the runner-up and semi-finalists to these single-element lists.
season_dict = {}
league_dict = {}
for row_label in range(len(cl_final_record)):
    club_name = cl_final_record['Club'][row_label]
    league_name = cl_final_record['League'][row_label]
    # Every row carrying this club name contributes its list of winning seasons.
    club_rows = cl_final_record[cl_final_record.Club == club_name]
    for season_list in club_rows['Seasons won'].tolist():
        for season in season_list:
            season_dict[season] = [club_name]
            league_dict[season] = [league_name]

### Using the finalists data frame, extend the dictionaries with the runner-up for each season

In [13]:
# Append each runner-up (and its league) behind the winner already stored for
# that season. A season with no stored winner raises KeyError here.
for row_label in range(len(cl_final_record)):
    club_name = cl_final_record['Club'][row_label]
    league_name = cl_final_record['League'][row_label]
    club_rows = cl_final_record[cl_final_record.Club == club_name]
    for season_list in club_rows['Seasons runner-up'].tolist():
        for season in season_list:
            season_dict[season].append(club_name)
            league_dict[season].append(league_name)

### Now add the other semi-finalists from the semi finalist data frame to complete the dataset

In [14]:
# Add every semi-finalist that is not already recorded as that season's
# winner (slot 0) or runner-up (slot 1).
teams = cl_semi_record['Team']
for team in teams:
    team_rows = cl_semi_record[cl_semi_record['Team'] == team]
    league_name = team_rows['League']
    league_list = []  # not read anywhere in this cell
    team_list = team_rows['Years']
    # Only the first row matching this team name supplies the seasons.
    season_list = team_list[team_list.index[0]]
    for season in season_list:
        finalists = season_dict[season]
        if team != finalists[0] and team != finalists[1]:
            finalists.append(team)
            league_dict[season].append(league_name.values[0])

### Clean the data set to remove duplicates caused by different names being used for the same team

In [15]:
season_dict_items = season_dict.items()
league_dict_items = league_dict.items()
for season, clubs in season_dict_items:
    # Only seasons holding more than four clubs are examined.
    if len(clubs) <= 4:
        continue
    # Slots 2-4 are checked in order; the first one whose 5-character name
    # prefix equals the winner's or runner-up's prefix is dropped, together
    # with the league stored at the same slot. At most one entry is removed.
    for slot in (2, 3, 4):
        prefix = clubs[slot][0:5]
        if prefix == clubs[0][0:5] or prefix == clubs[1][0:5]:
            del clubs[slot]
            del league_dict[season][slot]
            break

### Convert the season dictionary into a self-contained data frame holding the winner, runner-up and both semi-finalists for each season

In [16]:
# One row per season, initially with the whole club list in a 'Teams' column.
champions_league_data = pd.DataFrame(list(season_dict.items()))
champions_league_data.columns = ['Season','Teams']
team_lists = champions_league_data['Teams']
# Slots 0-3 of each list become the four position columns; a season holding
# fewer than four clubs raises IndexError here.
winner_list = [clubs[0] for clubs in team_lists]
runner_list = [clubs[1] for clubs in team_lists]
sf1_list = [clubs[2] for clubs in team_lists]
sf2_list = [clubs[3] for clubs in team_lists]
champions_league_data['Winners'] = winner_list
champions_league_data['Runner Up'] = runner_list
champions_league_data['SF1'] = sf1_list
champions_league_data['SF2'] = sf2_list
# Drop the raw list column, order by season, and index the frame by season.
champions_league_data = (
    champions_league_data
    .drop(columns='Teams')
    .sort_values('Season')
    .set_index('Season')
)

### Creating a dataframe for Leagues

In [18]:
# Same layout as the club frame, but built from league_dict: despite the
# variable name and the 'Teams' column label, the cells hold league names.
champions_league_teams = pd.DataFrame(list(league_dict.items()))
champions_league_teams.columns = ['Season','Teams']
league_lists = champions_league_teams['Teams']
# Slots 0-3 of each list become the four position columns; a season holding
# fewer than four leagues raises IndexError here.
winner_list = [leagues[0] for leagues in league_lists]
runner_list = [leagues[1] for leagues in league_lists]
sf1_list = [leagues[2] for leagues in league_lists]
sf2_list = [leagues[3] for leagues in league_lists]
champions_league_teams['Winners'] = winner_list
champions_league_teams['Runner Up'] = runner_list
champions_league_teams['SF1'] = sf1_list
champions_league_teams['SF2'] = sf2_list
# Drop the raw list column, order by season, and index the frame by season.
champions_league_teams = (
    champions_league_teams
    .drop(columns='Teams')
    .sort_values('Season')
    .set_index('Season')
)

### Creating a dataframe to capture counts of wins for each league

In [21]:
# Gather the leagues seen in each of the four position columns, then reduce
# them to one sorted array of distinct league names.
unique_leagues = champions_league_teams['Winners'].unique()
for position_column in ('Runner Up', 'SF1', 'SF2'):
    unique_leagues = np.append(unique_leagues, champions_league_teams[position_column].unique())
lg_unique = np.unique(unique_leagues)
# Columns are range1..range5 for each finishing position, grouped by position:
# range1win..range5win, range1run..range5run, range1sf1.., range1sf2..
header = [
    'range{}{}'.format(range_number, suffix)
    for suffix in ('win', 'run', 'sf1', 'sf2')
    for range_number in range(1, 6)
]
# Build the tally table already filled with integer zeros. Creating an
# all-NaN object frame and calling fillna(0) depends on object-dtype
# downcasting, which newer pandas versions deprecate.
league_df = pd.DataFrame(0, index=lg_unique, columns=header)

In [24]:
# The largest Season label anchors the first window; each following window
# starts three seasons earlier. max() is taken on the index labels before the
# int conversion, so for string labels it is a lexicographic maximum.
range1_start = int(champions_league_teams.index.max())
range2_start, range3_start, range4_start, range5_start = [
    range1_start - 3 * step for step in range(1, 5)
]
range_dict = dict(
    range1_start=range1_start,
    range2_start=range2_start,
    range3_start=range3_start,
    range4_start=range4_start,
    range5_start=range5_start,
)

In [25]:
# (league_df column suffix, champions_league_teams column it counts)
stage_columns = [('win', 'Winners'), ('run', 'Runner Up'), ('sf1', 'SF1'), ('sf2', 'SF2')]
for i in champions_league_teams.index:
    season_year = int(i)
    # Find which of the five windows this season falls in. Window k covers
    # range{k}_start down to, but excluding, the next window's start; the
    # last window spans range5_start and the two seasons before it.
    for range_number in range(1, 6):
        newest = range_dict['range{}_start'.format(range_number)]
        if range_number < 5:
            oldest_exclusive = range_dict['range{}_start'.format(range_number + 1)]
        else:
            oldest_exclusive = newest - 3
        if newest >= season_year > oldest_exclusive:
            break
    else:
        # Season lies outside all five windows: nothing to count.
        continue
    for suffix, stage in stage_columns:
        league = champions_league_teams.loc[i, stage]
        # Single .loc[row, column] update. The chained form
        # league_df.loc[row][column] = ... may write to a temporary copy and
        # leave league_df unchanged.
        league_df.loc[league, 'range{}{}'.format(range_number, suffix)] += 1

In [26]:
# Replace any missing values in the tally table with 0.
league_df = league_df.fillna(0)

In [28]:
# Keep only the leagues (rows) that have at least one non-zero count.
has_any_count = league_df.ne(0).any(axis=1)
x = league_df.loc[has_any_count]

In [29]:
# Display the filtered tally table: rows of league_df with a non-zero count.
x

Unnamed: 0,range1win,range2win,range3win,range4win,range5win,range1run,range2run,range3run,range4run,range5run,range1sf1,range2sf1,range3sf1,range4sf1,range5sf1,range1sf2,range2sf2,range3sf2,range4sf2,range5sf2
Bundesliga,0,0,1,0,0,0,0,2,1,0,1,2,0,0,0,0,1,1,0,0
Eredivisie,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
La Liga,2,3,1,1,1,0,2,0,0,0,2,1,3,2,0,0,0,2,0,1
Ligue 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
Premier League,1,0,1,1,1,2,0,1,2,2,0,0,0,1,2,0,2,0,2,1
Serie A,0,0,0,1,1,1,1,0,0,1,0,0,0,0,1,1,0,0,0,0
