### Necessary Imports

In [46]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests

### Use Beautiful Soup to pull all contents of the web-page

In [47]:
# Source page: Wikipedia's European Cup / UEFA Champions League records and statistics.
RECORDS_URL = 'https://en.wikipedia.org/wiki/European_Cup_and_UEFA_Champions_League_records_and_statistics'

# The timeout keeps this cell from hanging indefinitely on a stalled connection.
page = requests.get(RECORDS_URL, timeout=30)
# Stop here on an HTTP error status instead of handing an error page to the parser.
page.raise_for_status()
soup = BeautifulSoup(page.content,'html.parser')

### Find the necessary table on the page. First let us consider the finalist data.

In [48]:
# Collect every <table> element on the page; later cells select tables by position.
# find_all is the current Beautiful Soup spelling of the legacy findAll alias.
tables = soup.find_all("table")

### Build Table Header

In [49]:
# All <tr> rows of the first table (the finalists table); row 0 is the header row.
table_header = tables[0].find_all("tr")
header_row = table_header[0]
header_list = []
# Only the odd positions of the row's children are read, exactly as before.
# NOTE(review): the even positions are presumably whitespace nodes between
# cells -- confirm against the live page markup.
for position in range(1, len(header_row), 2):
    header_cell = header_row.contents[position]
    # The first header cell keeps its label in its second child; every other
    # cell keeps it in its first child.
    label_node = header_cell.contents[1] if position == 1 else header_cell.contents[0]
    header_list.append(label_node.strip())

### Create an empty data frame with all the columns

In [50]:
# Start from an empty frame that carries only the scraped column labels.
cl_final_record = pd.DataFrame(columns=header_list)

### Consume all the row element columns and populate the data frame for the finalists

In [51]:
# Parse every club row of the finalists table into a record, then build the
# data frame once instead of enlarging it with one .loc assignment per row.
rows = tables[0].find_all("tr")
final_records = []
for row in rows[1:]:  # rows[0] is the header row
    cells = row.contents
    # str() turns Beautiful Soup's NavigableString into a plain string so the
    # stored values do not keep a reference to the parse tree.
    club_name = str(cells[1].contents[2].contents[0])
    titles = cells[3].contents[0].rstrip()
    runners = cells[5].contents[0].rstrip()
    # The text of each <a> link in the cell is one season.
    win_list = [str(link.contents[0]) for link in cells[7].find_all("a")]
    runner_list = [str(link.contents[0]) for link in cells[9].find_all("a")]
    final_records.append([club_name, titles, runners, win_list, runner_list])

# Column labels come from the header cell above; a length mismatch raises here.
cl_final_record = pd.DataFrame(final_records, columns=header_list)

### Find the data for the semi-finalists and build a separate data frame

In [52]:
# All <tr> rows of the fifth table on the page (the semi-finalists table);
# row 0 is the header row.
semi_table_hdr = tables[4].find_all("tr")
semi_header_row = semi_table_hdr[0]
# Read the label from the first child of every odd-positioned child of the
# header row, trimming surrounding whitespace.
semi_hdr_list = [
    semi_header_row.contents[position].contents[0].strip()
    for position in range(1, len(semi_header_row), 2)
]

### Consume all the row element columns and populate the data frame for the semi-finalists

In [53]:
# Parse every club row of the semi-finalists table into a record, then build
# the data frame once instead of enlarging it with one .loc assignment per row.
rows = tables[4].find_all("tr")
semi_records = []
for row in rows[1:]:  # rows[0] is the header row
    cells = row.contents
    # str() turns Beautiful Soup's NavigableString into a plain string so the
    # stored values do not keep a reference to the parse tree.
    club_name = str(cells[1].contents[2].contents[0])
    number_of_semis = cells[3].contents[0].rstrip()
    # The text of each <a> link in the cell is one season.
    years = [str(link.contents[0]) for link in cells[5].find_all("a")]
    semi_records.append([club_name, number_of_semis, years])

# Column labels come from the header cell above; a length mismatch raises here.
cl_semi_record = pd.DataFrame(semi_records, columns=semi_hdr_list)

### Using the finalists data frame, Build a dictionary to house data for winners in each season

In [54]:
# Map each season to a one-element list holding that season's winner.
# The dictionary is rebuilt from scratch, so re-running this cell is safe.
season_dict = {}
# Walk the two columns in lockstep; each 'Seasons won' value is already the
# list of seasons for that club, so no per-row filtering of the frame is needed.
for club_name, seasons_won in zip(cl_final_record['Club'], cl_final_record['Seasons won']):
    for season in seasons_won:
        season_dict[season] = [club_name]

### Using the finalists data frame, add the runner-up for each season to the dictionary

In [55]:
# Append each season's runner-up after the winner already stored in season_dict.
for club_name, seasons_lost in zip(cl_final_record['Club'], cl_final_record['Seasons runner-up']):
    for season in seasons_lost:
        # A season with no recorded winner still raises KeyError here, as before.
        finalists = season_dict[season]
        # The membership check makes the cell safe to re-run: without it every
        # re-execution would append the same runner-up again.
        if club_name not in finalists:
            finalists.append(club_name)

### Now add the other semi-finalists from the semi finalist data frame to complete the dataset

In [57]:
# Add every semi-finalist that is not already listed for its season.
teams = cl_semi_record['Team']
for team, semi_seasons in zip(teams, cl_semi_record['Years']):
    for season in semi_seasons:
        # A season absent from season_dict still raises KeyError here, as before.
        season_teams = season_dict[season]
        # One membership test replaces the separate comparisons against entries
        # 0 and 1: it also works when the season has no runner-up entry, and it
        # keeps a re-run of this cell from appending the same team twice.
        if team not in season_teams:
            season_teams.append(team)

### Clean the data-set to remove duplication caused by the use of different names for the same team

In [58]:
# For seasons that ended up with more than four teams, drop the first entry at
# position 2, 3 or 4 whose leading five characters match those of the winner
# (position 0) or the runner-up (position 1).
season_dict_items = season_dict.items()
for season, season_teams in season_dict_items:
    if len(season_teams) <= 4:
        continue
    finalist_prefixes = (season_teams[0][0:5], season_teams[1][0:5])
    for position in (2, 3, 4):
        candidate = season_teams[position]
        if candidate[0:5] in finalist_prefixes:
            # Only one entry is removed per season; stop at the first match.
            season_teams.remove(candidate)
            break

### Convert the dictionary of per-season team lists into a self-contained data frame which has the winner, runner-up and semi-finalists for each season

In [59]:
# One row per season: column 0 is the season key, column 1 its list of teams.
champions_league_data = pd.DataFrame(list(season_dict.items()))
champions_league_data.columns = ['Season','Teams']

# Split each team list by position: winner, runner-up, then two semi-finalists.
team_lists = champions_league_data.Teams
winner_list = [season_teams[0] for season_teams in team_lists]
runner_list = [season_teams[1] for season_teams in team_lists]
sf1_list = [season_teams[2] for season_teams in team_lists]
sf2_list = [season_teams[3] for season_teams in team_lists]

champions_league_data['Winners'] = winner_list
champions_league_data['Runner Up'] = runner_list
champions_league_data['SF1'] = sf1_list
champions_league_data['SF2'] = sf2_list

# Drop the raw list column, order by season and use the season as the index.
champions_league_data = (
    champions_league_data
    .drop(columns='Teams')
    .sort_values('Season')
    .set_index('Season')
)

In [60]:
# Display the assembled season-by-season table (indexed by season).
champions_league_data

Unnamed: 0_level_0,Winners,Runner Up,SF1,SF2
Season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1956,Real Madrid,Reims,Milan,Hibernian
1957,Real Madrid,Fiorentina,Manchester United,Red Star Belgrade
1958,Real Madrid,Milan,Manchester United,Vasas
1959,Real Madrid,Reims,Atlético Madrid,Young Boys
1960,Real Madrid,Eintracht Frankfurt,Barcelona,Rangers
1961,Benfica,Barcelona,Hamburg,Rapid Wien
1962,Benfica,Real Madrid,Tottenham Hotspur,Standard Liège
1963,Milan,Benfica,Feyenoord,Dundee
1964,Internazionale,Real Madrid,Borussia Dortmund,Zürich
1965,Internazionale,Benfica,Liverpool,Győri ETO
