# Olympic Project - road to gold

## Questions:
 1. How has the performance of athletes changed based on gender, and has this led to a reduction in the performance gap?
 2. Can past Olympic results reliably predict future outcomes?

## Hypothesis:


1. As women have increasingly engaged in the Olympic Games and gained more equitable chances to train and compete, the performance gap should have diminished over the last century and is expected to continue decreasing in every competition.

# 1-How has the performance of athletes changed based on gender, and has this led to a reduction in the performance gap?

## Olympic performance

In [None]:
#import the libraries used for scraping and data handling
from selenium import webdriver 
import time
from bs4 import BeautifulSoup
import requests
import pandas as pd
import os
import re


## Getting the information

## Functions

In [None]:
def info_extraction(soup):
    ''' Extract the result table of one event from a parsed olympics.com results page.

    Parameters:
        soup- a parsed page (BeautifulSoup-like object exposing find/find_all)
    Return:
        a DataFrame with one row per table row and the columns country, participant,
        result, olympic_game, discipline and event. A field that is not present in a
        row is stored as None.
    Raises:
        AttributeError if the result table or one of the game/discipline/event
        selector buttons is not found in the page.
    '''
    #location and identification of the required information
    table = soup.find('div', attrs={'data-cy': 'table-content'})
    rows = table.find_all('div', attrs={'data-row-id': True})
    #list of the info to capture
    countries = []
    participant = []
    results = []
    #loop over every row of the event table
    for row in rows:
        # NOTE(review): this class name looks auto-generated by the site's CSS tooling
        # and may change between deployments - confirm it still matches
        countries.append(_text_or_none(row, 'span', {'class': 'styles__CountryName-sc-1r5phm6-1 eQULfE'}))
        participant.append(_text_or_none(row, 'h3', {'data-cy': 'athlete-name'}))
        results.append(_text_or_none(row, 'span', {'data-cy': 'result-info-content'}))
    info_event = pd.DataFrame({'country': countries, 'participant': participant, 'result': results})
    #add a column with the olympic games name
    info_event['olympic_game'] = soup.find('button', attrs={'data-cy': 'game-select'}).text
    #add a column with the discipline name
    info_event['discipline'] = soup.find('button', attrs={'data-cy': 'discipline-select'}).text
    #add a column with the event name
    info_event['event'] = soup.find('button', attrs={'data-cy': 'event-select'}).text
    #return the entire dataset with all the required information
    return info_event


def _text_or_none(node, tag, attrs):
    ''' Return the text of the first `tag` element matching `attrs` under `node`, or None when there is no such element.'''
    try:
        return node.find(tag, attrs=attrs).text
    except AttributeError:
        # find() returned None (nothing matched), so there is no .text to read.
        # Only this case is treated as "missing"; any other error is a real problem
        # and is allowed to propagate instead of being silently swallowed.
        return None

def data_grab(url, first_event=0):
    ''' Scrape every event of one discipline at one olympic game and save the results as a csv file.

    The function drives the module-level selenium `driver`: it opens `url`, counts the
    entries of the event drop-down, then selects each event in turn, opens its result
    page and extracts the table with info_extraction.

    Parameters:
        url- the results url of a discipline at a given olympic game; it must contain
             'olympic-games/' because the csv name is built from what follows it
        first_event- zero-based position in the event drop-down to start from
                     (default 0 = all events); useful to resume an interrupted run
    Return:
        None when every event was scraped and the csv was written.
        If anything fails, the error and the url are printed and the list of the
        DataFrames collected so far is returned so the partial work is not lost.
    '''
    #create the list with the required information
    final_result = []
    #calculate the number of event in the discipline selected
    driver.get(url)
    time.sleep(5)  # fixed wait for the page to finish rendering
    button_event = driver.find_element('css selector', 'button[data-cy=event-select]')
    button_event.click()  # open the drop-down so its entries exist in the page
    events = len(driver.find_elements('css selector', 'button[data-cy=event-button]'))
    button_event.click()  # close it again
    #loop to go through all the events and grab the required information or print the url where it didn't work
    try:
        for i in range(first_event, events):
            # the page is reloaded on every iteration, so the elements must be looked up again
            button_event = driver.find_element('css selector', 'button[data-cy=event-select]')
            button_event.click()
            eventfor = driver.find_elements('css selector', 'button[data-cy=event-button]')[i]
            eventfor.click()
            #Go to the result of the event selected
            button_go = driver.find_element('css selector', 'a[data-cy=go-link]')
            try:
                button_go.click()
            except Exception:
                # the link could not be clicked, so navigate to its target directly
                driver.get(button_go.get_attribute('href'))
            time.sleep(5)
            # name the parser explicitly so the result does not depend on which
            # optional parsers happen to be installed
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            final_result.append(info_extraction(soup))
        # e.g. '.../olympic-games/tokyo-2020/results/weightlifting' -> 'tokyo-2020_results_weightlifting.csv'
        pd.concat(final_result).to_csv(url.split('olympic-games/')[1].replace('/', '_') + '.csv')
    except Exception as error:
        print(error)
        print(url)
        return final_result
    return None

def check_valid_urls(urls, driver=None, output_path='olympics_url.csv'):
    """Keep only the urls that do not end up on a '404.html' page and save them to a csv file.
        Parameters:
            urls- a iterable with url in string format
            driver- an already opened selenium driver to reuse; when None a new
                    Chrome driver is opened and closed again at the end
            output_path- path of the csv file that receives the working urls
        Return:
            a list of working urls
    """
    owns_driver = driver is None
    if owns_driver:
        driver = webdriver.Chrome()
    valid_urls = []
    try:
        for url in urls:
            driver.get(url)
            # a url is rejected when the browser ends up on a page whose address ends with '404.html'
            if not driver.current_url.endswith('404.html'):
                valid_urls.append(url)
    finally:
        # only close the browser that was opened here, never one supplied by the caller
        if owns_driver:
            driver.quit()
    # a plain list has no to_csv, so wrap it in a DataFrame; the file then holds the
    # row index in the first column and the urls in the second one
    pd.DataFrame({'url': valid_urls}).to_csv(output_path)
    return valid_urls

## extracting code

In [None]:
# Read the stored urls back; they sit in the second column of the csv
url_list = pd.read_csv('olympics_url.csv').iloc[:, 1].to_list()
# Keep only the result pages whose url ends with 'weightlifting'
w_list = list(filter(lambda link: link.endswith('weightlifting'), url_list))

# Open one result page and accept the cookie banner before any scraping starts
driver = webdriver.Chrome()
driver.get('https://olympics.com/en/olympic-games/beijing-2022/results/alpine-skiing')
time.sleep(5)
cookies_button = driver.find_element('css selector', '#onetrust-accept-btn-handler')
cookies_button.click()
time.sleep(5)

# Scrape every selected url; the list of return values is shown as the cell output
[data_grab(link) for link in w_list]

## Cleaning

In [28]:
#select the event with comparable result
data_dir = '../Project-branches/Data events/'
comparable_files = ('swimming.csv', 'weightlifting.csv', 'athletics.csv',
                    'cycling-track.csv', 'rowing.csv', 'sailing.csv')
csv_list = [pd.read_csv(data_dir + fil) for fil in os.listdir(data_dir) if fil.endswith(comparable_files)]
# every file comes with its own 0..n index, so build a fresh unique one for the merged table
data_olympic = pd.concat(csv_list, ignore_index=True)
# 'Unnamed: 0' is the unnamed first column of the csv files
# (presumably the zero-based row position inside each event table - confirm)
data_olympic.rename(columns={'Unnamed: 0': 'rank'}, inplace=True)
#drop rows if result==nan
data_olympic.dropna(subset=['result'], inplace=True)
#divide olympic_game columns into two columns olympic_host and olympic_game_year
# Split on the LAST space only: the year is always the final token, while the host may
# contain spaces itself ('Los Angeles 1932', 'Mexico City 1968', 'St. Louis 1904').
# Splitting on the first space and patching by hand gave 'Los Angeles 1932' the year 1984.
game_parts = data_olympic['olympic_game'].str.strip().str.rsplit(' ', n=1)
data_olympic['olympic_host'] = game_parts.str[0]
data_olympic['olympic_game_year'] = game_parts.str[1]
data_olympic.drop(columns='olympic_game', inplace=True)
#create and separate by gender using column event and drop rows of mix events
# first standalone 'men'/'women' word in the event name; events with neither are tagged 'mix'
data_olympic['gender'] = (
    data_olympic['event']
    .str.extract(r'\b(men|women)\b', flags=re.IGNORECASE, expand=False)
    .str.lower()
    .fillna('mix')
)
# .copy() so the column assignments below write to an independent table, not to a slice
data_olympic = data_olympic[data_olympic['gender'] != 'mix'].copy()
#cleaning event
# remove the gender word and a possessive "'s", then trim last so that names written
# as "Men's 100m" and as "100m men" both end up as '100m'
data_olympic['event'] = (
    data_olympic['event']
    .str.replace(r'\b(?:men|women)\b', '', flags=re.IGNORECASE, regex=True)
    .str.lower()
    .str.replace("'s", '', regex=False)
    .str.strip()
)
#filter by rank
# keeps the rows whose rank value is 0, 1 or 2
data_olympic = data_olympic[data_olympic['rank'] <= 2].copy()

## Filtering

In [None]:
# Count the available results per (discipline, event) for each gender and show the
# 20 events with the most women's results.
# TODO: this can be a function that gives you this list or dict with x top events
(
    data_olympic
    .pivot_table(index=['discipline', 'event'], columns='gender', values='result', aggfunc='count')
    .sort_values(by='women', ascending=False)
    .head(20)
)

## Creating comparable tables

In [None]:
#first event: 4x100m relay (the variable names relay400/relay400s are kept as they were)
# .copy() gives an independent table, so changing 'result' does not write into a slice of data_olympic
relay400 = data_olympic[data_olympic['event'] == '4x100m relay'].copy()
# some results carry a literal 'w' (presumably a wind marker - confirm); remove it so the value parses as a number
relay400['result'] = relay400['result'].str.replace('w', '', regex=False).astype(float)
# mean result per olympic year, one column per gender
relay400s = pd.pivot_table(relay400, index=['olympic_game_year'], columns='gender', values='result', aggfunc='mean')
# gap = men - women
relay400s['gap'] = relay400s['men'] - relay400s['women']
relay400s.round(2).sort_index()

In [None]:
#Second event: 100m
# .copy() gives an independent table, so changing 'result' does not write into a slice of data_olympic
free100 = data_olympic[data_olympic['event'] == '100m'].copy()
# some results carry a literal 'w' (presumably a wind marker - confirm); remove it so the value parses as a number
free100['result'] = free100['result'].str.replace('w', '', regex=False).astype(float)
# mean result per olympic year, one column per gender
free100s = pd.pivot_table(free100, index=['olympic_game_year'], columns='gender', values='result', aggfunc='mean')
# gap = men - women
free100s['gap'] = free100s['men'] - free100s['women']
free100s.round(2).sort_index()

In [None]:
#third event: high jump
# .copy() gives an independent table, so changing 'result' does not write into a slice of data_olympic
high_jump = data_olympic[data_olympic['event'] == 'high jump'].copy()
high_jump['result'] = high_jump['result'].astype(float)
# mean result per olympic year, one column per gender
high_jump_sum = pd.pivot_table(high_jump, index=['olympic_game_year'], columns='gender', values='result', aggfunc='mean')
# gap = men - women
high_jump_sum['gap'] = high_jump_sum['men'] - high_jump_sum['women']
high_jump_sum.round(2).sort_index()