In [None]:
from IPython.display import HTML, display

# Show the two title images side by side: a one-row HTML table with one image per cell.
display(
    HTML(
        "<table><tr>"
        "<td><img src='data/rings2.png' width='620'></td>"
        "<td><img src='data/sports.png' width='300'></td>"
        "</tr></table>"
    )
)

In [None]:
# Uncomment and run the two lines below only if cufflinks or ipywidgets are not installed yet
#!pip install cufflinks
#!pip install ipywidgets

In [None]:
# load libraries and helper code
# NOTE(review): this star import is presumably what brings `pd`, `cf`, `urllib` and the
# get_*() helper functions used below into scope - confirm in helper_code/olympics.py
from helper_code.olympics import *
# NOTE(review): `cf` is presumably cufflinks; go_offline() is called once here, before any iplot() call
cf.go_offline()

# Group goal

 
Go through the analysis below and work on the challenges.


**Extra challenge**:

Is there anything else interesting you can find and visualize for this data? 

### Getting data
The Olympics dataset was downloaded from [Kaggle](https://www.kaggle.com/heesoo37/120-years-of-olympic-history-athletes-and-results/data#athlete_events.csv)

**Kaggle** is an online community of data scientists and machine learners, and the best-known competition platform for predictive modeling and analytics.

In [None]:
# The dataset comes as two CSV files, both kept in the data/ folder:
#   file_name1 - athletes
#   file_name2 - National Olympic Committee (NOC) regions
file_name1, file_name2 = "data/athlete_events.csv", "data/noc_regions.csv"

In [None]:
# Only needed when the data files are not on disk yet: fetch them from cloud object storage.
import os
import urllib.request

target_url1="https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_d22d1e3f28be45209ba8f660295c84cf/hackaton/athlete_events.csv"
target_url2="https://swift-yeg.cloud.cybera.ca:8080/v1/AUTH_d22d1e3f28be45209ba8f660295c84cf/hackaton/noc_regions.csv"

for url, path in [(target_url1, file_name1), (target_url2, file_name2)]:
    # a file that is already on disk is kept, so re-running the notebook does not
    # download it again (delete the file to force a fresh download)
    if os.path.exists(path):
        continue

    # urlretrieve() does not create missing folders, so make sure the target folder exists
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)

    # download under a temporary name and rename when finished, so an interrupted
    # download is never mistaken for a complete file on the next run
    partial_path = path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, path)

In [None]:
# Load each CSV file into its own dataframe ...
athlets = pd.read_csv(file_name1)
nocs = pd.read_csv(file_name2)

# ... and attach the region information to every athlete row using the shared "NOC" column.
# how="left" keeps every athlete row, even when its NOC code has no match in the regions file.
olympics = athlets.merge(nocs, on="NOC", how="left")

In [None]:
# how many rows and columns does the dataframe have?
# .shape is a pair of numbers: (number of rows, number of columns)
olympics.shape

In [None]:
# what are the column names? (each column is described right below this cell)
olympics.columns

Here is the column description from Kaggle:

**ID** - Unique number for each athlete  
**Name** - Athlete's name  
**Sex** - M or F  
**Age** - Integer  
**Height** - In centimeters  
**Weight** - In kilograms  
**Team** - Team name  
**NOC** - National Olympic Committee 3-letter code  
**Games** - Year and season  
**Year** - Integer  
**Season** - Summer or Winter  
**City** - Host city  
**Sport** - Sport  
**Event** - Event  
**Medal** - Gold, Silver, Bronze, or NA  
**region** - Country 

In [None]:
# display the first 5 rows to see what the data looks like
olympics.head()

### Number of participants by year

In [None]:
# Group the rows by year, count the rows in every group and store the result as a
# dataframe with two columns: "Year" and "count".
# NOTE(review): size() counts rows; an athlete who entered several events presumably has
# several rows, so "count" may be entries rather than unique athletes - confirm against the data
athlets_by_year = (
    olympics
    .groupby(["Year"])
    .size()
    .reset_index(name="count")
)

# show the first 5 years together with their counts
athlets_by_year.head()

In [None]:
# what is the largest number of participants in a single year?
athlets_by_year["count"].max()

In [None]:
# create a line graph of the number of participants per year;
# "Year" becomes the index so that the plot shows the years along the axis

athlets_by_year.set_index("Year").iplot()

### Challenge

Find the minimum number of Olympics participants using the **min()** function.

Experiment with different kinds of plots:

 - Create a new cell by copying the call above and change **iplot()** to **iplot(kind="bar")**, **iplot(kind="barh")** or **iplot(kind="area",fill=True)** - which visualisation works best?
 
 - What interesting things can you notice on this plot? What do you think happened between the years 1992 and 1994?


### Number of participants by year and by season

In [None]:
# here we call the function "get_counts_by_group()" - it comes from the helper module
# imported at the top of the notebook (helper_code/olympics.py)
athlets_by_season = get_counts_by_group(olympics, "Season")

# NOTE(review): presumably one row per year and one column per season - confirm in the helper
athlets_by_season.head()

In [None]:
# bar chart; barmode="stack" puts the bars on top of each other instead of side by side
athlets_by_season.iplot(kind="bar", barmode="stack")

It looks like the Summer and Winter Olympics were held in the same year before 1994!

Let's find the year with the most participants in the Summer season:
 - we will do this using the **sort_values()** function:

In [None]:
# sort the rows by the "Summer" column, largest values first, and show the top 10
# note: we use "ascending = False" - try changing it to "ascending = True" and see what happens

athlets_by_season.sort_values("Summer", ascending = False).head(10)

### Challenge

 - Using the example above, create new cell(s) and find the number of participants by year and by sex (using the "Sex" column)
 - Which year had the largest number of female participants?

### Number of medals by country and by year

In [None]:
# keep only the rows of athletes who won a medal (all other rows have no value in "Medal")
medals = olympics[olympics["Medal"].notna()]

# narrow the medal rows down to the Winter season
medals_winter = medals.loc[medals["Season"] == "Winter"]

# group by year and country and count the rows (the helper function does the grouping)
medals_by_region = get_counts_by_group(medals_winter, "region")

# show the top 5 rows
medals_by_region.head()

In [None]:
# plotting every country at once would be too messy, so we pick only a few country columns
medals_subset = medals_by_region.loc[:, ["Canada","Russia","USA","Norway","Japan","China"]]

medals_subset.iplot(kind="area", fill=True)

### Challenge
 - Using the example above, create new cell(s) and display the number of medals for the Summer season
 
 - Is Canada more successful in the Winter or in the Summer Olympics?
 - In which year did Canada win the most medals in the Summer Olympics?
     

### Extra:   

We can choose the country using interactive input.

**Note**: the country name must match the data exactly (for example `Canada` or `USA`). If the country is not found, run the cell again and enter a different name.

In [None]:
print("Enter country:")

# strip() removes spaces typed by accident before or after the name
country = input().strip()

# every country is a column of medals_by_region, so check the name before using it:
# an unknown name would otherwise stop the cell with a hard-to-read KeyError
if country in medals_by_region.columns:
    medals_subset1 = medals_by_region[country]

    medals_subset1.iplot(kind="area",fill=True)
else:
    print(f"Country '{country}' was not found - check the spelling (for example: Canada) and run the cell again.")

### Medals won by Canada at the 1984 Summer Olympics: gold/silver/bronze in total and by sport

In [None]:
# keep only the medal rows that match a specific season, year and country
# (all three conditions must be true at the same time)
medals_by_country = medals.loc[
    medals["Season"].eq("Summer")
    & medals["Year"].eq(1984)
    & medals["region"].eq("Canada")
]

In [None]:
# count the medals of each kind (Gold / Silver / Bronze) with a plain pandas groupby:
# size() gives the number of rows per medal and reset_index() stores it in a "count" column
medals_by_kind = (
    medals_by_country
    .groupby(["Medal"])
    .size()
    .reset_index(name="count")
)

medals_by_kind

In [None]:
# a new kind of plot - a pie chart; note that it needs to be told which column holds
# the slice labels ("Medal") and which column holds the slice values ("count")

medals_by_kind.iplot(kind="pie", labels="Medal",values="count")

In [None]:
# to get the number of medals by sport we call a predefined function -
# it comes from the helper module imported at the top of the notebook (helper_code/olympics.py)
medal_by_sport = get_counts_by_medal(medals_by_country)

medal_by_sport

In [None]:
# note: barmode = "stack" means the bars are stacked on top of each other instead of drawn side by side
medal_by_sport.iplot(kind = "bar", barmode = "stack")

### Challenge

- Using the example above - create new cell(s) and analyse number of medals for Russia in Summer 1980
  - What was the location of these Olympics?

## Extra

On the plot below we can compare the number of participants with the number of medals. Feel free to play with
different years, countries and seasons.

In [None]:

# get_participation_counts() is another helper from the module imported at the top of the notebook;
# change year, season or country here to explore other Olympics
summary = get_participation_counts(olympics ,year=1984, season="Summer", country="Canada")

summary.iplot(kind= "bar", barmode="stack")