# OLYMPIC DATA ANALYSIS


## LOAD THE OLYMPIC DATASET INTO A DATAFRAME FOR ANALYSIS

In [2]:
import pandas as pd
# skiprows=4 skips the top 4 rows of the CSV, which hold irrelevant data.
o_df = pd.read_csv(filepath_or_buffer='olympics.csv', skiprows=4)


## DATA EXPLORATION WITH PANDAS TO UNDERSTAND WHAT THE DATAFRAME LOOKS LIKE

### HERE WE LOOK AT THE MOST IMPORTANT FUNCTIONS USED FOR DATA EXPLORATION

In [3]:
# Show the first 5 rows of the dataset.
o_df.head(n=5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [4]:
# Print a summary of the DataFrame: index range, column names, non-null counts, dtypes and memory usage.
o_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29216 entries, 0 to 29215
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   City          29216 non-null  object
 1   Edition       29216 non-null  int64 
 2   Sport         29216 non-null  object
 3   Discipline    29216 non-null  object
 4   Athlete       29216 non-null  object
 5   NOC           29216 non-null  object
 6   Gender        29216 non-null  object
 7   Event         29216 non-null  object
 8   Event_gender  29216 non-null  object
 9   Medal         29216 non-null  object
dtypes: int64(1), object(9)
memory usage: 2.2+ MB


In [5]:
# Summary statistics for the numerical columns: count, mean, standard deviation,
# minimum, quartiles (25%, 50%, 75%) and maximum.
o_df.describe()

Unnamed: 0,Edition
count,29216.0
mean,1967.713171
std,32.406293
min,1896.0
25%,1948.0
50%,1976.0
75%,1996.0
max,2008.0


In [6]:
# Dimensions of the DataFrame as (number of rows, number of columns).
o_df.shape

(29216, 10)

In [7]:
# Data type of each column.
o_df.dtypes

City            object
Edition          int64
Sport           object
Discipline      object
Athlete         object
NOC             object
Gender          object
Event           object
Event_gender    object
Medal           object
dtype: object

In [8]:
# Report whether the DataFrame contains any missing values.
# .isnull() yields a Boolean DataFrame in which True marks a null cell.
# The first .any() collapses each column to one Boolean (does this column hold a null?),
# and the second .any() collapses those per-column flags into a single overall Boolean.
print(
    "There are columns with null values."
    if o_df.isnull().any().any()
    else "No columns have null values."
)


No columns have null values.


# ANALYSIS STARTS HERE, WHERE WE TRY TO ANSWER SOME QUESTIONS

### 1. How has China's overall medal count changed over the years?

In [9]:
# Start with a quick look at the first 5 rows of the DataFrame.
o_df.head(n=5)

Unnamed: 0,City,Edition,Sport,Discipline,Athlete,NOC,Gender,Event,Event_gender,Medal
0,Athens,1896,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100m freestyle,M,Gold
1,Athens,1896,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100m freestyle,M,Silver
2,Athens,1896,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100m freestyle for sailors,M,Bronze
3,Athens,1896,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100m freestyle for sailors,M,Gold
4,Athens,1896,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100m freestyle for sailors,M,Silver


In [10]:
# China's NOC code is 'CHN', so keep only the rows whose NOC column equals it.
china_data = o_df.loc[o_df['NOC'] == 'CHN']

# Count the Medal entries within each Edition (Olympic year) to see the trend over time.
# NOTE(review): each row appears to be one athlete's medal, so a team-event medal may be
# counted once per team member — confirm this is the intended definition of "medal count".
medal_trend_chn = (
    china_data
    .groupby('Edition')['Medal']
    .count()
)

# Display the per-edition counts to confirm the result.
medal_trend_chn



Edition
1984     76
1988     53
1992     83
1996    110
2000     79
2004     94
2008    184
Name: Medal, dtype: int64

In [11]:
# Print the DataFrame summary again (the same o_df.info() call made during data exploration).
o_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29216 entries, 0 to 29215
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   City          29216 non-null  object
 1   Edition       29216 non-null  int64 
 2   Sport         29216 non-null  object
 3   Discipline    29216 non-null  object
 4   Athlete       29216 non-null  object
 5   NOC           29216 non-null  object
 6   Gender        29216 non-null  object
 7   Event         29216 non-null  object
 8   Event_gender  29216 non-null  object
 9   Medal         29216 non-null  object
dtypes: int64(1), object(9)
memory usage: 2.2+ MB
