In [2]:
import pandas as pd

In [5]:
# Load the raw athlete biographies; the path is relative to the notebook's working directory.
BIOS_CSV_PATH = './athletes/bios.csv'
bios = pd.read_csv(BIOS_CSV_PATH)

In [10]:
# Preview the first five rows of the raw data to see what needs cleaning.
bios.head(n=5)

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order
0,Competed in Olympic Games,Male,"François Joseph Marie Antoine ""Jean-François""•...",Jean-François•Blanchy,"12 December 1886 in Bordeaux, Gironde (FRA)","2 October 1960 in Saint-Jean-de-Luz, Pyrénées-...",France,1,,,,,,,,
1,Competed in Olympic Games,Male,Arnaud Benjamin•Boetsch,Arnaud•Boetsch,"1 April 1969 in Meulan, Yvelines (FRA)",,France,2,183 cm / 76 kg,"Racing Club de France, Paris (FRA)",,,,,,
2,Competed in Olympic Games • Administrator,Male,Jean Laurent Robert•Borotra,Jean•Borotra,"13 August 1898 in Biarritz, Pyrénées-Atlantiqu...","17 July 1994 in Arbonne, Pyrénées-Atlantiques ...",France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondissant (The Bounding Basque),,,,,
3,Competed in Olympic Games,Male,Jacques Marie Stanislas Jean•Brugnon,Jacques•Brugnon,"11 May 1895 in Paris VIIIe, Paris (FRA)","20 March 1978 in Monaco, Monaco (MON)",France,4,168 cm / 64 kg,"Sporting club de Paris, Paris (FRA)",Toto,,,,,
4,Competed in Olympic Games,Male,Henry Albert•Canet,Albert•Canet,"17 April 1878 in Wandsworth, England (GBR)","25 July 1930 in Paris VIIe, Paris (FRA)",France,5,,"TCP, Paris (FRA)",,,,,,


### What should we clean up?

- [ ] Get rid of bullet points in the `Used name` column
- [ ] Split height/weight
- [ ] Parse out dates from the `Born` & `Died` columns
- [ ] Parse out city, region, and country from `Born` column
- [ ] Get rid of extra columns

In [13]:
# Work on an independent (deep) copy so the raw `bios` frame stays untouched.
df = bios.copy(deep=True)

In [14]:
# Summarise the columns: non-null counts and dtypes show which fields are sparsely populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145500 entries, 0 to 145499
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Roles          145500 non-null  object
 1   Sex            145500 non-null  object
 2   Full name      145500 non-null  object
 3   Used name      145500 non-null  object
 4   Born           143772 non-null  object
 5   Died           34042 non-null   object
 6   NOC            145499 non-null  object
 7   athlete_id     145500 non-null  int64 
 8   Measurements   107833 non-null  object
 9   Affiliations   95832 non-null   object
 10  Nick/petnames  9145 non-null    object
 11  Title(s)       399 non-null     object
 12  Other names    7167 non-null    object
 13  Nationality    8259 non-null    object
 14  Original name  30739 non-null   object
 15  Name order     7844 non-null    object
dtypes: int64(1), object(15)
memory usage: 17.8+ MB


# Getting rid of bullet points in the `Used name` column

First, check how many rows the `Used name` column has.

In [16]:
# Number of rows in the working frame (first element of the shape tuple).
df.shape[0]

145500

Is it the same as the number of entries that contain the bullet point?

In [28]:
# Count how many `Used name` entries contain the bullet separator "•".
# A vectorised `.str.contains` does the membership test for the whole column at once
# instead of looping over the rows in Python.
#   regex=False -> match the bullet as a literal character
#   na=False    -> missing / non-string entries count as "no bullet"
templist = df['Used name']
notokrows = int(templist.str.contains("•", regex=False, na=False).sum())
print(notokrows)

145500


So all rows have the bullet point in them.<br>
An alternative way to check this is given below.<br>The `str.contains()` method in pandas can be used here.

In [31]:
# Row-wise boolean mask: True where the `Used name` value contains the literal bullet "•".
df['Used name'].str.contains("•", regex=False)

0         True
1         True
2         True
3         True
4         True
          ... 
145495    True
145496    True
145497    True
145498    True
145499    True
Name: Used name, Length: 145500, dtype: bool

We create a `name` column by copying the `Used name` column with the bullet symbol (•) replaced by a space.

In [32]:
# Build a readable `name` column: the bullet "•" that separates the name parts in
# `Used name` is replaced by a single space (e.g. "Arnaud•Boetsch" -> "Arnaud Boetsch").
# regex=False pins a literal replacement: the default of `regex` differs between
# pandas 1.x and 2.x, so stating it keeps the result (and warnings) version-independent.
df['name'] = df['Used name'].str.replace("•", " ", regex=False)

In [33]:
# Check the first five rows: the new `name` column should appear as the last column.
df.head(n=5)

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,name
0,Competed in Olympic Games,Male,"François Joseph Marie Antoine ""Jean-François""•...",Jean-François•Blanchy,"12 December 1886 in Bordeaux, Gironde (FRA)","2 October 1960 in Saint-Jean-de-Luz, Pyrénées-...",France,1,,,,,,,,,Jean-François Blanchy
1,Competed in Olympic Games,Male,Arnaud Benjamin•Boetsch,Arnaud•Boetsch,"1 April 1969 in Meulan, Yvelines (FRA)",,France,2,183 cm / 76 kg,"Racing Club de France, Paris (FRA)",,,,,,,Arnaud Boetsch
2,Competed in Olympic Games • Administrator,Male,Jean Laurent Robert•Borotra,Jean•Borotra,"13 August 1898 in Biarritz, Pyrénées-Atlantiqu...","17 July 1994 in Arbonne, Pyrénées-Atlantiques ...",France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondissant (The Bounding Basque),,,,,,Jean Borotra
3,Competed in Olympic Games,Male,Jacques Marie Stanislas Jean•Brugnon,Jacques•Brugnon,"11 May 1895 in Paris VIIIe, Paris (FRA)","20 March 1978 in Monaco, Monaco (MON)",France,4,168 cm / 64 kg,"Sporting club de Paris, Paris (FRA)",Toto,,,,,,Jacques Brugnon
4,Competed in Olympic Games,Male,Henry Albert•Canet,Albert•Canet,"17 April 1878 in Wandsworth, England (GBR)","25 July 1930 in Paris VIIe, Paris (FRA)",France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet


In [34]:
# Split `Measurements` (e.g. "183 cm / 76 kg") into separate `Height` and `Weight` columns.
# Each value is located by its unit instead of by its position around " / ":
#   - an entry holding a single measurement (e.g. "166 cm") can only fill its own column,
#     so a weight-only entry, if any exist, is not misfiled under `Height`;
#   - an unexpected extra " / " cannot produce a third column and break the assignment.
# The captured text runs from the value's first digit through its unit ("183 cm", "76 kg").
# Rows without the corresponding measurement are left missing (NaN).
df['Height'] = df['Measurements'].str.extract(r'(\d[^/]*?cm)', expand=False)
df['Weight'] = df['Measurements'].str.extract(r'(\d[^/]*?kg)', expand=False)

In [35]:
# Display the frame to inspect the new `Height` and `Weight` columns
# (the rendering is truncated to the first and last rows).
df

Unnamed: 0,Roles,Sex,Full name,Used name,Born,Died,NOC,athlete_id,Measurements,Affiliations,Nick/petnames,Title(s),Other names,Nationality,Original name,Name order,name,Height,Weight
0,Competed in Olympic Games,Male,"François Joseph Marie Antoine ""Jean-François""•...",Jean-François•Blanchy,"12 December 1886 in Bordeaux, Gironde (FRA)","2 October 1960 in Saint-Jean-de-Luz, Pyrénées-...",France,1,,,,,,,,,Jean-François Blanchy,,
1,Competed in Olympic Games,Male,Arnaud Benjamin•Boetsch,Arnaud•Boetsch,"1 April 1969 in Meulan, Yvelines (FRA)",,France,2,183 cm / 76 kg,"Racing Club de France, Paris (FRA)",,,,,,,Arnaud Boetsch,183 cm,76 kg
2,Competed in Olympic Games • Administrator,Male,Jean Laurent Robert•Borotra,Jean•Borotra,"13 August 1898 in Biarritz, Pyrénées-Atlantiqu...","17 July 1994 in Arbonne, Pyrénées-Atlantiques ...",France,3,183 cm / 76 kg,"TCP, Paris (FRA)",Le Basque Bondissant (The Bounding Basque),,,,,,Jean Borotra,183 cm,76 kg
3,Competed in Olympic Games,Male,Jacques Marie Stanislas Jean•Brugnon,Jacques•Brugnon,"11 May 1895 in Paris VIIIe, Paris (FRA)","20 March 1978 in Monaco, Monaco (MON)",France,4,168 cm / 64 kg,"Sporting club de Paris, Paris (FRA)",Toto,,,,,,Jacques Brugnon,168 cm,64 kg
4,Competed in Olympic Games,Male,Henry Albert•Canet,Albert•Canet,"17 April 1878 in Wandsworth, England (GBR)","25 July 1930 in Paris VIIe, Paris (FRA)",France,5,,"TCP, Paris (FRA)",,,,,,,Albert Canet,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145495,Competed in Olympic Games,Female,Polina•Luchnikova,Polina•Luchnikova,"30 January 2002 in Serov, Sverdlovsk (RUS)",,ROC,149222,167 cm / 61 kg,"Agidel, Ufa (RUS)",,,,,,,Polina Luchnikova,167 cm,61 kg
145496,Competed in Olympic Games,Female,Valeriya Sergeyevna•Merkusheva,Valeriya•Merkusheva,"20 September 1999 in Moskva (Moscow), Moskva (...",,ROC,149223,168 cm / 65 kg,"SKIF Nizhny Novgorod, Nizhny Novgorod (RUS)",,,,,Валерия Сергеевна•Меркушева,,Valeriya Merkusheva,168 cm,65 kg
145497,Competed in Olympic Games,Female,Yuliya•Smirnova,Yuliya•Smirnova,"8 May 1998 in Kotlas, Arkhangelsk (RUS)",,ROC,149224,163 cm / 55 kg,"Agidel, Ufa (RUS)",,,,,Юлия•Смирнова,,Yuliya Smirnova,163 cm,55 kg
145498,Competed in Olympic Games,Male,André•Foussard,André•Foussard,"19 May 1899 in Niort, Deux-Sèvres (FRA)","18 March 1986 in Niort, Deux-Sèvres (FRA)",France,149225,166 cm,"Beauvoisine FC, Rouen (FRA)",,,,,,,André Foussard,166 cm,
