## Exercise : Data Collection

### Importing the libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Requesting data from Wikipedia regarding current ATP rankings

In [2]:
# Send a request to the URL
url = "https://en.wikipedia.org/wiki/ATP_rankings"
response = requests.get(url)

In [3]:
# Parse HTML content with Beautiful Soup
soup = BeautifulSoup(response.content, "html.parser")


### Selecting the table from the webpage

In [4]:
# Find the ATP rankings table
table = soup.find("table", {"class": "wikitable nowrap"})


In [5]:
type(table)

bs4.element.Tag

### Extracting the table headers and rows

In [6]:
# Extract the table headers and rows
headers = []
rows = []
for row in table.find_all("tr"):
    cells = row.find_all("td")
    if len(cells) > 0:
        row_data = [cell.text.strip() for cell in cells]
        rows.append(row_data)
    else:
        header_cells = row.find_all("th")
        headers = [cell.text.strip() for cell in header_cells]

In [7]:
rows

[['1', 'Novak Djokovic\xa0(SRB)', '7,240', ''],
 ['2', 'Carlos Alcaraz\xa0(ESP)', '6,770', ''],
 ['3', 'Casper Ruud\xa0(NOR)', '5,255', '1'],
 ['4', 'Daniil Medvedev\xa0(RUS)', '5,240', '1'],
 ['5', 'Stefanos Tsitsipas\xa0(GRE)', '4,950', '2'],
 ['6', 'Andrey Rublev\xa0(RUS)', '4,380', ''],
 ['7', 'Holger Rune\xa0(DEN)', '3,865', '2'],
 ['8', 'Jannik Sinner\xa0(ITA)', '3,525', ''],
 ['9', 'Félix Auger-Aliassime\xa0(CAN)', '3,450', '2'],
 ['10', 'Taylor Fritz\xa0(USA)', '3,245', ''],
 ['11', 'Karen Khachanov\xa0(RUS)', '2,900', '1'],
 ['12', 'Frances Tiafoe\xa0(USA)', '2,870', '1'],
 ['13', 'Cameron Norrie\xa0(GBR)', '2,735', '1'],
 ['14', 'Rafael Nadal\xa0(ESP)', '2,715', '1'],
 ['15', 'Hubert Hurkacz\xa0(POL)', '2,660', '2'],
 ['16', 'Alexander Zverev\xa0(GER)', '2,140', ''],
 ['17', 'Pablo Carreño Busta\xa0(ESP)', '2,095', ''],
 ['18', 'Tommy Paul\xa0(USA)', '2,070', ''],
 ['19', 'Alex de Minaur\xa0(AUS)', '2,050', ''],
 ['20', 'Lorenzo Musetti\xa0(ITA)', '1,930', '1']]

In [8]:
headers

['No.', 'Player', 'Points', 'Move']

### Creating a dataframe with the rows and headers

In [9]:
# Create a DataFrame from the scraped data
df = pd.DataFrame(rows, columns=headers)

# Print the DataFrame
print(df)


   No.                       Player Points Move
0    1         Novak Djokovic (SRB)  7,240     
1    2         Carlos Alcaraz (ESP)  6,770     
2    3            Casper Ruud (NOR)  5,255    1
3    4        Daniil Medvedev (RUS)  5,240    1
4    5     Stefanos Tsitsipas (GRE)  4,950    2
5    6          Andrey Rublev (RUS)  4,380     
6    7            Holger Rune (DEN)  3,865    2
7    8          Jannik Sinner (ITA)  3,525     
8    9  Félix Auger-Aliassime (CAN)  3,450    2
9   10           Taylor Fritz (USA)  3,245     
10  11        Karen Khachanov (RUS)  2,900    1
11  12         Frances Tiafoe (USA)  2,870    1
12  13         Cameron Norrie (GBR)  2,735    1
13  14           Rafael Nadal (ESP)  2,715    1
14  15         Hubert Hurkacz (POL)  2,660    2
15  16       Alexander Zverev (GER)  2,140     
16  17    Pablo Carreño Busta (ESP)  2,095     
17  18             Tommy Paul (USA)  2,070     
18  19         Alex de Minaur (AUS)  2,050     
19  20        Lorenzo Musetti (ITA)  1,9

### Checking for null values

In [10]:
df.info

<bound method DataFrame.info of    No.                       Player Points Move
0    1         Novak Djokovic (SRB)  7,240     
1    2         Carlos Alcaraz (ESP)  6,770     
2    3            Casper Ruud (NOR)  5,255    1
3    4        Daniil Medvedev (RUS)  5,240    1
4    5     Stefanos Tsitsipas (GRE)  4,950    2
5    6          Andrey Rublev (RUS)  4,380     
6    7            Holger Rune (DEN)  3,865    2
7    8          Jannik Sinner (ITA)  3,525     
8    9  Félix Auger-Aliassime (CAN)  3,450    2
9   10           Taylor Fritz (USA)  3,245     
10  11        Karen Khachanov (RUS)  2,900    1
11  12         Frances Tiafoe (USA)  2,870    1
12  13         Cameron Norrie (GBR)  2,735    1
13  14           Rafael Nadal (ESP)  2,715    1
14  15         Hubert Hurkacz (POL)  2,660    2
15  16       Alexander Zverev (GER)  2,140     
16  17    Pablo Carreño Busta (ESP)  2,095     
17  18             Tommy Paul (USA)  2,070     
18  19         Alex de Minaur (AUS)  2,050     
19  20  

In [11]:
df.isnull().sum()

No.       0
Player    0
Points    0
Move      0
dtype: int64

### It looks like the column 'Move' has spaces as values. So, we need to replace it with zeros. This is because, this column basically indicates the level in which the player moves.

In [12]:
df['Move'].unique()

array(['', '1', '2'], dtype=object)

In [13]:
# replace columns with only spaces with zeros
df.loc[:, df.eq(' ').all()] = 0


In [14]:
df.head()

Unnamed: 0,No.,Player,Points,Move
0,1,Novak Djokovic (SRB),7240,
1,2,Carlos Alcaraz (ESP),6770,
2,3,Casper Ruud (NOR),5255,1.0
3,4,Daniil Medvedev (RUS),5240,1.0
4,5,Stefanos Tsitsipas (GRE),4950,2.0


In [15]:
df['Move'].unique()

array(['', '1', '2'], dtype=object)

In [16]:
df = df.replace("", 0)

In [17]:
df.head()

Unnamed: 0,No.,Player,Points,Move
0,1,Novak Djokovic (SRB),7240,0
1,2,Carlos Alcaraz (ESP),6770,0
2,3,Casper Ruud (NOR),5255,1
3,4,Daniil Medvedev (RUS),5240,1
4,5,Stefanos Tsitsipas (GRE),4950,2


### Splitting the 'Player' column into two columns.

In [18]:
df[['Name','Country']] = df.Player.str.split('\(|\)', expand=True).iloc[:,[0,1]]

In [19]:
df.head()

Unnamed: 0,No.,Player,Points,Move,Name,Country
0,1,Novak Djokovic (SRB),7240,0,Novak Djokovic,SRB
1,2,Carlos Alcaraz (ESP),6770,0,Carlos Alcaraz,ESP
2,3,Casper Ruud (NOR),5255,1,Casper Ruud,NOR
3,4,Daniil Medvedev (RUS),5240,1,Daniil Medvedev,RUS
4,5,Stefanos Tsitsipas (GRE),4950,2,Stefanos Tsitsipas,GRE


In [20]:
df=df.drop(columns=['Player'])

In [21]:
df.head()

Unnamed: 0,No.,Points,Move,Name,Country
0,1,7240,0,Novak Djokovic,SRB
1,2,6770,0,Carlos Alcaraz,ESP
2,3,5255,1,Casper Ruud,NOR
3,4,5240,1,Daniil Medvedev,RUS
4,5,4950,2,Stefanos Tsitsipas,GRE


In [22]:
df = df.reindex(columns=['No.','Name','Country','Points','Move'])

In [23]:
df.head()

Unnamed: 0,No.,Name,Country,Points,Move
0,1,Novak Djokovic,SRB,7240,0
1,2,Carlos Alcaraz,ESP,6770,0
2,3,Casper Ruud,NOR,5255,1
3,4,Daniil Medvedev,RUS,5240,1
4,5,Stefanos Tsitsipas,GRE,4950,2


In [24]:
df.shape

(20, 5)

### Writing the dataset into a csv file.

In [25]:
df.to_csv('Atp_Clean.csv', index=False)

In [26]:
df.dtypes

No.        object
Name       object
Country    object
Points     object
Move       object
dtype: object