# Importing Libraries

In [2]:
import pandas as pd
import seaborn as sns
import os

# Loading Data

In [3]:
# Folder holding the CSV files to combine (presumably a Google Drive mount in Colab; adjust when running elsewhere).
folder_path = '/content/drive/MyDrive/Tennis_Analysis/full_matches_data'

# Collect every regular file whose name ends in .csv.
# os.listdir returns entries in arbitrary order, so sort the names to make the
# row order (and the renumbered index) of the combined frame reproducible.
all_files = sorted(
    f for f in os.listdir(folder_path)
    if f.endswith('.csv') and os.path.isfile(os.path.join(folder_path, f))
)

# pd.concat raises an opaque "No objects to concatenate" ValueError on an empty
# list; raise the same exception type with a message that names the folder.
if not all_files:
    raise ValueError(f"No .csv files found in {folder_path!r}")

# Read each file into its own DataFrame, then stack them into one frame with a
# fresh 0..n-1 index (ignore_index=True discards the per-file row numbers).
combined_df = pd.concat(
    [pd.read_csv(os.path.join(folder_path, f)) for f in all_files],
    ignore_index=True,
)

In [5]:
# Preview 29 randomly chosen rows. A fixed random_state keeps the displayed
# sample identical on every Restart & Run All (the value 42 is arbitrary).
combined_df.sample(29, random_state=42)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
92824,2019-0407,Rotterdam,Hard,32,A,20190211,272,104269,,,...,42.0,32.0,9.0,11.0,5.0,8.0,26.0,1455.0,45.0,1001.0
84491,2016-M007,Miami Masters,Hard,128,M,20160321,208,106298,,,...,40.0,27.0,7.0,10.0,4.0,8.0,88.0,622.0,49.0,918.0
19870,1996-407,Rotterdam,Carpet,32,A,19960304,22,101820,4.0,,...,63.0,46.0,15.0,15.0,3.0,6.0,12.0,1651.0,44.0,841.0
85954,2016-6242,Winston-Salem,Hard,64,A,20160822,281,104655,3.0,,...,20.0,13.0,20.0,9.0,5.0,9.0,20.0,1745.0,53.0,829.0
26117,1997-747,Beijing,Hard,32,A,19970929,28,102202,,,...,80.0,48.0,23.0,18.0,5.0,10.0,91.0,507.0,83.0,575.0
95377,2020-580,Australian Open,Hard,128,G,20200120,109,103333,,,...,58.0,48.0,23.0,17.0,3.0,5.0,124.0,436.0,138.0,392.0
82563,2015-D064,Davis Cup G2 PO: RSA vs IRL,Hard,4,D,20150717,2,120401,,,...,,,,,,,334.0,140.0,425.0,96.0
36741,2000-D059,Davis Cup G1 PO: POR vs RSA,Clay,4,D,20001006,4,101965,,,...,,,,,,,20.0,1243.0,352.0,80.0
17923,1995-423,Los Angeles,Hard,32,A,19950731,29,102358,5.0,,...,51.0,45.0,25.0,16.0,8.0,10.0,16.0,1551.0,7.0,2660.0
51379,2005-422,Cincinnati Masters,Hard,64,M,20050815,45,104053,5.0,,...,75.0,53.0,15.0,16.0,1.0,3.0,5.0,3130.0,24.0,1245.0


# Understanding the Big Picture

In [7]:
# Display the (row count, column count) of the combined frame.
combined_df.shape

(104682, 49)

In [8]:
# Print a per-column summary: column name, non-null count, and dtype.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104682 entries, 0 to 104681
Data columns (total 49 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   tourney_id          104682 non-null  object        
 1   tourney_name        104682 non-null  object        
 2   surface             104682 non-null  object        
 3   draw_size           104682 non-null  int64         
 4   tourney_level       104682 non-null  object        
 5   tourney_date        104682 non-null  datetime64[ns]
 6   match_num           104682 non-null  int64         
 7   winner_id           104682 non-null  int64         
 8   winner_seed         42400 non-null   float64       
 9   winner_entry        12809 non-null   object        
 10  winner_name         104682 non-null  object        
 11  winner_hand         104673 non-null  object        
 12  winner_ht           102228 non-null  float64       
 13  winner_ioc          104682 no

# Data Cleaning

Checking whether the dataset contains any duplicate rows.

In [None]:
# Count rows that exactly repeat an earlier row across every column
# (subset=None compares all columns; keep='first' leaves the first occurrence unflagged).
number_of_duplicated_rows = combined_df.duplicated(subset=None, keep='first').sum()
print(f"There are {number_of_duplicated_rows} duplicated rows in the dataset.")

Converting the `tourney_date` column from `YYYYMMDD` values (e.g. `20190211`) to pandas datetime values, which are displayed as `YYYY-MM-DD` (e.g. `2019-02-11`).

In [6]:
# Parse the YYYYMMDD values in tourney_date into pandas datetimes and store the
# result back in the same column of combined_df.
# NOTE(review): this cell's execution count (In [6]) is lower than the shape/info
# cells placed before it, whose output already lists tourney_date as
# datetime64[ns] -- a fresh top-to-bottom run will show different output there.
combined_df['tourney_date'] = pd.to_datetime(combined_df['tourney_date'], format='%Y%m%d')
# Show the first five rows to confirm the converted dates.
combined_df.head()

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1991-339,Adelaide,Hard,32,A,1990-12-31,1,101723,,,...,62.0,44.0,23.0,16.0,6.0,8.0,56.0,,2.0,
1,1991-339,Adelaide,Hard,32,A,1990-12-31,2,100946,,Q,...,41.0,35.0,27.0,15.0,1.0,2.0,304.0,,75.0,
2,1991-339,Adelaide,Hard,32,A,1990-12-31,3,101234,,,...,37.0,22.0,6.0,8.0,4.0,8.0,82.0,,69.0,
3,1991-339,Adelaide,Hard,32,A,1990-12-31,4,101889,8.0,,...,45.0,30.0,11.0,10.0,5.0,8.0,50.0,,84.0,
4,1991-339,Adelaide,Hard,32,A,1990-12-31,5,101274,,,...,41.0,28.0,15.0,11.0,4.0,8.0,88.0,,28.0,


Deleting unnecessary columns