In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate

In [2]:
df = pd.read_csv("Virat-Kohli-International-Cricket-Centuries.csv")

In [3]:
# Display the DataFrame without the index with (.to_string(index = False))

# Here's the different types of df you can print
# print(df.to_string(index=False))                          # full data without automated serial number
# print(df.drop(columns=['Against']))                       # Full data without the selected column
# print(df.iloc[70:80].to_string(index=False))              # data required from a specific range
# print(tabulate(df, headers='keys', tablefmt='grid'))      # to print data in table format

In [4]:
df.head()
df.info()
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   No.       80 non-null     int64 
 1   Runs      80 non-null     int64 
 2   Against   80 non-null     object
 3   Position  80 non-null     int64 
 4   Innings   80 non-null     int64 
 5   Venue     80 non-null     object
 6   Ground    80 non-null     object
 7   Date      80 non-null     object
 8   Result    80 non-null     object
dtypes: int64(4), object(5)
memory usage: 5.8+ KB
           No.        Runs   Position   Innings
count  80.0000   80.000000  80.000000  80.00000
mean   40.5000  131.550000   3.500000   1.67500
std    23.2379   35.094772   0.656033   0.67082
min     1.0000  100.000000   2.000000   1.00000
25%    20.7500  107.000000   3.000000   1.00000
50%    40.5000  118.500000   3.000000   2.00000
75%    60.2500  139.250000   4.000000   2.00000
max    80.0000  254.000000   6.000000   4.0000

In [5]:
null_ground_rows = df[df['Ground'].isnull()]
print(null_ground_rows)

Empty DataFrame
Columns: [No., Runs, Against, Position, Innings, Venue, Ground, Date, Result]
Index: []


In [6]:
unique_results = df['Result'].unique()
print("Unique Results:", unique_results)

Unique Results: ['Tied' 'Won' 'Lost' 'Drawn']


In [7]:
result_counts = df['Result'].value_counts()
print("Count of each result:")
print(result_counts)

Count of each result:
Result
Won      55
Lost     14
Drawn    10
Tied      1
Name: count, dtype: int64


In [8]:
lost_match = df[df['Result'] == 'Lost'].to_string(index=False)
print(lost_match)

 No.  Runs       Against  Position  Innings                                         Venue Ground      Date Result
   6   107       England         4        1                       Sophia Gardens, Cardiff   Away 16-Sep-11   Lost
   9   116     Australia         6        2                        Adeaide Oval, Adelaide   Away 24-Jan-12   Lost
  23   123   New Zealand         3        2                           McLean Park, Napier   Away 19-Jan-14   Lost
  28   115     Australia         4        2                       Adelaide Oval, Adelaide   Away 09-Dec-14   Lost
  29   141     Australia         4        4                       Adelaide Oval, Adelaide   Away 09-Dec-14   Lost
  33   103     Sri Lanka         4        2            Galle International Stadium, Galle   Away 12-Aug-15   Lost
  35   117     Australia         3        1           Melbourne Cricket Ground, Melbourne   Away 17-Jan-16   Lost
  36   106     Australia         3        2                         Manuka Oval, Canberr

In [9]:
num = df[df['No.'].isin([11, 22])].to_string(index=False)
print(num)

 No.  Runs       Against  Position  Innings                                Venue  Ground      Date Result
  11   108     Sri Lanka         3        1 Sher-e-Bangla Cricket Stadium, Dhaka Neutral 13-Mar-12    Won
  22   119  South Africa         4        1      Wanderers Stadium, Johannesburg    Away 18-Dec-13  Drawn


In [10]:

drawn_match = df[df['Result'] == 'Drawn'].to_string(index=False)
print(drawn_match)

 No.  Runs       Against  Position  Innings                                        Venue Ground      Date Result
  16   103       England         5        2 Vidarbha Cricket Association Stadium, Nagpur   Home 13-Dec-12  Drawn
  22   119  South Africa         4        1              Wanderers Stadium, Johannesburg   Away 18-Dec-13  Drawn
  24   105   New Zealand         4        4                    Basin Reserve, Wellington   Away 14-Feb-14  Drawn
  30   169     Australia         4        2          Melbourne Cricket Ground, Melbourne   Away 26-Dec-14  Drawn
  31   147     Australia         4        2                Sydney Cricket Ground, Sydney   Away 06-Jan-15  Drawn
  50   104     Sri Lanka         4        3                        Eden Gardens, Kolkata   Home 16-Nov-17  Drawn
  52   243     Sri Lanka         4        1               Feroz Shah Kotla Ground, Delhi   Home 02-Dec-17  Drawn
  61   157   West Indies         3        1      ACA-VDCA Cricket Stadium, Visakhapatnam   Home 

In [11]:
innings = df[df['Innings'] > 3 ].to_string(index=False)
print(innings)

 No.  Runs      Against  Position  Innings                     Venue Ground      Date Result
  24   105  New Zealand         4        4 Basin Reserve, Wellington   Away 14-Feb-14  Drawn
  29   141    Australia         4        4   Adelaide Oval, Adelaide   Away 09-Dec-14   Lost


In [12]:
# Extracting Month of the Match
df['Date'] = pd.to_datetime(df['Date'], format='%d-%b-%y')  # Specify the format
df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
print(df['Day'].head())
print(df['Month'].head())
print(df['Year'].head())

0    24
1    11
2    20
3    28
4    19
Name: Day, dtype: int32
0    12
1     1
2    10
3    11
4     2
Name: Month, dtype: int32
0    2009
1    2010
2    2010
3    2010
4    2011
Name: Year, dtype: int32


In [13]:
df = df.drop(columns=['Date'])

Label Encoder

In [14]:
from sklearn.preprocessing import LabelEncoder


le = LabelEncoder()
df['Result'] = le.fit_transform(df['Result'])

# Access the mapping of original categories to encoded values
mapping = dict(zip(le.classes_, range(len(le.classes_))))
print("\nMapping of original categories to encoded values:")
print(mapping)

print("------------------------")
# Print the transformed DataFrame
print("Transformed DataFrame:")
print(df['Result'])


Mapping of original categories to encoded values:
{'Drawn': 0, 'Lost': 1, 'Tied': 2, 'Won': 3}
------------------------
Transformed DataFrame:
0     2
1     3
2     3
3     3
4     3
     ..
75    0
76    3
77    3
78    3
79    3
Name: Result, Length: 80, dtype: int32


In [15]:
print(df['Result'].dtype)
print(df['Result'] == 4)

int32
0     False
1     False
2     False
3     False
4     False
      ...  
75    False
76    False
77    False
78    False
79    False
Name: Result, Length: 80, dtype: bool


In [16]:
le = LabelEncoder()
df['Ground'] = le.fit_transform(df['Ground'])

mapping = dict(zip(le.classes_, range(len(le.classes_))))
print("\nMapping of original categories to encoded values:")
print(mapping)

print("------------------------")

# Print the transformed DataFrame
print("Transformed DataFrame:")
print(df['Ground'].head())


Mapping of original categories to encoded values:
{'Away': 0, 'Home': 1, 'Neutral': 2}
------------------------
Transformed DataFrame:
0    1
1    0
2    1
3    1
4    0
Name: Ground, dtype: int32


In [17]:
print( df['Ground'] == 1)

0      True
1     False
2      True
3      True
4     False
      ...  
75    False
76    False
77     True
78     True
79     True
Name: Ground, Length: 80, dtype: bool


In [18]:
df.info()
print(df.describe())
df.head()
print(df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   No.       80 non-null     int64 
 1   Runs      80 non-null     int64 
 2   Against   80 non-null     object
 3   Position  80 non-null     int64 
 4   Innings   80 non-null     int64 
 5   Venue     80 non-null     object
 6   Ground    80 non-null     int32 
 7   Result    80 non-null     int32 
 8   Day       80 non-null     int32 
 9   Month     80 non-null     int32 
 10  Year      80 non-null     int32 
dtypes: int32(5), int64(4), object(2)
memory usage: 5.4+ KB
           No.        Runs   Position   Innings     Ground     Result  \
count  80.0000   80.000000  80.000000  80.00000  80.000000  80.000000   
mean   40.5000  131.550000   3.500000   1.67500   0.625000   2.262500   
std    23.2379   35.094772   0.656033   0.67082   0.623891   1.144538   
min     1.0000  100.000000   2.000000   1.00000   0

In [19]:
print(df[df['Year'] == 2013].to_string(index=False))

print(" --------------------------------")

# Finding the top 10 highest scoring matches
top_10_matches = df.nlargest(10, 'Runs')
print("Top 10 highest scoring matches:")
print(top_10_matches)

print("--------------------------------")


 No.  Runs       Against  Position  Innings                              Venue  Ground  Result  Day  Month  Year
  17   107     Australia         5        2 M. A. Chidambaram Stadium, Chennai       1       3   22      2  2013
  18   102   West Indies         3        1   Queen's Park Oval, Port of Spain       0       3    5      7  2013
  19   115      Zimbabwe         3        2         Harare Sports Club, Harare       0       3   24      7  2013
  20   100     Australia         3        2     Sawai Mansingh Stadium, Jaipur       1       3   16     10  2013
  21   115     Australia         3        2                VCA Stadium, Nagpur       1       3   30     10  2013
  22   119  South Africa         4        1    Wanderers Stadium, Johannesburg       0       0   18     12  2013
 --------------------------------
Top 10 highest scoring matches:
    No.  Runs        Against  Position  Innings  \
68   69   254   South Africa         4        1   
51   52   243      Sri Lanka         4   