In [1]:
# Imports and Helper Functions
# data Analysis
import pandas as pd
import numpy as np
import random as rng
from datetime import datetime

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid') 

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last_expr"

In [24]:
pd.options.display.float_format = '{:,.2f}'.format
raw_data = pd.read_csv("data/data.csv")
data = raw_data.copy(True)

In [13]:
fighters = pd.read_csv("data/fighters.csv")
categories_data = pd.read_csv("data/categories.csv")
fighter_mean = pd.read_csv("data/fighter_means.csv")
data_edited = pd.read_csv("data/data_edited.csv")
data_cleaned = pd.read_csv("data/cleaned_data.csv")

In [40]:
data.shape

(2278, 895)

In [None]:
data.head()

In [None]:
data.info()

In [None]:
data.describe()

In [6]:
data.describe(include=['object']) 

Unnamed: 0,B_HomeTown,B_Location,B_Name,Date,R_HomeTown,R_Location,R_Name,winby,winner
count,2301,2305,2318,2318,2293,2294,2318,2282,2318
unique,724,567,949,389,652,504,842,3,4
top,Rio de Janeiro Brazil,Rio de Janeiro Brazil,Kevin Lee,10/08/2018,Rio de Janeiro Brazil,Rio de Janeiro Brazil,Donald Cerrone,DEC,red
freq,46,58,11,76,65,99,14,1111,1327


In [30]:
#mala slova imena
data['R_Name'] = data['R_Name'].str.lower()
data['B_Name'] = data['B_Name'].str.lower()

In [8]:

#nan vrijednosti u winby stupcu
no_winby = data[data['winby'].isnull()]
print(len(no_winby), "rows where 'winby' is not set.")

#First thought is maybe draws and no-contests don't populate this column.
print(data['winner'][data['winner'] == 'no contest'].size, 'no contests')
print(data['winner'][data['winner'] == 'draw'].size, 'draws')

print(no_winby['winner'][no_winby['winner'] == 'draw'].size, 'draws with an empty \'winby\' value')
print(no_winby['winner'][no_winby['winner'] == 'no contest'].size, 'no contest with an empty \'winby\' value')

#every draw counts for an empty winby, but not every "no contest". Let's set all the draws to a new win type - "DRAW"
data.loc[data['winner'] == 'draw', 'winby'] = 'DRAW'
no_winby = data[data['winby'].isnull()]

# remaining no-contests -  gives the context  to deal with the no-contests with a missing "winby"
data[['B_Name', 'R_Name', 'Date', 'Last_round', 'winner', 'winby']][(data['winner'] == 'no contest') & (data['winby'].notnull())] 

#After Googling these, turns out the majority  are from wins being overturned by failed drug test results.
# deciding to overwrite the "winner" column here with the original winner

b = 'blue'
r = 'red'

data.at[40, 'winner'] = b
data.at[70, 'winner'] = r
data.at[234, 'winner'] = r
data.at[255, 'winner'] = r
data.at[301, 'winner'] = b
data.at[403, 'winner'] = r
data.at[428, 'winner'] = r
data.at[513, 'winner'] = r
data.at[628, 'winner'] = r
data.at[734, 'winner'] = r
data.at[894, 'winby'] = np.NaN
data.at[1179, 'winner'] = r
data.at[1389, 'winner'] = b
data.at[1473, 'winner'] = b
data.at[1475, 'winner'] = b
data.at[1522, 'winner'] = b
data.at[1664, 'winner'] = b
data.at[1764, 'winner'] = b

# recompute no_winby
no_winby = data[data['winby'].isnull()]


#Now let's look at the remaining matches, in more granular detail.
# extract any column that would help with Googling the match
no_winby[['B_Name', 'R_Name', 'Date', 'Last_round', 'winner']]

#After Googling each of these fights, added two new values for "winby", "DQ" for disqualifications and "NC" for no-contests ( this won't be applied to wins that were overturned to become no-contests, as explained above).
s = 'SUB'
d = 'DEC'
k = 'KO/TKO'

data.at[36, 'winby'] = s # Omari Akhmedov	Gunnar Nelson: SUB
data.at[170, 'winby'] = s # Johnny Bedataord	Rani Yahya: head bump NC
data.at[177, 'winby'] = d # Rashid Magomedov vs Rodrigo Damm: DEC
data.at[364, 'winby'] = d # Efrain Escudero	Leonardo Santos: DEC
data.at[1803, 'winby'] = k # Gilbert Burns	Jason Saggo: KO/TKO
data.at[2234, 'winby'] = d # Liu Pingyuan	Damian Stasiak: DEC
data.at[2286, 'winby'] = d # Joey Gomez	Kevin Aguilar: DEC
data.at[2287, 'winby'] = k # Alton Cunningham	Bevon Lewis: KO/TKO
data.at[2288, 'winby'] = d # Ricky Palacios	Toby Misech: DEC
data.at[2289, 'winby'] = k # Rilley Dutro	Jordan Espinosa: KO/TKO
data.at[2290, 'winby'] = k # Jamie Colleen	Maycee Barber: KO/TKO
data.at[2291, 'winby'] = s # Dom Pilarte	Vincent Morales: SUB
data.at[2292, 'winby'] = k # Josh Appelt	Jeff Hughes: KO/TKO

no_winby[['B_Name', 'R_Name', 'Date', 'Last_round', 'winner', 'winby']]


36 rows where 'winby' is not set.
24 no contests
16 draws
16 draws with an empty 'winby' value
6 no contest with an empty 'winby' value


Unnamed: 0,B_Name,R_Name,Date,Last_round,winner,winby
40,louis gaudinot,phil harris,06/21/2014,1,no contest,SUB
70,yaotzin meza,chico camus,06/06/2014,3,no contest,DEC
234,keith berish,robert drysdale,01/11/2016,1,no contest,SUB
255,bubba bush,kevin casey,07/07/2015,1,no contest,KO/TKO
301,brian ortega,mike de la torre,08/22/2014,1,no contest,SUB
403,jerrod sanders,pedro munhoz,10/06/2014,1,no contest,SUB
428,joshua burkman,hector lombard,02/13/2017,3,no contest,DEC
513,nick diaz,anderson silva,02/02/2015,5,no contest,DEC
628,drew dober,leandro silva,03/26/2015,2,no contest,SUB
734,damon jackson,rony jason,05/31/2015,1,no contest,SUB


Unnamed: 0,B_Name,R_Name,Date,Last_round,winner
36,omari akhmedov,gunnar nelson,06/03/2014,1,red
170,johnny bedford,rani yahya,05/21/2014,1,no contest
177,rashid magomedov,rodrigo damm,11/05/2016,3,blue
364,efrain escudero,leonardo santos,09/15/2014,3,red
493,daron cruickshank,kj noons,12/16/2014,2,no contest
622,norifumi yamamoto,roman salazar,03/04/2015,2,no contest
894,jim alers,cole miller,02/16/2016,2,no contest
973,kevin casey,antonio carlos junior,12/14/2015,1,no contest
1450,tim means,alex oliveira,01/02/2017,1,no contest
1576,dustin poirier,eddie alvarez,05/17/2017,2,no contest


Unnamed: 0,B_Name,R_Name,Date,Last_round,winner,winby
36,omari akhmedov,gunnar nelson,06/03/2014,1,red,
170,johnny bedford,rani yahya,05/21/2014,1,no contest,
177,rashid magomedov,rodrigo damm,11/05/2016,3,blue,
364,efrain escudero,leonardo santos,09/15/2014,3,red,
493,daron cruickshank,kj noons,12/16/2014,2,no contest,
622,norifumi yamamoto,roman salazar,03/04/2015,2,no contest,
894,jim alers,cole miller,02/16/2016,2,no contest,
973,kevin casey,antonio carlos junior,12/14/2015,1,no contest,
1450,tim means,alex oliveira,01/02/2017,1,no contest,
1576,dustin poirier,eddie alvarez,05/17/2017,2,no contest,


In [4]:
#------Drop Nan from winby(DC,NC)--------------
data = data.dropna(subset=['winby'])

In [9]:
#Wrong rounds
#There's a max-3-round fight that ends in decision...that had less than 3 rounds?
# fight in question
data[(data['Max_round'] == 3) & (data['winby'] == 'DEC') & (data['Last_round'] < 3 )]

Unnamed: 0,BPrev,BStreak,B_Age,B_Height,B_HomeTown,B_ID,B_Location,B_Name,B_Weight,B__Round1_Grappling_Reversals_Landed,...,R__Round5_TIP_Ground Time,R__Round5_TIP_Guard Control Time,R__Round5_TIP_Half Guard Control Time,R__Round5_TIP_Misc. Ground Control Time,R__Round5_TIP_Mount Control Time,R__Round5_TIP_Neutral Time,R__Round5_TIP_Side Control Time,R__Round5_TIP_Standing Time,winby,winner
334,0,0,38.0,167.0,"Phoenix, Arizona United States",2304,"Scottsdale, Arizona United States",frankie saenz,61.0,,...,,,,,,,,,DEC,blue


In [10]:
# correct it to 3 (verified using Google)
data.loc[334, 'Last_round'] = 3
print('"Last_round" set to', data.loc[334, 'Last_round'], 'for the fight:', data.loc[334, 'B_Name'], 'vs', data.loc[334, 'R_Name'])

"Last_round" set to 3 for the fight: frankie saenz vs nolan ticman


In [11]:
#Weight: static and missing values
#"Looks like there are some null values, let's see if we can fill those in.
# get names of fighters will missing weights
missing_weight_names = pd.concat([data['B_Name'][data['B_Weight'].isnull()], data['R_Name'][data['R_Weight'].isnull()]]).unique()

# fill in with Googled values
weights = {
    'lipeng zhang': 70,
    'antonio carlos junior': 84,
    'aleksei oleinik': 65,
    'cat zingano': 65,
    'yao zhikui': 56,
    'jack marshman': 84,
    'allan zuniga': 70 
}

missing_weight_names = pd.concat([data['B_Name'][data['B_Weight'].isnull()], data['R_Name'][data['R_Weight'].isnull()]]).unique()

for name in missing_weight_names:
    data.loc[data['B_Name'] == name, 'B_Weight'] = weights[name]
    data.loc[data['R_Name'] == name, 'R_Weight'] = weights[name]


In [12]:
#ispis kategorija
sorted([ weight for weight in pd.concat([data['B_Weight'], data['R_Weight']]).unique() if weight <= 93 ])

[52.0, 56.0, 61.0, 65.0, 70.0, 76.0, 77.0, 84.0, 93.0]

In [13]:
#76 is abnormal, as 77 is representative of the middeweight class. Let's set it to 77.
data.loc[data['B_Weight'] == 76, 'B_Weight'] = 77
data.loc[data['R_Weight'] == 76, 'R_Weight'] = 77

In [21]:
#ispi borbi s 4 runde, Let's try to figure out why some fights have "Max_round" set to 4
data[['B_Name', 'R_Name', 'Last_round', 'Max_round']][data['Max_round'] == 4]

Unnamed: 0,B_Name,R_Name,Last_round,Max_round
79,eddie gordon,dhiego lima,1,4
235,matt van buren,corey anderson,1,4
306,jianping yang,ning guangyou,3,4
722,fernando bruno,glaico franca moreira,3,4
723,dileno lopes,reginaldo vieira,3,4
783,hayder hassan,kamaru usman,2,4
871,enrique marin,erick montano,3,4
873,enrique barzola,horacio gutierrez,3,4
967,ryan hall,artem lobov,3,4
1156,amanda cooper,tatiana suarez,1,4


In [22]:

#aftere googling fight, replacing Max_round of 4 to 3 instead
data.loc[data['Max_round'] == 4, 'Max_round'] = 3

# print to double check
print('There are', data['Max_round'][data['Max_round'] == 3].size, 'fights with max 3 rounds')
print('There are', data['Max_round'][data['Max_round'] == 4].size, 'fights with max 4 rounds')
print('There are', data['Max_round'][data['Max_round'] == 5].size, 'fights with max 5 rounds')

There are 2099 fights with max 3 rounds
There are 0 fights with max 4 rounds
There are 219 fights with max 5 rounds


In [28]:
#There's a max-3-round fight that ends in decision...that had less than 3 rounds?
# fight in question
data[(data['Max_round'] == 3) & (data['winby'] == 'DEC') & (data['Last_round'] < 3 )][['B_Name', 'R_Name', 'Max_round','winby','Last_round']]
# correct it to 3 (verified using Google)
data.loc[334, 'Last_round'] = 3
print('"Last_round" set to', data['Last_round'].iloc[334], 'for the fight:', data['B_Name'].iloc[334], 'vs', data['R_Name'].iloc[334])

Unnamed: 0,B_Name,R_Name,Max_round,winby,Last_round
334,Frankie Saenz,Nolan Ticman,3,DEC,1


In [37]:
#As you can see above, one of these must be true:
#the Age field represents the fighter's age at the time of dataset creation, not at the time of the fight.
#Jose Aldo is a time lord...
#Since we'll be exploring age later, let's adjust this column to reflect age at time of fight. We'll do this by taking the fighter's age and subtract the time difference between the fight's date and the last fight in the dataset. We'll leave it as an integer, for discrete grouping.
data[['R_Name', 'R_Age', 'Date']][data['R_Name'] == 'conor mcgregor']

# find the date of the last fight 
data['Date'].max()

Unnamed: 0,R_Name,R_Age,Date
267,conor mcgregor,26,07/21/2014
502,conor mcgregor,27,01/19/2015
1064,conor mcgregor,28,03/07/2016


'19/12/2015'

In [33]:
#The "max" date is in European format, while the majority of the dates are in American format. 
# outliers should bubble to the top or bottom
data.sort_values('Date')['Date'].tail(20)

480     12/22/2014
620     12/22/2014
489     12/22/2014
483     12/22/2014
488     12/22/2014
311     12/23/2014
831     12/23/2015
1807    16/09/2017
1806    17/09/2017
1238    19/12/2015
12      2014-02-16
197     2014-06-08
78      2014-06-29
384     2014-10-04
449     2014-11-17
334     2014-12-20
686     2015-05-23
1032    21/02/2016
415     22/07/2018
853     24/10/2015
Name: Date, dtype: object

In [34]:
# setting outliers to standard slash format
data.at[12, 'Date'] = '02/16/2014'
data.at[197, 'Date'] = '06/08/2014'
data.at[78, 'Date'] = '06/29/2014'
data.at[384, 'Date'] = '10/04/2014'
data.at[449, 'Date'] = '11/17/2014'
data.at[334, 'Date'] = '12/20/2014'
data.at[686, 'Date'] = '05/23/2015'
data.at[1032, 'Date'] = '02/21/2016'
data.at[415, 'Date'] = '07/22/2018'
data.at[853, 'Date'] = '10/24/2015'

# indicate added column with "_" prefix
data['_Date_year'] = data['Date'].transform(lambda data: int(data[-4:]))

In [35]:
# recalculate age
# formula: Age - (last_year - fight_year)
data['R_Age'] = data['R_Age'] - (data['_Date_year'].max() - data['_Date_year'])
data['B_Age'] = data['B_Age'] - (data['_Date_year'].max() - data['_Date_year'])

# checking
data[['R_Name', 'R_Age', 'Date']][data['R_Name'] == 'lipeng zhang']


Unnamed: 0,R_Name,R_Age,Date
307,lipeng zhang,,08/25/2014


In [36]:
#------------------------------------replace Nan from Age, replace Nan from Height, ostalo NaN fill 0
columns_to_fill = ['B_Age', 'R_Age', 'B_Height', 'R_Height']
data[columns_to_fill] = data[columns_to_fill].apply(lambda x: x.fillna(x.mean()).round().astype(int))

data.loc[:, :] = data.fillna(value=0)

string_to_remove = "draw"
data = data[data["winner"] != string_to_remove]
string_to_remove = "no contest"
data = data[data["winner"] != string_to_remove]

In [13]:
data.to_csv('data/cleaned_data.csv', index=False)

In [26]:
#---------------------Feature engenering
data_source = pd.read_csv("data/cleaned_data.csv")



dropdata = data_source.drop(['Event_ID', 'Fight_ID', 'B_Location', 'B_HomeTown', 'B_ID', 'R_Location', 'R_HomeTown', 'R_ID',
                             'Date', 'Last_round', 'Max_round', '_Date_year', 'winby'], axis=1)

dropdata.rename(columns={'BPrev':'B__Prev',
                        'RPrev':'R__Prev',
                        'B_Name':'B__Name',
                        'B_Age':'B__Age',
                        'B_Height':'B__Height',
                        'B_Weight':'B__Weight',
                        'BStreak':'B__Streak',
                        'R_Name':'R__Name',
                        'R_Age':'R__Age',
                        'R_Height':'R__Height',
                        'R_Weight':'R__Weight',                       
                        #'Winby':'Winby',
                        'winner':'Winner'}, inplace=True)

#--------------------------------Calculate RStreak---------------------
fighter_streaks = {}  # Dictionary to store fighter streaks
df = dropdata
winStyle = {}

for index, row in df.iterrows():
    blue_fighter = row['B__Name']
    red_fighter = row['R__Name']
    winner = row['Winner']
    #winby = row['winby']

    if blue_fighter not in fighter_streaks:
        fighter_streaks[blue_fighter] = 0
    if red_fighter not in fighter_streaks:
        fighter_streaks[red_fighter] = 0

    # if blue_fighter not in winStyle:
    #     winStyle[blue_fighter] = {'TKO': 0, 'SUB': 0, 'DEC': 0}
    # if red_fighter not in winStyle:
    #     winStyle[red_fighter] = {'TKO': 0, 'SUB': 0, 'DEC': 0}


    df.at[index, 'B__Streak'] = int(fighter_streaks[blue_fighter])
    df.at[index, 'R__Streak'] = int(fighter_streaks[red_fighter])

    if winner == "blue":
        fighter_streaks[blue_fighter] += 1
        fighter_streaks[red_fighter] = 0

        # if winby == 'KO/TKO':
        #     winStyle[blue_fighter]['TKO'] += 1
        # if winby == 'SUB':
        #     winStyle[blue_fighter]['SUB'] += 1
        # if winby == 'DEC':
        #     winStyle[blue_fighter]['DEC'] += 1


    else:
        fighter_streaks[red_fighter] += 1
        fighter_streaks[blue_fighter] = 0

        # if winby == 'KO/TKO':
        #     winStyle[blue_fighter]['TKO'] += 1
        # if winby == 'SUB':
        #     winStyle[blue_fighter]['SUB'] += 1
        # if winby == 'DEC':
        #     winStyle[blue_fighter]['DEC'] += 1


    # Com_Winning_Style_B = max(winStyle[blue_fighter], key=winStyle[blue_fighter].get)
    # Com_Winning_Style_R = max(winStyle[red_fighter], key=winStyle[red_fighter].get)
    
    # df.at[index, 'Com_Winning_Style_B'] = Com_Winning_Style_B  
    # df.at[index, 'Com_Winning_Style_R'] = Com_Winning_Style_R  
   

df.to_csv('data/data_edited.csv', index=False)

In [None]:
B__Prev,B__Streak,B__Age,B__Height,B__Weight,B__Round1_Grappling_Reversals_Landed,B__Round1_Grappling_S


In [16]:
names = np.unique(dropdata[['B__Name', 'R__Name']])
lower_names = [name.lower() for name in names]
np.savetxt('data/fighters.csv', lower_names, delimiter=',', fmt='%s', header='Fighters', comments='')

1098

In [17]:
#-------------------------------------------------Calculate mean for all fighters------------------------------
data = pd.read_csv("data/data_edited.csv")
data_source = data.copy(True).drop('Winner',axis=1)


object_columns_to_exclude = ['B__Name', 'R__Name', 'R__Streak', 'B__Streak']
objecttypes = list(data_source.select_dtypes(include=['O']).columns)
for col in objecttypes:
    if col not in object_columns_to_exclude:
        data_source[col] = data_source[col].astype('category')

cat_columns = data_source.select_dtypes(['category']).columns
data_source[cat_columns] = data_source[cat_columns].apply(lambda x: x.cat.codes)


data_mean_all = pd.DataFrame()
fighters = pd.read_csv("data/fighters.csv")
#name = 'jose aldo'

for i, name in enumerate(fighters['Fighters']):
    #if winner, 0 == blue
    #fighterRows_win = data_source.loc[(data_source['B__Name'] == name) & (data_source['Winner'] == 0), data_source.columns.difference(['Winner'])]
    #if not winner
    #fighterRows_def = data_source.loc[(data_source['B__Name'] == name) & (data_source['Winner'] == 3),  data_source.columns.difference(['Winner', 'Winby'])]
    #data_mean = pd.concat([fighterRows_win, fighterRows_def], ignore_index=True)          

    data_mean = data_source.loc[(data_source['B__Name'] == name)]
    filtered_columns = data_mean.filter(regex='^R', axis=1)
    data_mean = data_mean.drop(filtered_columns.columns, axis=1)

    new_columns = {
        col: col.replace("B__", "").replace("B_", "")
        if col.startswith("B__") or col.startswith("B_")
        else col
        for col in data_source.columns
    }
    data_mean = data_mean.rename(columns=new_columns)
    data_mean_B = data_mean.drop("Name", axis=1)



    #fighterRows_win = data_source.loc[(data_source['R__Name'] == name) & (data_source['Winner'] == 3), data_source.columns.difference(['Winner'])]
    #fighterRows_def = data_source.loc[(data_source['R__Name'] == name) & (data_source['Winner'] == 0),  data_source.columns.difference(['Winner', 'Winby'])]
    #data_mean = pd.concat([fighterRows_win, fighterRows_def], ignore_index=True)
    
    data_mean = data_source.loc[(data_source['R__Name'] == name)]
    filtered_columns = data_mean.filter(regex='^B', axis=1)
    data_mean = data_mean.drop(filtered_columns.columns, axis=1)

    new_columns = {
        col: col.replace("R__", "").replace("R_", "")
        if col.startswith("R__") or col.startswith("R_")
        else col
        for col in data_source.columns
    }

    data_mean = data_mean.rename(columns=new_columns)
    data_mean_R = data_mean.drop("Name", axis=1)


    combined_df = pd.concat([data_mean_R, data_mean_B], ignore_index=True)
    combined_df = combined_df.mean().to_frame().transpose()

    combined_df['Name'] = name
    data_mean_all = pd.concat([data_mean_all, combined_df], ignore_index=True)

objecttypes = list(data_mean_all.select_dtypes(include=['float64']).columns)
for col in objecttypes:
    data_mean_all[col] = data_mean_all[col].astype('float32')
    
desired_column_order = ['Prev', 'Streak', 'Age','Height','Weight','Name']
new_df1 = data_mean_all[desired_column_order + [col for col in data_mean_all.columns if col not in desired_column_order]]
new_df1.to_csv('data/fighter_means.csv', index=False, header=True)

In [27]:
#-----------------------------Parameters for neural network----------------------------
# Read the CSV file and drop the specified columns
data = pd.read_csv("data/data_edited.csv").drop(columns=["B__Name", "R__Name"])

objecttypes = list(data.select_dtypes(include=['O']).columns)
for col in objecttypes:
    data[col] = data[col].astype('category')

cat_columns = data.select_dtypes(['category']).columns
data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

desired_column_orderB = ['B__Prev', 'B__Streak', 'B__Age', 'B__Height', 'B__Weight']
desired_column_orderR = ['R__Prev', 'R__Streak', 'R__Age', 'R__Height', 'R__Weight'] 
desired_column_order = desired_column_orderB + [col for col in data.columns if col not in desired_column_orderB and col.startswith('B_')] \
                      + desired_column_orderR + [col for col in data.columns if col not in desired_column_orderR and col.startswith('R_')]

# Create a new DataFrame with the desired column order
new_df1 = data[desired_column_order + [col for col in data.columns if col not in desired_column_order]]
new_df1.to_csv('data/data_edited.csv', index=False)

In [None]:
Influence of age on the outcome of fight¶


In [None]:
Influence of Height on the outcome of fight¶


In [None]:
Most common way of winning a fight

In [None]:
Wine streak and winning correlation

In [None]:
#data_hist = data.iloc[:, :5].hist(figsize=(10,10), xrot=-45)
data_hist = data.iloc[:, :10]
#sns.histplot(data=data_hist, element='step')

# # Customize the legend
# legend_labels = ['Group A', 'Group B with a long name']


# plt.legend(legend_labels, title='Groups')
# # Adjust the aesthetics of the legend
# legend = plt.gca().get_legend()
# plt.setp(legend.get_title(), fontsize='14', fontweight='bold')
# plt.setp(legend.get_texts(), fontsize='12')

for column in data_hist.columns:
    column_values = data_hist[column].astype(str)
    # Create a new plot for each variable
    sns.histplot(data=column_values)

    plt.title(column)  # Set the title of the plot as the variable name
    plt.xlabel('Value')  # Set the x-axis label
    plt.xticks(rotation='vertical')  # Adjust the rotation angle as needed
    plt.ylabel('Frequency')  # Set the y-axis label
    plt.show();  # Display the plot

#plt.show()


In [None]:
df = {  'Name': data['B_Location'].value_counts().head(15).index, 
        'Count': data['B_Location'].value_counts().head(15).values}
df = pd.DataFrame(df)
sns.barplot(data=df, y='Name', x='Count')
plt.show() 

In [None]:
sns.countplot(y='winner', data=data_cleaned) 
plt.show() 


In [None]:
sns.countplot(y='Winner', data=data_source) 
plt.show() 

In [None]:
sns.countplot(y='winner_code', data=dropdata) 
plt.show() 

In [None]:
data = dropdata[(dropdata["winner"] == "red") | (dropdata["winner"] == "blue")]

In [None]:
#for feature in data.dtypes[data.dtypes == 'object'].index: 
sns.countplot(y='R__Weight', data=dropdata) 
plt.show() 
sns.countplot(y='B__Weight', data=dropdata) 
plt.show() 

In [None]:
#Basic Correlation Matrix
corrmat = dropdata.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

In [None]:
# Subset Correlation Matrix
k = 10 #number of variables for heatmap
corrmat = dropdata.corr()
cols = corrmat.nlargest(k, 'winner')['winner'].index
cm = np.corrcoef(dropdata[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
sns.lmplot(x="B__Round3_Strikes_Body Significant Strikes_Attempts", 
            y="B__Round3_Strikes_Body Significant Strikes_Landed", 
            col="winner", hue="winner", data=data, col_wrap=2)