In [1]:
# Imports and Helper Functions
# data Analysis
import pandas as pd
import numpy as np
import random as rng
from datetime import datetime

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output and use seaborn's dark grid theme.
%matplotlib inline
sns.set_style('darkgrid') 

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Alternative setting (only the last expression of a cell is displayed):
#InteractiveShell.ast_node_interactivity = "last_expr"

In [20]:
# Show floats with thousands separators and two decimals in DataFrame output.
pd.set_option('display.float_format', '{:,.2f}'.format)

# `df` holds the frame as loaded from disk; `data` is an independent working copy.
df = pd.read_csv("data.csv")
data = df.copy(deep=True)

In [None]:
# Load the auxiliary CSV files in one pass.
# NOTE(review): fighters.csv, fighter_means.csv and data_edited.csv are written
# by cells further down in this notebook, so on a fresh checkout those cells
# have to be run before this one.
fighters, categories_data, fighter_mean, data_edited = (
    pd.read_csv(csv_path)
    for csv_path in ("fighters.csv", "categories.csv", "fighter_means.csv", "data_edited.csv")
)

In [None]:
# Preview the first rows of the working copy.
data.head()

In [None]:
# Column names, dtypes and non-null counts of the working copy.
data.info()

In [None]:
# Summary statistics for the numeric columns.
data.describe()

In [None]:
# Summary statistics restricted to the object (string) columns.
data.describe(include=['object']) 

In [17]:
# Lower-case the fighter names in both corners
for name_column in ('R_Name', 'B_Name'):
    data[name_column] = data[name_column].str.lower()

In [None]:

# NaN values in the 'winby' column
# NOTE(review): this cell works on `df` (the frame as loaded), not on the
# working copy `data`; a later cell rebinds `df = data`, so the 'DRAW' values
# written here do not reach `data`.
winby_missing = df['winby'].isna()
no_winby = df[winby_missing]
print(no_winby.shape[0], "rows where 'winby' is not set.")

is_no_contest = df['winner'] == 'no contest'
is_draw = df['winner'] == 'draw'
print(int(is_no_contest.sum()), 'no contests')
print(int(is_draw.sum()), 'draws')

print(int((is_draw & winby_missing).sum()), "draws with an empty 'winby' value")
print(int((is_no_contest & winby_missing).sum()), "no contest with an empty 'winby' value")

# Label draws explicitly on `df`, then recompute `no_winby`
df.loc[is_draw, 'winby'] = 'DRAW'
no_winby = df[df['winby'].isna()]

# No-contests that do have a 'winby' value (those without one are excluded)
df.loc[is_no_contest & df['winby'].notna(), ['B_Name', 'R_Name', 'Date', 'Last_round', 'winner', 'winby']]

In [None]:
# Missing values (NaN)
# Impute age and height with the column mean. The labels must be a list:
# indexing with a bare tuple (data['B_Age', 'B_Height', ...]) is looked up as
# one single column key and raises KeyError.
columns_to_fill = ['B_Age', 'B_Height', 'R_Age', 'R_Height']
data[columns_to_fill] = data[columns_to_fill].fillna(data[columns_to_fill].mean())

# Every remaining NaN, in any column, becomes 0.
# NOTE(review): this also zero-fills missing 'B_Weight' / 'R_Weight' values, so
# the later cell that looks for null weights will find none - confirm that the
# weight fix-up is not meant to run before this line.
data.loc[:, :] = data.fillna(value=0)

In [None]:
# Get the names of fighters with missing weights
df = data
missing_weight_names = pd.concat([df['B_Name'][df['B_Weight'].isnull()], df['R_Name'][df['R_Weight'].isnull()]]).unique()
missing_weight_names

# Fill in with Googled values
weights = {
    'Lipeng Zhang': 70,
    'Antonio Carlos Junior': 84,
    'Aleksei Oleinik': 65,
    'Cat Zingano': 65,
    'Yao Zhikui': 56,
    'Jack Marshman': 84,
    'Allan Zuniga': 70 
}
# The names in `data` are lower-cased by an earlier cell while the keys above
# are title-case, so the lookup is done case-insensitively.
weights_by_lower_name = {fighter.lower(): weight for fighter, weight in weights.items()}
for name in missing_weight_names:
    weight = weights_by_lower_name.get(str(name).lower())
    if weight is None:
        # Report instead of raising KeyError for a fighter that is not listed above.
        print('No weight on record for', name, '- left unchanged')
        continue
    # Single .loc assignment: chained indexing (df[col][mask] = value) may write
    # to a temporary copy and leave `df` unchanged.
    df.loc[df['B_Name'] == name, 'B_Weight'] = weight
    df.loc[df['R_Name'] == name, 'R_Weight'] = weight

In [None]:
# Print the weight categories: distinct weights from both corners, up to 93
all_weights = pd.concat([data['B_Weight'], data['R_Weight']]).unique()
sorted(weight for weight in all_weights if weight <= 93)

In [None]:
# Fold the weight value 76 into 77 for both corners.
# A single .loc assignment is used because chained indexing
# (df[col][mask] = value) may modify a temporary copy instead of `df`.
df.loc[df['B_Weight'] == 76, 'B_Weight'] = 77
df.loc[df['R_Weight'] == 76, 'R_Weight'] = 77

In [None]:
# Show the fights whose maximum number of rounds is 4
four_round_fights = data['Max_round'] == 4
data.loc[four_round_fights, ['B_Name', 'R_Name', 'Last_round']]

In [None]:

# Googling some of these fights showed that most of the 3-round ones ended by
# decision (as opposed to KO/submission), i.e. there was no opportunity for a
# 4th round. Any Max_round of 4 is therefore replaced with 3.
scheduled_for_four = data['Max_round'] == 4
data.loc[scheduled_for_four, 'Max_round'] = 3

# Print the counts to double check
for max_rounds in (3, 4, 5):
    fight_count = data['Max_round'][data['Max_round'] == max_rounds].size
    print('There are', fight_count, f'fights with max {max_rounds} rounds')

In [None]:
# There is a max-3-round fight that ended by decision... in fewer than 3 rounds?
# The fight in question:
df[(df['Max_round'] == 3) & (df['winby'] == 'DEC') & (df['Last_round'] < 3 )]
# Correct it to 3 (verified using Google). The write goes through one .iloc
# call on the frame: `df['Last_round'].iloc[334] = 3` is chained assignment and
# may update a temporary Series instead of `df`.
last_round_position = df.columns.get_loc('Last_round')
df.iloc[334, last_round_position] = 3
print('"Last_round" set to', df['Last_round'].iloc[334], 'for the fight:', df['B_Name'].iloc[334], 'vs', df['R_Name'].iloc[334])

In [None]:
# Red-corner fights of one fighter, to eyeball age against fight date
data.loc[data['R_Name'] == 'jose aldo', ['R_Name', 'R_Age', 'Date']]

# Find the date of the last fight.
# The "max" date is in European format, while the majority of the dates are in American format
data['Date'].max()

In [None]:
# Date-format fix: bring every date into the American format.
# Sorting lets the outliers bubble to the top or bottom.
data['Date'].sort_values()

In [None]:
# Date-format fix: bring every date into the American format.
# Sorting lets the outliers bubble to the top or bottom.
data['Date'].sort_values()

# Rewrite the outliers in the standard slash format (row label -> corrected date)
date_corrections = {
    12: '02/16/2014',
    197: '06/08/2014',
    78: '06/29/2014',
    384: '10/04/2014',
    449: '11/17/2014',
    334: '12/20/2014',
    686: '05/23/2015',
}
for row_label, corrected_date in date_corrections.items():
    df.at[row_label, 'Date'] = corrected_date

# Added columns are marked with a "_" prefix; the year is the last four characters of the date
df['_Date_year'] = df['Date'].apply(lambda date: int(date[-4:]))

In [None]:
# Recalculate each fighter's age at the time of the fight
# formula: Age - (last_year - fight_year)
# NOTE: the age columns are overwritten from themselves, so running this cell
# a second time subtracts the offset again.
years_before_last_fight = df['_Date_year'].max() - df['_Date_year']
df['R_Age'] = df['R_Age'] - years_before_last_fight
df['B_Age'] = df['B_Age'] - years_before_last_fight

# Checking our work. The name is matched case-insensitively: the names were
# lower-cased by an earlier cell, so comparing against 'Jose Aldo' matched nothing.
df.loc[df['R_Name'].str.lower() == 'jose aldo', ['R_Name', 'R_Age', 'Date']]

In [None]:
# Write `df` to cleaned_data.csv without the index column.
df.to_csv('cleaned_data.csv', index = False)

In [48]:
# Columns removed from the working frame
dropped_columns = [
    'Event_ID', 'Fight_ID',
    'B_Location', 'B_HomeTown', 'B_ID', 'B_Name',
    'R_Location', 'R_HomeTown', 'R_ID', 'R_Name',
    'winby', 'Date', 'BStreak', 'Last_round', 'Max_round',
]

# Give the remaining per-corner attributes a double-underscore prefix
double_underscore_names = {
    'BPrev': 'B__Prev',
    'RPrev': 'R__Prev',
    'B_Age': 'B__Age',
    'B_Height': 'B__Height',
    'B_Weight': 'B__Weight',
    'R_Age': 'R__Age',
    'R_Height': 'R__Height',
    'R_Weight': 'R__Weight',
}

data = data.drop(columns=dropped_columns).rename(columns=double_underscore_names)

In [61]:
# Keep only fights won by the red or the blue corner. The explicit .copy()
# makes `data` an independent frame, so adding a column below does not trigger
# SettingWithCopyWarning.
data = data[data["winner"].isin(["red", "blue"])].copy()

# Encode the target: red -> 0, blue -> 1. `map` returns an integer column
# directly; `replace` on a string column depends on an implicit downcast that
# newer pandas versions deprecate, which would leave an object-dtype column.
data['winner_code'] = data['winner'].map({'red': 0, 'blue': 1})

# Same frame without the textual outcome column
dropdata = data.drop(columns='winner')


In [10]:
# Down-cast every float64 column to float32, then export
objecttypes = dropdata.select_dtypes(include=['float64']).columns.tolist()
dropdata = dropdata.astype({float_column: 'float32' for float_column in objecttypes})

dropdata.to_csv('data_edited.csv', index=False)

In [None]:
# Export the distinct blue-corner fighter names, lower-cased, as a one-column CSV.
# NOTE(review): an earlier cell drops 'B_Name' from `data`; when the notebook is
# run top to bottom this lookup raises KeyError - confirm the intended order.
names = data["B_Name"].unique()
lower_names = list(map(str.lower, names))
np.savetxt('fighters.csv', lower_names, fmt='%s', delimiter=',', header='Fighters', comments='')

In [None]:
# Down-cast every float64 column of the per-fighter means to float32, then export.
# NOTE(review): `combined_all` is assembled by the cell that follows this one,
# so that cell has to be executed first.
objecttypes = combined_all.select_dtypes(include=['float64']).columns.tolist()
combined_all = combined_all.astype({float_column: 'float32' for float_column in objecttypes})

combined_all.to_csv('fighter_means.csv', index=False, header=True)

In [None]:
# Per-fighter mean of every statistic, over all fights in either corner.
# NOTE(review): the drop below needs every listed column to be present in
# `data`; an earlier cell removes several of them from `data`, so confirm the
# intended execution order.
data_mean = data.drop(
    columns=['Event_ID', 'Fight_ID', 'B_Location', 'B_HomeTown', 'B_ID',
             'R_Location', 'R_HomeTown', 'R_ID', 'winby', 'Date', 'BStreak',
             'winner', 'Last_round', 'Max_round']
).rename(columns={'BPrev': 'B__Prev',
                  'RPrev': 'R__Prev',
                  'B_Age': 'B__Age',
                  'B_Height': 'B__Height',
                  'B_Weight': 'B__Weight',
                  'R_Age': 'R__Age',
                  'R_Height': 'R__Height',
                  'R_Weight': 'R__Weight',
                  'B_Name': 'B__Name',
                  'R_Name': 'R__Name'})


def _corner_stats(fights, fighter_name, corner):
    """Return one fighter's rows for one corner with corner-neutral column names.

    Rows are selected where ``<corner>__Name`` equals ``fighter_name``. Columns
    whose name starts with the opposite corner's letter are dropped, the
    ``<corner>__`` prefix is stripped from the remaining column names, and the
    resulting ``Name`` column is removed.

    Args:
        fights: DataFrame with ``B__`` / ``R__`` prefixed columns.
        fighter_name: value to look up in the ``<corner>__Name`` column.
        corner: ``'B'`` or ``'R'``.

    Returns:
        DataFrame with one row per matching fight.
    """
    prefix = corner + '__'
    opponent_letter = 'R' if corner == 'B' else 'B'
    own_rows = fights.loc[fights[prefix + 'Name'] == fighter_name]
    opponent_columns = own_rows.filter(regex='^' + opponent_letter, axis=1).columns
    own_rows = own_rows.drop(columns=opponent_columns)
    neutral_names = {col: col.replace(prefix, '') for col in own_rows.columns if col.startswith(prefix)}
    return own_rows.rename(columns=neutral_names).drop(columns='Name')


# Collect one single-row frame per fighter and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies all previous
# rows on every iteration.
fighter_rows = []
for name in fighters['Fighters']:
    all_fights = pd.concat(
        [_corner_stats(data_mean, name, 'R'), _corner_stats(data_mean, name, 'B')],
        ignore_index=True,
    )
    fighter_row = all_fights.mean().to_frame().transpose()
    fighter_row['Name'] = name
    fighter_rows.append(fighter_row)

# pd.concat raises on an empty list, so fall back to an empty frame.
combined_all = pd.concat(fighter_rows, ignore_index=True) if fighter_rows else pd.DataFrame()

In [None]:
# Write `df` to cleaned_data.csv without the index column (same call as the earlier export cell).
df.to_csv('cleaned_data.csv', index = False)

In [None]:
# One histogram per column, for the first ten columns of `data`
data_hist = data.iloc[:, :10]

for column in data_hist:
    # Values are cast to strings before plotting
    column_values = data_hist[column].astype(str)
    ax = sns.histplot(data=column_values)

    # Title is the column name; axis labels are fixed
    ax.set(title=column, xlabel='Value', ylabel='Frequency')
    plt.xticks(rotation='vertical')  # vertical tick labels
    plt.show()


In [None]:
# Bar chart of the 15 most frequent 'B_Location' values.
# The counts are computed once and kept under their own name: rebinding `df`
# here would replace the fight frame that other cells read and export.
top_locations = data['B_Location'].value_counts().head(15)
location_counts = pd.DataFrame({'Name': top_locations.index,
                                'Count': top_locations.values})
sns.barplot(data=location_counts, y='Name', x='Count')
plt.show()

In [None]:
# Number of fights per value of the 'winner' column.
sns.countplot(y='winner', data=data) 
plt.show() 


In [None]:
# Number of fights per value of the encoded 'winner_code' column.
sns.countplot(y='winner_code', data=data) 
plt.show() 

In [None]:
# Keep only the fights won by the red or the blue corner
data = data[data["winner"].isin(["red", "blue"])]

In [None]:
# Number of fights per weight value: red corner first, then blue corner.
sns.countplot(y='R__Weight', data=dropdata) 
plt.show() 
sns.countplot(y='B__Weight', data=dropdata) 
plt.show() 

In [None]:
# Basic correlation matrix of the columns in `dropdata`, drawn as a heatmap
corrmat = dropdata.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.8, square=True, ax=ax)

In [None]:
# Subset correlation matrix: the k columns with the largest correlation to the target
k = 10 # number of variables for the heatmap
corrmat = dropdata.corr()
# The target in `dropdata` is the encoded 'winner_code'; the textual 'winner'
# column was dropped when `dropdata` was built, so ranking by 'winner' raised KeyError.
cols = corrmat.nlargest(k, 'winner_code')['winner_code'].index
cm = np.corrcoef(dropdata[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

In [None]:
# Regression plot of landed vs attempted round-3 body significant strikes for
# the blue corner, one panel and colour per winner
attempts_column = "B__Round3_Strikes_Body Significant Strikes_Attempts"
landed_column = "B__Round3_Strikes_Body Significant Strikes_Landed"
sns.lmplot(data=data, x=attempts_column, y=landed_column,
           col="winner", hue="winner", col_wrap=2)