In [141]:
%%capture
!pip install -U kaleido
!pip install plotly>=4.0.0
!wget https://github.com/plotly/orca/releases/download/v1.2.1/orca-1.2.1-x86_64.AppImage -O /usr/local/bin/orca
!chmod +x /usr/local/bin/orca
!apt-get install xvfb libgtk2.0-0 libgconf-2-4

In [124]:
import os

# Create the folder that the exported figures are written to.
# exist_ok=True makes the cell safe to re-run, and unlike the exists()-then-mkdir
# pattern it raises immediately if "images" is present but is not a directory.
os.makedirs("images", exist_ok=True)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
# Load the goals dataset from the CSV file next to the notebook.
csv_path = "cr7.csv"
df = pd.read_csv(csv_path)

# Basic exploration

In [3]:
# Preview the first five rows (5 is also the default for head()).
df.head(n=5)

Unnamed: 0,Season,Competition,Matchday,Date,Venue,Club,Opponent,Result,Playing_Position,Minute,At_score,Type,Goal_assist
0,02/03,Liga Portugal,6,10-07-02,H,Sporting CP,Moreirense FC,3:00,LW,34,2:00,Solo run,
1,02/03,Liga Portugal,6,10-07-02,H,Sporting CP,Moreirense FC,3:00,LW,90+5,3:00,Header,Rui Jorge
2,02/03,Liga Portugal,8,10/26/02,A,Sporting CP,Boavista FC,1:02,,88,1:02,Right-footed shot,Carlos Martins
3,02/03,Taca de Portugal Placard,Fourth Round,11/24/02,H,Sporting CP,CD Estarreja,4:01,,67,3:00,Left-footed shot,Cesar Prates
4,02/03,Taca de Portugal Placard,Fifth Round,12/18/02,H,Sporting CP,FC Oliveira do Hospital,8:01,,13,3:00,,


In [4]:
# Summarise the frame: number of rows, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 701 entries, 0 to 700
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Season            701 non-null    object
 1   Competition       701 non-null    object
 2   Matchday          701 non-null    object
 3   Date              701 non-null    object
 4   Venue             701 non-null    object
 5   Club              701 non-null    object
 6   Opponent          701 non-null    object
 7   Result            701 non-null    object
 8   Playing_Position  643 non-null    object
 9   Minute            701 non-null    object
 10  At_score          701 non-null    object
 11  Type              686 non-null    object
 12  Goal_assist       459 non-null    object
dtypes: object(13)
memory usage: 71.3+ KB


In [37]:
# New column: calendar year in which each goal was scored.
# The rows shown in the preview use the same month/day/two-digit-year layout with
# two different separators ("10-07-02" and "10/26/02"). Normalise the separator and
# parse with an explicit format so the result does not depend on pandas' format
# inference, which differs between pandas versions when formats are mixed.
# NOTE(review): assumes every row follows this layout; a row that does not will
# raise here instead of being guessed at.
date_text = df['Date'].str.replace('-', '/', regex=False)
df['Date_year'] = pd.to_datetime(date_text, format='%m/%d/%y').dt.year

In [38]:
# New column: half of the match in which each goal was scored.
# 'Minute' is stored as text (e.g. '34', '90+5'), so comparing it against '45'/'46'
# is a lexicographic comparison: '5'..'9' sort after '45', '45+1' matches neither
# bound, and '100' sorts before '45'. Compare the numeric regulation minute
# (the part before any '+' stoppage-time suffix) instead.
base_minute = pd.to_numeric(
    df['Minute'].astype(str).str.split('+', n=1).str[0],
    errors='coerce',
)
# Minutes up to 45 (including first-half stoppage time such as '45+1') are
# 'First half'; every later minute is 'Second half', as the original `>= '46'`
# rule intended. Rows whose minute cannot be parsed stay missing.
df['Goal_time'] = (
    base_minute.le(45)
    .map({True: 'First half', False: 'Second half'})
    .where(base_minute.notna())
)

In [134]:
# Goals per year: one bar per calendar year, exported as images/fig1.png.
fig = px.histogram(df, x='Date_year', title='Goals per year')
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Year'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)

fig.show()
fig.write_image("images/fig1.png")

In [90]:
# Goals per played position - data preparation.
# Count the rows (goals) for each position, then keep positions with more than 10.
position_df = df.groupby('Playing_Position').size().reset_index(name='count')
has_enough_goals = position_df['count'].gt(10)
goal_position_df = position_df.loc[has_enough_goals]

In [135]:
# Goals per played position: bars of the pre-aggregated counts, tallest first.
fig = px.histogram(
    goal_position_df,
    x='Playing_Position',
    y='count',
    title='Goals per played position',
)
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Position'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)
fig.update_xaxes(categoryorder='total descending')

fig.show()
fig.write_image("images/fig2.png")

In [136]:
# Goals per type of finishing: one bar per 'Type' value, most frequent first.
fig = px.histogram(df, x='Type', title='Goals per type of finishing')
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Type of finishing'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)
fig.update_xaxes(categoryorder='total descending')

fig.show()
fig.write_image("images/fig3.png")

In [137]:
# Histogram of goals scored for each club, highest total first.
fig = px.histogram(df, x='Club', title='Goals per club')
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Club'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)
fig.update_xaxes(categoryorder='total descending')

fig.show()
fig.write_image("images/fig4.png")

In [138]:
# Goals per half: one bar per 'Goal_time' label, smaller total first.
fig = px.histogram(df, x='Goal_time', title='Goals per half')
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Half'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)
fig.update_xaxes(categoryorder='total ascending')

fig.show()
fig.write_image("images/fig5.png")

In [117]:
# Label goals with no recorded finishing type as 'Other'.
df.loc[df['Type'].isna(), 'Type'] = 'Other'

In [139]:
# Type of goals per season: stacked bars, one colour per finishing type.
# Plot the 'Season' column so the bars match the title and the x-axis label; the
# previous version binned by calendar year ('Date_year') while labelling it "Season".
# NOTE(review): sorting the labels as text is chronological only if every season
# follows the 'YY/YY' pattern seen in the preview (e.g. '02/03') within one
# century - confirm against the full file.
season_order = sorted(df['Season'].unique())
fig = px.histogram(
    df,
    x='Season',
    color='Type',
    text_auto=True,  # print each segment's count on the bar
    category_orders={'Season': season_order},  # keep seasons in chronological order
    title='Type of goals per season',
)
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1},  # one label for each bar
    xaxis_title_text='Season',
    yaxis_title_text='# Goals',
)
fig.update_traces(textposition='inside', textfont_size=8)

fig.show()
fig.write_image("images/fig6.png")

In [92]:
# Favorite opponents - data preparation.
# Count the rows (goals) per opponent and keep opponents with 15 or more. The chart
# built from this frame is titled "at least 15 goals", so the threshold is inclusive;
# the previous `> 15` filter dropped opponents with exactly 15 goals.
MIN_OPPONENT_GOALS = 15
opponents_df = df.groupby('Opponent').size().to_frame('count').reset_index()
fav_opponents_df = opponents_df[opponents_df["count"] >= MIN_OPPONENT_GOALS]

In [140]:
# Favorite opponents: bars of the pre-aggregated goal counts, highest first.
fig = px.histogram(
    fav_opponents_df,
    x='Opponent',
    y='count',
    title='Favorite opponents (at least 15 goals)',
)
fig.update_layout(
    bargap=0.2,  # spacing between bars
    xaxis={'dtick': 1, 'title': {'text': 'Opponent'}},  # one tick per bar + axis label
    yaxis={'title': {'text': '# Goals'}},
)
fig.update_xaxes(categoryorder='total descending')

fig.show()
fig.write_image("images/fig7.png")