In [None]:
import pandas as pd
from IPython.display import display, HTML

def install_module(module):
    ! conda install "$module" -y
    js_cmd = ['IPython.notebook.kernel.restart();',
              'IPython.notebook.select(1);',
              'IPython.notebook.execute_cell();'
              ]
    js = "<script>{0}</script>".format(' '.join(js_cmd))
    display(HTML(js))

url = 'https://simple.wikipedia.org/wiki/List_of_U.S._states'
try:
    df_list = pd.read_html(url)
except ImportError as e:
    # Only a missing parser dependency is handled here. Any other failure
    # (network error, no tables found, ...) propagates unchanged instead of
    # being mistaken for a package name to install.
    print(e)
    # install necessary modules for read_html: the first word of the error
    # message is used as the package name
    module = str(e).split()[0]
    install_module(module)
    # df_list was never assigned, so stop this run of the cell here rather
    # than falling through to a NameError on the lines below.
    raise
print('Number of Data Frames {}'.format(len(df_list)))
# Promote the first row of the first table to column headers, then drop it.
df_list[0].columns = df_list[0].iloc[0]
df = df_list[0].iloc[1:]
df.head()


In [None]:
# Wikipedia "List of ... seasons" pages scraped for each club.
# London clubs
# (the identifier below is spelled "aresenal"; kept as-is because a later cell refers to it)
link_aresenal = 'https://en.wikipedia.org/wiki/List_of_Arsenal_F.C._seasons'
link_chelsea = 'https://en.wikipedia.org/wiki/List_of_Chelsea_F.C._seasons'
# Manchester clubs
link_man_city = 'https://en.wikipedia.org/wiki/List_of_Manchester_City_F.C._seasons'
link_man_u = 'https://en.wikipedia.org/wiki/List_of_Manchester_United_F.C._seasons'

In [None]:
import re
    
def get_num(text):
    """Return the first run of decimal digits found in ``text``, as a string.

    If ``text`` contains no digits it is returned unchanged (same object).
    Non-string values (e.g. an int or NaN coming out of a DataFrame cell) are
    converted with ``str()`` for the search only, so they no longer raise
    ``TypeError``: ``get_num(38)`` gives ``'38'``, while a value whose string
    form has no digits is handed back as-is.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    found = re.search(r'\d+', str(text))
    return found.group(0) if found else text

def extract_data_from_html(link, index, match):
    """Scrape a club's seasons table and return its league rows from 2002 onward.

    Reads all tables at ``link`` with ``pd.read_html`` and works on table
    number ``index``. The table's first row is promoted to the header; after
    'Season' becomes the index, the next row supplies the real column names
    (its last value is dropped and 'Division' is put in front).
    NOTE(review): this assumes the page renders a two-row header and that
    read_html returns both rows as data -- confirm against the live page and
    the installed pandas version.

    Parameters
    ----------
    link : str
        Address of the page, passed straight to ``pd.read_html``.
    index : int
        Position of the wanted table in the list ``pd.read_html`` returns.
    match : str
        Exact 'Division' value identifying the rows to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first number of each 'Season' label parsed with
        ``pd.to_datetime``; columns 'Division' (overwritten with 'Prem'),
        integer 'P', 'W', 'D', 'L', 'F', 'A', 'Pts', 'Pos', plus 'Win%'
        (``W / P * 100``) and 'MA5 Win%' (centred 5-row rolling mean of
        'Win%').
    """
    stat_cols = ['P', 'W', 'D', 'L', 'F', 'A', 'Pts', 'Pos']

    table = pd.read_html(link)[index]
    table.columns = table.iloc[0]
    # .copy(): the column assignments below must write to an independent
    # frame, not to a slice of ``table`` (avoids SettingWithCopyWarning).
    df = table.iloc[1:].copy()
    df['Season'] = df['Season'].apply(get_num)
    df = df.set_index('Season')

    # The first remaining row holds the second header line; use it as the
    # column names, shifted right by one to make room for 'Division'.
    columns = list(df.iloc[0])[:-1]
    columns.insert(0, 'Division')
    df.columns = columns

    df = df[df['Division'] == match][['Division'] + stat_cols].copy()
    df['Division'] = 'Prem'

    # Removing all data before 2002 as all teams were in Prem by then
    df.index = pd.to_datetime(df.index)
    df = df[df.index >= pd.to_datetime('2002')].copy()

    # get_num strips any non-digit decoration from a cell before the int cast.
    for col in stat_cols:
        df[col] = df[col].apply(lambda x: int(get_num(x)))

    df['Win%'] = df['W'].div(df['P']) * 100
    # pd.rolling_mean() no longer exists in pandas; Series.rolling is the
    # equivalent spelling.
    df['MA5 Win%'] = df['Win%'].rolling(window=5, center=True).mean()
    return df

def extract_data_from_html_2(link, index, match):
    """Scrape a club's seasons table (single header row) from 2002 onward.

    Reads all tables at ``link`` with ``pd.read_html`` and works on table
    number ``index``. The table's first row is promoted to the header and is
    expected to already contain 'Season', 'Division' and the statistic
    columns selected below.
    NOTE(review): assumes read_html returns the header row as data -- confirm
    against the live page and the installed pandas version.

    Parameters
    ----------
    link : str
        Address of the page, passed straight to ``pd.read_html``.
    index : int
        Position of the wanted table in the list ``pd.read_html`` returns.
    match : str
        Exact 'Division' value identifying the rows to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first number of each 'Season' label parsed with
        ``pd.to_datetime``; columns 'Division' (overwritten with 'Prem'),
        integer 'P', 'W', 'D', 'L', 'F', 'A', 'Pts', 'Pos', plus 'Win%'
        (``W / P * 100``) and 'MA5 Win%' (centred 5-row rolling mean of
        'Win%').
    """
    stat_cols = ['P', 'W', 'D', 'L', 'F', 'A', 'Pts', 'Pos']

    table = pd.read_html(link)[index]
    table.columns = table.iloc[0]
    # .copy(): the column assignments below must write to an independent
    # frame, not to a slice of ``table`` (avoids SettingWithCopyWarning).
    df = table.iloc[1:].copy()
    df['Season'] = df['Season'].apply(get_num)
    df = df.set_index('Season')

    df = df[df['Division'] == match]
    df = df[['Division'] + stat_cols].copy()
    df['Division'] = 'Prem'

    # Removing all data before 2002 as all teams were in Prem by then
    df.index = pd.to_datetime(df.index)
    df = df[df.index >= pd.to_datetime('2002')].copy()

    # Cast after the date filter so only rows that are kept need to hold
    # parseable numbers; get_num strips non-digit decoration first.
    for col in stat_cols:
        df[col] = df[col].apply(lambda x: int(get_num(x)))

    df['Win%'] = df['W'].div(df['P']) * 100
    # pd.rolling_mean() no longer exists in pandas; Series.rolling is the
    # equivalent spelling.
    df['MA5 Win%'] = df['Win%'].rolling(window=5, center=True).mean()
    return df

In [None]:
# Chelsea: the table position (2) and the 'Prem' division label are hard-coded
# for the page as scraped -- TODO confirm they still match if the page changes.
chelsea_df = extract_data_from_html(link=link_chelsea, index=2, match='Prem')
chelsea_df.head()

In [None]:
# Manchester City: rows are selected by the exact division label 'Prem (1)'
# from the first table on the page (hard-coded -- TODO confirm on page changes).
mancity_df = extract_data_from_html(link=link_man_city, index=0, match='Prem (1)')
# Label the division 'Prem' so it reads the same as the other teams' frames.
mancity_df['Division'] = 'Prem'
mancity_df.head()

In [None]:
# Arsenal: single-header layout, so the _2 extractor is used. '1 !Prem' is the
# raw division text matched exactly (presumably a sort-key artefact of the
# page markup -- TODO confirm); table position 3 is hard-coded.
arsenal_df = extract_data_from_html_2(link=link_aresenal, index=3, match='1 !Prem')
arsenal_df.head()

In [None]:
# Manchester United: same extractor and division text as Arsenal; the table
# position (2) is hard-coded -- TODO confirm it still matches the page.
manu_df = extract_data_from_html_2(link=link_man_u, index=2, match='1 !Prem')
manu_df.head()

In [None]:
% matplotlib notebook
import matplotlib.pyplot as plt

plt.figure()
dfs = [chelsea_df, mancity_df, arsenal_df, manu_df]
colors = ['blue', 'purple', 'red', 'orange' ]
names = ['Chelsea', 'Manchester City', 'Arsenal', 'Manchester United']

for i, df in enumerate(dfs):
    if i==2:
        linewidth = 3
    else:
        linewidth = 1
    line = plt.plot(df.index, df['Pos'], color=colors[i], label=names[i], linewidth=linewidth)

# remove all the ticks (both axes)
plt.tick_params(top='off', bottom='off', left='off', right='off', labelleft='on', labelbottom='on')
plt.yticks(range(2,21,2))
plt.gca().invert_yaxis()
# add a label to the x axis
plt.xlabel('Seasons')
# add a label to the y axis
plt.ylabel('Position in the Premier League')
# add a title
plt.title('Performance of top EPL teams')
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.tight_layout()
plt.legend(frameon=False)
plt.show();

plt.figure()
for i, df in enumerate(dfs):
    if i==2:
        linewidth = 3
    else:
        linewidth = 1
    plt.plot(df.index, df['MA5 Win%'], color=colors[i], label=names[i], linewidth=linewidth)
# remove all the ticks (both axes)
plt.tick_params(top='off', bottom='off', left='off', right='off', labelleft='on', labelbottom='on')
plt.yticks(range(0,101,20))
# add a label to the x axis
plt.xlabel('Seasons')
# add a label to the y axis
plt.ylabel('5 Year Moving Average Win %')
# add a title
plt.title('EPL teams Win%')
plt.text(0.1, 0.705, names[0], transform=plt.gca().transAxes, fontsize=8)
plt.text(0.05, 0.35, names[1], transform=plt.gca().transAxes, fontsize=8)
plt.text(0.1, 0.61, names[2], transform=plt.gca().transAxes, fontsize=8)
plt.text(0.4, 0.71, names[3], transform=plt.gca().transAxes, fontsize=8)
plt.text(0.6, 0.3, r'$ Win\% = \frac{Games Won}{Games Played}$', fontsize=10, 
                 bbox=dict(facecolor='lightgrey', alpha=0.3), transform=plt.gca().transAxes)
for spine in plt.gca().spines.values():
    spine.set_visible(False)
plt.tight_layout()
plt.show();
