# Pandas

[Tutorial Video](https://www.youtube.com/watch?v=vmEHCJofslg)

[Tutorial Repository](https://github.com/KeithGalli/pandas)

[Pandas](https://pandas.pydata.org/docs/reference/index.html#api)

## Load data from file

In [19]:
import pandas as pd
df = pd.read_csv('./pokemon_data.csv')
print(df.head(3))
print(df.tail(3))

   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  
0           1      False  
1           1      False  
2           1      False  
       #                 Name   Type 1 Type 2  HP  Attack  Defense  Sp. Atk  \
797  720  HoopaHoopa Confined  Psychic  Ghost  80     110       60      150   
798  720   HoopaHoopa Unbound  Psychic   Dark  80     160       60      170   
799  721            Volcanion     Fire  Water  80     110      120      130   

     Sp. Def  Speed  Generation  Legendary  
797      130     70           6       True  
798      130     80           6       True  
799       90     70           6       True  


In [20]:
df_xl = pd.read_excel('pokemon_data.xlsx')
print(df_xl.head(3))

   #       Name Type 1  Type 2  HP  Attack  Defense  Sp. Atk  Sp. Def  Speed  \
0  1  Bulbasaur  Grass  Poison  45      49       49       65       65     45   
1  2    Ivysaur  Grass  Poison  60      62       63       80       80     60   
2  3   Venusaur  Grass  Poison  80      82       83      100      100     80   

   Generation  Legendary  
0           1      False  
1           1      False  
2           1      False  


***For txt add eg., delmiter='\t'***

## Reading Data in Pandas

***Column Headers***

In [21]:
df.columns

Index(['#', 'Name', 'Type 1', 'Type 2', 'HP', 'Attack', 'Defense', 'Sp. Atk',
       'Sp. Def', 'Speed', 'Generation', 'Legendary'],
      dtype='object')

***Read a column***

In [22]:
df[['Name', 'Type 1']][0:5]

Unnamed: 0,Name,Type 1
0,Bulbasaur,Grass
1,Ivysaur,Grass
2,Venusaur,Grass
3,VenusaurMega Venusaur,Grass
4,Charmander,Fire


**Read a row**

***iloc***

In [23]:
df.iloc[1:4]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False


In [24]:
df.iloc[2,1]

'Venusaur'

***iterrows***

In [25]:
count = 0
for index, row in df.iterrows():
  if count < 5:
    print(index,row['Name'])
    count += 1

0 Bulbasaur
1 Ivysaur
2 Venusaur
3 VenusaurMega Venusaur
4 Charmander


***loc***

For non-integer information

In [26]:
df.loc[(df['Type 1'] == "Fire") & (df['Type 2'] == 'Flying')]

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
6,6,Charizard,Fire,Flying,78,84,78,109,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,78,104,78,159,115,100,1,False
158,146,Moltres,Fire,Flying,90,100,90,125,85,90,1,True
270,250,Ho-oh,Fire,Flying,106,130,90,110,154,90,2,True
730,662,Fletchinder,Fire,Flying,62,73,55,56,52,84,6,False
731,663,Talonflame,Fire,Flying,78,81,71,74,69,126,6,False


## Making changes to the data

In [27]:
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


**Add a column with totals for stats**

In [28]:
df['Total'] = df['HP'] + df['Attack'] + df['Defense'] + df['Sp. Atk'] + df['Sp. Def'] + df['Speed']
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,318
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,405
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,525
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False,309


In [29]:
df = df.drop(columns=['Total'])
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False


***With iloc***

While this is easer it could break if the column order has been changed

In [30]:
df['Total'] = df.iloc[:, 4:10].sum(axis=1)
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,Total
0,1,Bulbasaur,Grass,Poison,45,49,49,65,65,45,1,False,318
1,2,Ivysaur,Grass,Poison,60,62,63,80,80,60,1,False,405
2,3,Venusaur,Grass,Poison,80,82,83,100,100,80,1,False,525
3,3,VenusaurMega Venusaur,Grass,Poison,80,100,123,122,120,80,1,False,625
4,4,Charmander,Fire,,39,52,43,60,50,65,1,False,309


**Rearrange Columns**

Note that a single-column slice returns a string so it must be made an array

In [31]:
cols = list(df.columns.values)
df = df[cols[0:4] + [cols[-1]] + cols[4:-1]]
df.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


## Saving/Exporting Data

In [32]:
df.to_csv('modified.csv', index=False) # remove the index column
df.to_excel('modified.xlsx', index=False)
df.to_csv('modified.txt', index=False, sep="\t")

## Filtering Data

In [33]:
df.loc[(df['Type 1'] == 'Grass') & (df['Type 2'] == 'Poison')].head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
48,43,Oddish,Grass,Poison,320,45,50,55,75,65,30,1,False


In [34]:
df.loc[(df['Type 1'] == 'Grass') | (df['Type 2'] == 'Poison')].head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
16,13,Weedle,Bug,Poison,195,40,35,30,20,20,50,1,False


In [35]:
new_df = df.loc[(df['Type 1'] == 'Grass') | (df['Type 2'] == 'Poison')]
print(new_df.head(6))
new_df = new_df.reset_index(drop=True) # drop prevents creation of a new column with the old index number
print(new_df.head(6))

     #                   Name Type 1  Type 2  Total  HP  Attack  Defense  \
0    1              Bulbasaur  Grass  Poison    318  45      49       49   
1    2                Ivysaur  Grass  Poison    405  60      62       63   
2    3               Venusaur  Grass  Poison    525  80      82       83   
3    3  VenusaurMega Venusaur  Grass  Poison    625  80     100      123   
16  13                 Weedle    Bug  Poison    195  40      35       30   
17  14                 Kakuna    Bug  Poison    205  45      25       50   

    Sp. Atk  Sp. Def  Speed  Generation  Legendary  
0        65       65     45           1      False  
1        80       80     60           1      False  
2       100      100     80           1      False  
3       122      120     80           1      False  
16       20       20     50           1      False  
17       25       25     35           1      False  
    #                   Name Type 1  Type 2  Total  HP  Attack  Defense  \
0   1              Bu

In [36]:
df.loc[df['Name'].str.contains('Mega')].head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
7,6,CharizardMega Charizard X,Fire,Dragon,634,78,130,111,130,85,100,1,False
8,6,CharizardMega Charizard Y,Fire,Flying,634,78,104,78,159,115,100,1,False
12,9,BlastoiseMega Blastoise,Water,,630,79,103,120,135,115,78,1,False
19,15,BeedrillMega Beedrill,Bug,Poison,495,65,150,40,15,80,145,1,False


In [37]:
df.loc[~df['Name'].str.contains('Mega')].head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False


In [38]:
import re
df.loc[df['Type 1'].str.contains('Fire|Grass', regex=True)].head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False


***Case insentive***

In [39]:
df.loc[df['Type 1'].str.contains('fire|grass', flags=re.I, regex=True)].head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Fire,,405,58,64,58,80,65,80,1,False


In [40]:
df.loc[df['Name'].str.contains('^Pi', regex=True)].head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
20,16,Pidgey,Normal,Flying,251,40,45,40,35,35,56,1,False
21,17,Pidgeotto,Normal,Flying,349,63,60,55,50,50,71,1,False
22,18,Pidgeot,Normal,Flying,479,83,80,75,70,70,101,1,False
23,18,PidgeotMega Pidgeot,Normal,Flying,579,83,80,80,135,80,121,1,False
30,25,Pikachu,Electric,,320,35,55,40,50,50,90,1,False
136,127,Pinsir,Bug,,500,65,125,100,55,70,85,1,False


## Conditional Changes

In [41]:
df.loc[df['Type 1'] == 'Fire', 'Type 1'] = 'Flamer'
df.head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Flamer,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Flamer,,405,58,64,58,80,65,80,1,False


In [42]:
df.loc[df['Type 1'] == 'Grass', 'Legendary'] = True
df.head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,True
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,True
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,True
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,True
4,4,Charmander,Flamer,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Flamer,,405,58,64,58,80,65,80,1,False


In [43]:
df.loc[df['Total'] > 500, ['Generation', 'Legendary']] = [2, True]
df.head(6)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,True
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,True
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,2,True
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,2,True
4,4,Charmander,Flamer,,309,39,52,43,60,50,65,1,False
5,5,Charmeleon,Flamer,,405,58,64,58,80,65,80,1,False


## Aggregation

In [44]:
df = pd.read_csv('modified.csv')
df.groupby(['Type 1']).mean(numeric_only=True).sort_values('Defense', ascending=False).head(6)
# Default numeric_only = True is deprecated

Unnamed: 0_level_0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Steel,442.851852,487.703704,65.222222,92.703704,126.37037,67.518519,80.62963,55.259259,3.851852,0.148148
Rock,392.727273,453.75,65.363636,92.863636,100.795455,63.340909,75.477273,55.909091,3.454545,0.090909
Dragon,474.375,550.53125,83.3125,112.125,86.375,96.84375,88.84375,83.03125,3.875,0.375
Ground,356.28125,437.5,73.78125,95.75,84.84375,56.46875,62.75,63.90625,3.15625,0.125
Ghost,486.5,439.5625,64.4375,73.78125,81.1875,79.34375,76.46875,64.34375,4.1875,0.0625
Water,303.089286,430.455357,72.0625,74.151786,72.946429,74.8125,70.517857,65.964286,2.857143,0.035714


In [45]:
df.groupby(['Type 1']).sum(numeric_only=True).head(6) # Default True is deprecated

Unnamed: 0_level_0,#,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Bug,23080,26146,3925,4897,4880,3717,4471,4256,222,0
Dark,14302,13818,2071,2740,2177,2314,2155,2361,125,2
Dragon,15180,17617,2666,3588,2764,3099,2843,2657,124,12
Electric,15994,19510,2631,3040,2917,3961,3243,3718,144,4
Fairy,7642,7024,1260,1046,1117,1335,1440,826,70,1
Fighting,9824,11244,1886,2613,1780,1434,1747,1784,91,0


In [46]:
df.groupby(['Type 1']).count().head(6)

Unnamed: 0_level_0,#,Name,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
Type 1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Bug,69,69,52,69,69,69,69,69,69,69,69,69
Dark,31,31,21,31,31,31,31,31,31,31,31,31
Dragon,32,32,21,32,32,32,32,32,32,32,32,32
Electric,44,44,17,44,44,44,44,44,44,44,44,44
Fairy,17,17,2,17,17,17,17,17,17,17,17,17
Fighting,27,27,7,27,27,27,27,27,27,27,27,27


In [47]:
df['count'] = 1
df.groupby(['Type 1', 'Type 2']).count()['count'].head(16)

Type 1  Type 2  
Bug     Electric     2
        Fighting     2
        Fire         2
        Flying      14
        Ghost        1
        Grass        6
        Ground       2
        Poison      12
        Rock         3
        Steel        7
        Water        1
Dark    Dragon       3
        Fighting     2
        Fire         3
        Flying       5
        Ghost        2
Name: count, dtype: int64

## Working with large amounts of data

In [48]:
new_df = pd.DataFrame(columns=df.columns)

for df in pd.read_csv('modified.csv', chunksize=5):
  results = df.groupby(['Type 1']).count()
  new_df = pd.concat([new_df, results])
  
new_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,count
Fire,1,1,,0,1,1,1,1,1,1,1,1,1,
Grass,4,4,,4,4,4,4,4,4,4,4,4,4,
Fire,4,4,,3,4,4,4,4,4,4,4,4,4,
Water,1,1,,0,1,1,1,1,1,1,1,1,1,
Bug,2,2,,0,2,2,2,2,2,2,2,2,2,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Fairy,1,1,,0,1,1,1,1,1,1,1,1,1,
Flying,2,2,,2,2,2,2,2,2,2,2,2,2,
Fire,1,1,,1,1,1,1,1,1,1,1,1,1,
Psychic,2,2,,2,2,2,2,2,2,2,2,2,2,


## Extract tables from Websites

[Video Source](https://www.youtube.com/watch?v=PXMJ6FS7llk)

simpsons = pd.read_html('https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)')

In [50]:
len(simpsons)

23

In [51]:
simpsons[1]

Unnamed: 0,No.overall,No. inseason,Title,Directed by,Written by,Original air date,Prod.code,U.S. viewers(millions)
0,1,1,"""Simpsons Roasting on an Open Fire""",David Silverman,Mimi Pond,"December 17, 1989",7G08,26.7[46]
1,2,2,"""Bart the Genius""",David Silverman,Jon Vitti,"January 14, 1990",7G02,24.5[46]
2,3,3,"""Homer's Odyssey""",Wes Archer,Jay Kogen & Wallace Wolodarsky,"January 21, 1990",7G03,27.5[47]
3,4,4,"""There's No Disgrace Like Home""",Gregg Vanzo & Kent Butterworth,Al Jean & Mike Reiss,"January 28, 1990",7G04,20.2[48]
4,5,5,"""Bart the General""",David Silverman,John Swartzwelder,"February 4, 1990",7G05,27.1[49]
5,6,6,"""Moaning Lisa""",Wes Archer,Al Jean & Mike Reiss,"February 11, 1990",7G06,27.4[50]
6,7,7,"""The Call of the Simpsons""",Wes Archer,John Swartzwelder,"February 18, 1990",7G09,27.6[51]
7,8,8,"""The Telltale Head""",Rich Moore,"Al Jean, Mike Reiss, Sam Simon & Matt Groening","February 25, 1990",7G07,28[52]
8,9,9,"""Life on the Fast Lane""",David Silverman,John Swartzwelder,"March 18, 1990",7G11,33.5[53]
9,10,10,"""Homer's Night Out""",Rich Moore,Jon Vitti,"March 25, 1990",7G10,30.3[54]


## Scraping: Extract CSV from Weebsites

**reading 1 csv file from website**

In [52]:
df_premier21 = pd.read_csv('https://www.football-data.co.uk/mmz4281/2223/E0.csv')

**showing dataframe**

In [53]:
df_premier21

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,05/08/2022,20:00,Crystal Palace,Arsenal,0,2,A,0,1,...,1.76,0.50,2.09,1.84,2.04,1.88,2.09,1.88,2.03,1.85
1,E0,06/08/2022,12:30,Fulham,Liverpool,2,2,D,1,0,...,2.73,1.75,1.90,2.03,1.91,2.02,2.01,2.06,1.89,1.99
2,E0,06/08/2022,15:00,Bournemouth,Aston Villa,2,0,H,1,0,...,1.76,0.50,1.93,2.00,1.93,2.00,1.94,2.04,1.88,2.00
3,E0,06/08/2022,15:00,Leeds,Wolves,2,1,H,1,1,...,1.87,-0.25,2.08,1.85,2.10,1.84,2.14,1.87,2.08,1.81
4,E0,06/08/2022,15:00,Newcastle,Nott'm Forest,2,0,H,0,0,...,1.89,-1.00,1.97,1.96,1.99,1.93,2.19,1.97,2.03,1.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,E0,27/12/2022,17:30,Chelsea,Bournemouth,2,0,H,2,0,...,2.28,-1.50,2.02,1.88,2.04,1.88,2.05,2.01,2.00,1.88
154,E0,27/12/2022,20:00,Man United,Nott'm Forest,3,0,H,2,0,...,2.63,-1.50,1.91,1.99,1.93,2.00,1.97,2.10,1.91,1.96
155,E0,28/12/2022,20:00,Leeds,Man City,1,3,A,0,1,...,3.06,1.75,1.98,1.95,1.97,1.93,2.10,1.97,1.96,1.90
156,E0,30/12/2022,19:45,West Ham,Brentford,0,2,A,0,2,...,1.97,-0.25,1.77,2.02,1.86,2.07,1.86,2.16,1.79,2.09


**rename columns**

In [54]:
df_premier21.rename(columns={'FTHG': 'home_goals',
                             'FTAG': 'away_goals'}, inplace=True)

In [55]:
df_premier21

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,home_goals,away_goals,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,05/08/2022,20:00,Crystal Palace,Arsenal,0,2,A,0,1,...,1.76,0.50,2.09,1.84,2.04,1.88,2.09,1.88,2.03,1.85
1,E0,06/08/2022,12:30,Fulham,Liverpool,2,2,D,1,0,...,2.73,1.75,1.90,2.03,1.91,2.02,2.01,2.06,1.89,1.99
2,E0,06/08/2022,15:00,Bournemouth,Aston Villa,2,0,H,1,0,...,1.76,0.50,1.93,2.00,1.93,2.00,1.94,2.04,1.88,2.00
3,E0,06/08/2022,15:00,Leeds,Wolves,2,1,H,1,1,...,1.87,-0.25,2.08,1.85,2.10,1.84,2.14,1.87,2.08,1.81
4,E0,06/08/2022,15:00,Newcastle,Nott'm Forest,2,0,H,0,0,...,1.89,-1.00,1.97,1.96,1.99,1.93,2.19,1.97,2.03,1.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153,E0,27/12/2022,17:30,Chelsea,Bournemouth,2,0,H,2,0,...,2.28,-1.50,2.02,1.88,2.04,1.88,2.05,2.01,2.00,1.88
154,E0,27/12/2022,20:00,Man United,Nott'm Forest,3,0,H,2,0,...,2.63,-1.50,1.91,1.99,1.93,2.00,1.97,2.10,1.91,1.96
155,E0,28/12/2022,20:00,Leeds,Man City,1,3,A,0,1,...,3.06,1.75,1.98,1.95,1.97,1.93,2.10,1.97,1.96,1.90
156,E0,30/12/2022,19:45,West Ham,Brentford,0,2,A,0,2,...,1.97,-0.25,1.77,2.02,1.86,2.07,1.86,2.16,1.79,2.09


## Extract Tables from PDFs

In [56]:
!pip install tk
!pip install ghostscript
!pip install 'PyPDF2<3.0'
!pip install opencv-python
!pip install "camelot-py[base]"



In [57]:
import camelot

In [59]:
tables = camelot.read_pdf('foo.pdf', pages='1')

In [62]:
print(tables)

<TableList n=1>


In [None]:
print('hello')

In [63]:
tables.export('foo.csv', f='csv', compress=True)
tables[0].to_csv('foo.csv')

## Web Automation and Scraping: HTML - Tags

[source](https://subslikescript.com/movie/Titanic-120338)

Elements, Attributes and Text are nodes

article element is root node
    
All nodes have one parent

**XPath**

Similar to css in syntax

//tagName[@AttributeName="Value"]

Functions:
- contains()     //tagName[contains(@AttributeName, "Value")]
- starts-with()
- and/or         //tagName[(expression 1) and (expression2)]



In [70]:
import html

In [77]:
input = html.unescape('<article class ="main-article"><h1> Titanic(1997) </h1><p class ="plot" > 84 years later... </p><p class ="plot2" > In the end ... </p><div class ="full-script">"13 meters. You should see it. ""Okay, take her up and over the bow rail. "</div></article>')

**XPath Special Characters**

- / : Select children on left side
- // : Specifies that matching node can be at any level
- . : Refers to current node
- .. : Refers to parent node
- \* : Selects all children
- @ : Select an attribute
- () : Grouping expressions
- [n] : Specifies a node by it's index

In [78]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.7.2-py3-none-any.whl (6.3 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0mm eta [36m0:00:01[0m
Collecting trio-websocket~=0.9
  Downloading trio_websocket-0.9.2-py3-none-any.whl (16 kB)
Collecting trio~=0.17
  Downloading trio-0.22.0-py3-none-any.whl (384 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m384.9/384.9 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting async-generator>=1.9
  Downloading async_generator-1.10-py3-none-any.whl (18 kB)
Collecting exceptiongroup>=1.0.0rc9
  Downloading exceptiongroup-1.1.0-py3-none-any.whl (14 kB)
Collecting outcome
  Downloading outcome-1.2.0-py2.py3-none-any.whl (9.7 kB)
Collecting wsproto>=0.14
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━

## Automate the News

In [90]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.binary_location = "/home/biscotty/.selenium/chrome"
service = Service(executable_path=r'/home/biscotty/.selenium/chromedriver')
driver = webdriver.Chrome(service=service)

website = "https://thesun.co.uk/sport/football/"

driver.get(website)

WebDriverException: Message: unknown error: DevToolsActivePort file doesn't exist
Stacktrace:
#0 0x564fa1a1b2a3 <unknown>
#1 0x564fa17d9f77 <unknown>
#2 0x564fa1805fc4 <unknown>
#3 0x564fa1801b0c <unknown>
#4 0x564fa17fe7d0 <unknown>
#5 0x564fa183f0b7 <unknown>
#6 0x564fa183ea5f <unknown>
#7 0x564fa1836903 <unknown>
#8 0x564fa1809ece <unknown>
#9 0x564fa180afde <unknown>
#10 0x564fa1a6b63e <unknown>
#11 0x564fa1a6eb79 <unknown>
#12 0x564fa1a5189e <unknown>
#13 0x564fa1a6fa83 <unknown>
#14 0x564fa1a44505 <unknown>
#15 0x564fa1a90ca8 <unknown>
#16 0x564fa1a90e36 <unknown>
#17 0x564fa1aac333 <unknown>
#18 0x7fb580a9eb43 <unknown>


In [None]:
driver.find_element(by="xpath", value="//div[@class='teaser__copy-container']")

#  //div[@class='teaser__copy-container']
