In [65]:
import pandas as pd
import numpy as np

In [66]:
df=pd.read_csv('soccer.csv')

In [67]:
df.head(2)

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2,Germany,0,4,1,1,0


### Identifying Duplicates

In [68]:
df[df.duplicated(subset=['club', 'age', 'position', 'market_value'], keep=False)].head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
10,Granit Xhaka,Arsenal,24,DM,2,35.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
11,Granit Xhaka,Arsenal,24,DM,2,35.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
12,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0
13,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0
23,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0


keep=False means that we want all repeated values to be treated as duplicate

Without specifying the subset (default) will include all the columns, 

In [69]:
df[df.duplicated(keep=False)]

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
10,Granit Xhaka,Arsenal,24,DM,2,35.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
11,Granit Xhaka,Arsenal,24,DM,2,35.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
12,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0
13,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0
23,Alex Oxlade-Chamberlain,Arsenal,23,RM,2,22.0,1519,6.0,1.80%,83,1,England,0,2,1,1,0


### Removing Duplicates

In [70]:
df = df.drop_duplicates(keep='first')

### Sophisticated Alternatives - Reindexing to Remove Rows and Columns

- first we set the unwanted rows (or the duplicated one)
- change the 'unwanted' list to set
- use the set().difference() method to delet the unwanted rows

In [71]:
unwanted = [0,1,2,3]

In [72]:
df.reindex(labels=set(df.index).difference(unwanted)).head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2,France,0,4,1,1,0
5,Hector Bellerin,Arsenal,22,RB,3,30.0,1675,6.0,13.70%,119,2,Spain,0,2,1,1,0
6,Olivier Giroud,Arsenal,30,CF,1,22.0,2230,8.5,2.50%,116,2,France,0,4,1,1,0
7,Nacho Monreal,Arsenal,31,LB,3,13.0,555,5.5,4.70%,115,2,Spain,0,4,1,1,0
8,Shkodran Mustafi,Arsenal,25,CB,3,30.0,1877,5.5,4.00%,90,2,Germany,0,3,1,1,1


So now the dataframe starts from index 4. We can use it for the columns as well

### Null Values in the DataFrames

In [73]:
df.isnull().sum()

name            0
club            0
age             0
position        1
position_cat    0
market_value    3
page_views      0
fpl_value       0
fpl_sel         0
fpl_points      0
region          0
nationality     0
new_foreign     0
age_cat         0
club_id         0
big_club        0
new_signing     0
dtype: int64

How many NAs are there in this dataframe?

In [74]:
np.count_nonzero(df.isnull())

4

Which records have NAs in this dataframe?

In [75]:
df.isna().values

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [76]:
df[df.isna().values]

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
30,Granit Xhaka,Arsenal,24,,2,,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
30,Granit Xhaka,Arsenal,24,,2,,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
192,Steve Mounie,Huddersfield,22,CF,1,,56,6.0,0.60%,0,2,Benin,0,2,8,0,0
195,Kasper Schmeichel,Leicester+City,30,GK,4,,1601,5.0,2.40%,109,2,Denmark,0,4,9,0,0


There is one duplicate in the dataframe above (index 30)

In [77]:
df[df.isnull().values].drop_duplicates()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
30,Granit Xhaka,Arsenal,24,,2,,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
192,Steve Mounie,Huddersfield,22,CF,1,,56,6.0,0.60%,0,2,Benin,0,2,8,0,0
195,Kasper Schmeichel,Leicester+City,30,GK,4,,1601,5.0,2.40%,109,2,Denmark,0,4,9,0,0


### Dropping and Filling NAs in DataFrames

In [78]:
df.fillna(method='ffill').loc[[30,192,195]]

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
30,Granit Xhaka,Arsenal,24,LB,2,15.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
192,Steve Mounie,Huddersfield,22,CF,1,3.0,56,6.0,0.60%,0,2,Benin,0,2,8,0,0
195,Kasper Schmeichel,Leicester+City,30,GK,4,30.0,1601,5.0,2.40%,109,2,Denmark,0,4,9,0,0


We can specify the custom values

In [79]:
df.fillna(value={'market_value':100, 'position':'GK'}).loc[[30,192,195]]

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
30,Granit Xhaka,Arsenal,24,GK,2,100.0,1815,5.5,2.00%,85,2,Switzerland,0,2,1,1,0
192,Steve Mounie,Huddersfield,22,CF,1,100.0,56,6.0,0.60%,0,2,Benin,0,2,8,0,0
195,Kasper Schmeichel,Leicester+City,30,GK,4,100.0,1601,5.0,2.40%,109,2,Denmark,0,4,9,0,0


In [80]:
%timeit df.drop_duplicates(subset=['age', 'club', 'position']).loc[:,['age', 'position']]

2.74 ms ± 492 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [81]:
%timeit df.drop_duplicates(subset=['age', 'club', 'position'])[['age', 'position']]

2 ms ± 408 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Calculating Aggregates using .agg()

In [82]:
df.select_dtypes(np.number).agg('mean')

age              26.798701
position_cat      2.179654
market_value     11.026253
page_views      766.051948
fpl_value         5.448052
fpl_points       57.374459
region            1.993506
new_foreign       0.034632
age_cat           3.203463
club_id          10.313853
big_club          0.305195
new_signing       0.145022
dtype: float64

Using agg('mean', etc.) must only int or float data types (number) or els will throw future warning

In [83]:
'a' < 'b'

True

In [84]:
print(f'ord "a": {ord("a")}')
print(f'ord "b": {ord("b")}')

ord "a": 97
ord "b": 98


In [85]:
df.select_dtypes(np.number).agg(['mean', 'min', 'max'])

Unnamed: 0,age,position_cat,market_value,page_views,fpl_value,fpl_points,region,new_foreign,age_cat,club_id,big_club,new_signing
mean,26.798701,2.179654,11.026253,766.051948,5.448052,57.374459,1.993506,0.034632,3.203463,10.313853,0.305195,0.145022
min,17.0,1.0,0.05,3.0,4.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
max,38.0,4.0,75.0,7664.0,12.5,264.0,4.0,1.0,6.0,20.0,1.0,1.0


### Same-Shape Transforms

Unlike .agg() method, that change the shape of the dataframes. But there is a dedicated method used for applying custom function to the entire dataframe without changing the shape

Let's try. Now we want to convert the currency from USD to Euro. 1 usd is 0.93 euro

In [86]:
df.loc[:,['market_value', 'fpl_value']].transform(lambda x:x*0.93).head()

Unnamed: 0,market_value,fpl_value
0,60.45,11.16
1,46.5,8.835
2,6.51,5.115
3,18.6,6.975
4,20.46,5.58


In [87]:
def euro_converter(x):
    return x*0.93
df.loc[:,['market_value', 'fpl_value']].transform(euro_converter).head()

Unnamed: 0,market_value,fpl_value
0,60.45,11.16
1,46.5,8.835
2,6.51,5.115
3,18.6,6.975
4,20.46,5.58


The two codes above produce similar results

In [88]:
from random import choice

### Building a pandas string wrapper function

the function should: 
- apply a random string capitalization method
- to a sequence of values
- return the transformed sequence

In [89]:
def random_case(x):
    funcs=[x.str.upper, x.str.lower, x.str.swapcase]
    return choice(funcs)()

return choice(funcs)() the parenthesis at the end is to invoke the function

In [90]:
df.select_dtypes(include='object').transform(random_case).head(3)

Unnamed: 0,name,club,position,fpl_sel,nationality
0,aLEXIS sANCHEZ,ARSENAL,lw,17.10%,cHILE
1,mESUT oZIL,ARSENAL,am,5.60%,gERMANY
2,pETR cECH,ARSENAL,gk,5.90%,cZECH rEPUBLIC


Conclusion trasnfrom dataframe --> apply a transformation to a dataframe without reshaping it

### More Flexible with .apply() Method

The dataframe apply method gives us the opportunity to apply a function across the entire dataframe one row or one column at a time

In [91]:
df.select_dtypes(include='object').apply(random_case).head()

Unnamed: 0,name,club,position,fpl_sel,nationality
0,alexis sanchez,aRSENAL,LW,17.10%,CHILE
1,mesut ozil,aRSENAL,AM,5.60%,GERMANY
2,petr cech,aRSENAL,GK,5.90%,CZECH REPUBLIC
3,theo walcott,aRSENAL,RW,1.50%,ENGLAND
4,laurent koscielny,aRSENAL,CB,0.70%,FRANCE


We want to modify all of our floating point columns and run them to the closest integer without actually convert them into an ingeter

In [92]:
def round_float(x):
    if x.dtype=='float':
        return round(x)
    return x #if not, we will return the series unchanged

regarding the func above. This function will be applied to the entire dataframe, so to avoid any TypeError we need to check first the dtype any incoming argument

round_float(x). This x will be assigned to an entire row or column. Which means that x will be a series when this funcion runs

In [93]:
df.select_dtypes('float').head()

Unnamed: 0,market_value,fpl_value
0,65.0,12.0
1,50.0,9.5
2,7.0,5.5
3,20.0,7.5
4,22.0,6.0


In [94]:
df.select_dtypes('float').transform(round_float).head()

Unnamed: 0,market_value,fpl_value
0,65.0,12.0
1,50.0,10.0
2,7.0,6.0
3,20.0,8.0
4,22.0,6.0


Apply and transform are legitimately different methods. The reason is that apply() is a little bit more flexible and powerful

.apply() has more genral scope specifically in that it supports aggregations as well as same shape transforms

We can think of the dataframe apply method as a combination of __transform__ and __aggregate__ in that a single function, a single piece of syntax supports both types of operations

so .apply() = .agg() + .transform()

For example:

In [95]:
df.select_dtypes(np.number).agg('mean').head(3)

age             26.798701
position_cat     2.179654
market_value    11.026253
dtype: float64

We can do that with .apply() method as well. But not with .transform()

In [96]:
df.select_dtypes(np.number).apply('mean').head(3)

age             26.798701
position_cat     2.179654
market_value    11.026253
dtype: float64

In [97]:
df.select_dtypes(np.number).apply('mean', axis=1)

0      392.333333
1      388.208333
2      143.708333
3      214.875000
4       91.916667
          ...    
460     31.875000
461     24.791667
462     23.750000
463     39.875000
464     24.708333
Length: 462, dtype: float64

Now we want to calculate the mean manually for the index 460

Select all but the object dtype

In [98]:
df.loc[460,[i !='object' for i in df.dtypes]].mean()

31.875

### Element-wise Operations with .applymap()

The .agg(), .apply(), and .transform() method operate on entire rows or columns at once. They take advantage of this concept of vectorized operations, which is a numpy feature (vectorized ops)

The .applymap() method can accept and operate on a individual value --> operating one item at a time (Non Vectorize ops)

Example: we want to increase the market value by 2% --> *1.02 and log the 100th passed value

In [99]:
from datetime import datetime
counter = 0

def log_transform(x):
    global counter
    counter +=1
    if counter % 100 == 0:
        print(f"It's {datetime.now()} and I just ajusted the {counter}th value.")
    return x*1.02

In [100]:
df.select_dtypes('float').applymap(log_transform)

It's 2023-09-11 09:00:53.831614 and I just ajusted the 100th value.
It's 2023-09-11 09:00:53.832282 and I just ajusted the 200th value.
It's 2023-09-11 09:00:53.832511 and I just ajusted the 300th value.
It's 2023-09-11 09:00:53.832637 and I just ajusted the 400th value.
It's 2023-09-11 09:00:53.833591 and I just ajusted the 500th value.
It's 2023-09-11 09:00:53.833727 and I just ajusted the 600th value.
It's 2023-09-11 09:00:53.833830 and I just ajusted the 700th value.
It's 2023-09-11 09:00:53.833930 and I just ajusted the 800th value.
It's 2023-09-11 09:00:53.834030 and I just ajusted the 900th value.


Unnamed: 0,market_value,fpl_value
0,66.30,12.24
1,51.00,9.69
2,7.14,5.61
3,20.40,7.65
4,22.44,6.12
...,...,...
460,5.10,4.59
461,7.14,4.59
462,4.59,4.59
463,1.02,4.59


Now what will happend if we use .apply() --> vectorized ops

.applymap() will work only on dataframe and not on series

In [101]:
df.select_dtypes('float').apply(log_transform)

Unnamed: 0,market_value,fpl_value
0,66.30,12.24
1,51.00,9.69
2,7.14,5.61
3,20.40,7.65
4,22.44,6.12
...,...,...
460,5.10,4.59
461,7.14,4.59
462,4.59,4.59
463,1.02,4.59


The counter will not reach 100th so therefore no print statement will happend

Mini challenge:

In [102]:
def popular(x):
    if x<200:
        return 'rel unknoqn'
    elif x>=220 and x<600:
        return 'kind of popular'
    elif x>=600 and x<2000:
        return 'popular'
    elif x>2000:
        return 'super popular'

In [103]:
df['page_views'].apply(popular)

0        super popular
1        super popular
2              popular
3        super popular
4              popular
            ...       
460    kind of popular
461        rel unknoqn
462        rel unknoqn
463    kind of popular
464               None
Name: page_views, Length: 462, dtype: object

### Setting DataFrame Values

In [104]:
df.head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,RW,1,20.0,2393,7.5,1.50%,122,1,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2,France,0,4,1,1,0


Now we want to change the index 3 - Theo Walcott and the columns 3 - position to 'AM'

In [105]:
df.iat[3,3]='AM'

We can as well using iloc or loc and the equal sign. But if it is a single value better using iat (similar to iloc) or at(similar to loc)

In [106]:
df.head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3,Chile,0,4,1,1,0
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2,Germany,0,4,1,1,0
2,Petr Cech,Arsenal,35,GK,4,7.0,1529,5.5,5.90%,134,2,Czech Republic,0,6,1,1,0
3,Theo Walcott,Arsenal,28,AM,1,20.0,2393,7.5,1.50%,122,1,England,0,4,1,1,0
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2,France,0,4,1,1,0


In [107]:
df.at[2,'page_views'] = 2001

In [108]:
df['page_views'][2]

2001

But if we use square bracket assignment it will throw an error

### Setting With Copy Warning

In [109]:
df['page_views'][3] = 2001

We get this warning is because pandas cannot guarantee that we are working with the actual dataframe. This known as __chain__ __indexing__

So we are using assignment using __chain__ __indexing__, so we can call it __chain__ __assignment__

drop_duplicates() will return a copy of the dataframe but not the dataframe itself (if we dont include inplace=True)

In [110]:
df.drop_duplicates()['page_views'][2]=1

So that's why we get a setting with copy warning

### Copy vs View

- Pandas loves to give us a copy
- If we use a 'loc/iloc or at/iat' indexer, we are guaranteed to get a view
- We will get a copy if 'inplace' parameter set to be False

### Adding Columns to a DataFrame

df.assign() --> assign new column to a dataframe. Return a new object with all original columns in addition to the new ones

But the existing columns that are re-assigned will be overwritten

In [111]:
df.assign(test=[1]*len(df)).head()

Unnamed: 0,name,club,age,position,position_cat,market_value,page_views,fpl_value,fpl_sel,fpl_points,region,nationality,new_foreign,age_cat,club_id,big_club,new_signing,test
0,Alexis Sanchez,Arsenal,28,LW,1,65.0,4329,12.0,17.10%,264,3,Chile,0,4,1,1,0,1
1,Mesut Ozil,Arsenal,28,AM,1,50.0,4395,9.5,5.60%,167,2,Germany,0,4,1,1,0,1
2,Petr Cech,Arsenal,35,GK,4,7.0,2001,5.5,5.90%,134,2,Czech Republic,0,6,1,1,0,1
3,Theo Walcott,Arsenal,28,AM,1,20.0,2001,7.5,1.50%,122,1,England,0,4,1,1,0,1
4,Laurent Koscielny,Arsenal,31,CB,3,22.0,912,6.0,0.70%,121,2,France,0,4,1,1,0,1


Similar method is .insert(). This method doesn't produce a copy. but directly change the dataframe

### Adding Rows to a DataFrame

In [136]:
df_mini=df.iloc[0:3,1:4]

In [137]:
df_mini

Unnamed: 0,club,age,position
0,Arsenal,28,LW
1,Arsenal,28,AM
2,Arsenal,35,GK


In [138]:
cr7=pd.Series({
    'club': 'Chelsea',
    'age':35,
    'position':'FW'
}, name=4)

We can use append to add a row in a dataframe

In [139]:
df_mini.append(cr7)

Unnamed: 0,club,age,position
0,Arsenal,28,LW
1,Arsenal,28,AM
2,Arsenal,35,GK
4,Chelsea,35,FW


.append() can accept a series or a dataframe

In [140]:
cr8=pd.DataFrame({
    'club': ['Bayern', 'München'],
    'age':[35,36],
    'position':['FW', 'RW']
}, index=[5,6])

In [141]:
df_mini.append(cr8)

Unnamed: 0,club,age,position
0,Arsenal,28,LW
1,Arsenal,28,AM
2,Arsenal,35,GK
5,Bayern,35,FW
6,München,36,RW


In [143]:
df_mini

Unnamed: 0,club,age,position
0,Arsenal,28,LW
1,Arsenal,28,AM
2,Arsenal,35,GK


.append() only add a copy not a view. So the underlying dataframe is unchanged, and .append() method doesn't have an inplace=True parameter

There is another way, that sometimes used by pandas user to add a column or row

The name is __setting with enlarement__. for example: df['a'] == 1

In [144]:
df_mini.loc[7] = 'abc'

In [145]:
df_mini

Unnamed: 0,club,age,position
0,Arsenal,28,LW
1,Arsenal,28,AM
2,Arsenal,35,GK
7,abc,abc,abc


So what we are doing above is __setting with entlargement__

Adding rows to a dataframe is inefficient and it is a very expensive operation. Wether we are using setting with entlargement or with .append() mehtod