# Tutorial Pandas

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib as plt

## Dataframe

In [None]:
# Synthetic dataset: one row per calendar day, holding a randomly chosen
# name and a random integer between 1 and 10 (both ends included).
n = 1000
days = pd.date_range(start='2018-01-01', periods=n, freq='D')

# All the names are drawn first, then all the numbers (one draw per row).
names = [random.choice(['Angelo', 'Marco', 'Vittorio']) for _ in range(n)]
numbers = [random.randint(1, 10) for _ in range(n)]

# Column-oriented construction: each dict key becomes a column.
df = pd.DataFrame({'names': names, 'numbers': numbers}, index=days)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Print a concise summary: index type, columns, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics of the numeric columns.
df.describe()

In [None]:
# Column labels.
df.columns

In [None]:
# Row labels (the index the frame was built with).
df.index

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Values of the frame as a NumPy array (index and column labels are dropped).
df.to_numpy()

## Series

In [None]:
# A single column of a DataFrame is a Series.
names_series = df['names']
names_series

## Indexing

In [None]:
# integer-position indexing (iloc): first row, all columns
df.iloc[0, :]

In [None]:
# Rows at positions 1, 2 and 3 (the stop position is excluded), all columns.
df.iloc[1:4, :]

In [None]:
# Same rows, but only the column at position 0.
df.iloc[1:4, 0]

In [None]:
# label-based indexing (loc); the frame is displayed here to show its labels
df

In [None]:
# DataFrame result: the column selector is a list of labels.
# The row slice starts at the label '2018-01-04' and runs to the end.
df.loc['2018-01-04':, ['numbers']]

In [None]:
# Series result: the column selector is a single label.
df.loc['2018-01-04':, 'numbers']

In [None]:
# fast access to single elements with at (labels) and iat (positions)
df.at['2018-01-04', 'numbers']

In [None]:
# Scalar at row position 5, column position 1.
df.iat[5, 1]

In [None]:
# masks: an element-wise comparison yields a boolean Series
df['numbers'] > 5

In [None]:
# Count the rows selected by the mask; the same filter can also be written with loc.
# NOTE(review): iloc does not seem to accept a boolean Series directly
# (a plain boolean array is needed) — confirm before teaching it.
len(df[df['numbers'] > 5])

In [None]:
# WARNING!!!!!

# the parentheses are required because the element-wise logical operators
# (`&`, `|`) have a higher precedence than the comparison operators

len(df[(df['numbers'] > 5) & (df['numbers'] < 8)])

In [None]:
# WARNING

# The element-wise operators (`&`, `|`, `~`) must be used to combine masks.
# Python's `and` / `or` need a single truth value, which a Series does not
# have, so the expression below raises a ValueError.
# The error is caught and printed on purpose: it is the point of this cell,
# and catching it lets the notebook still run from top to bottom.

try:
    len(df[(df['numbers'] > 5) and (df['numbers'] < 8)])
except ValueError as err:
    print(f'{type(err).__name__}: {err}')

In [None]:
# Small random frame: 8 consecutive days x 4 columns.
dates = pd.date_range(start='1/1/2000', periods=8)
df = pd.DataFrame(
    np.random.randn(8, 4),
    columns=list('ABCD'),
    index=dates,
)

# Each row comes back as a Series indexed by the column labels.
for index, row in df.iterrows():
    print(f'index={index}, row={row}')

In [None]:
# Iterate over one column: every step yields an (index label, value) pair.
# Series.items() is used because Series.iteritems() no longer exists in
# pandas 2.x; items() is available on older versions as well.
for index, elem in df['A'].items():
    print(f'index={index}, elem={elem}')

## Unione
https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html

In [None]:
# Three frames with the same columns A-D and disjoint indexes (0-3, 4-7, 8-11).
# Every cell holds "<column letter><row number>".
df1 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

df2 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

df3 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

In [None]:
# Stack the frames one below the other; the original index labels are kept.
pd.concat((df1, df2, df3))

In [None]:
# Same columns A-D as before, but df3 now reuses the index labels 0-3 of df1.
df1 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

df2 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'},
    index=list(range(4, 8)),
)

df3 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(8, 12)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

In [None]:
# ignore_index=True discards the original labels and renumbers the rows,
# which avoids duplicated labels when the input indexes overlap.
pd.concat((df1, df2, df3), ignore_index=True)

In [None]:
# df1 has columns A-D on index 0-3; df2 has the unrelated columns E-H on
# index 4-7 (its values still use the letters A-D as prefix).
df1 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

df2 = pd.DataFrame(
    {
        column: [f'{prefix}{i}' for i in range(4, 8)]
        for column, prefix in zip('EFGH', 'ABCD')
    },
    index=list(range(4, 8)),
)

In [None]:
# The frames share no column: the result has the union of the columns,
# with NaN wherever a frame has no value for a column.
pd.concat((df1, df2))

In [None]:
# df1: columns A-D on index 0-3.
df1 = pd.DataFrame(
    {col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
    index=list(range(0, 4)),
)

# df2: a single column F whose index (0, 2, 4) only partly overlaps df1's.
df2 = pd.DataFrame(['A4', 'A5', 'A6'], index=[0, 2, 4], columns=['F'])

In [None]:
# Row-wise concatenation (default axis): rows are appended, columns are united.
pd.concat((df1, df2))

In [None]:
# Column-wise concatenation: rows are aligned on the index labels and every
# label found in either frame is kept (missing cells become NaN).
pd.concat((df1, df2), axis=1)

In [None]:
# join='inner' keeps only the index labels present in both frames.
pd.concat((df1, df2), axis=1, join='inner')

In [None]:
# Per join più complicati
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.join.html#pandas.DataFrame.join
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

## Allineamento

In [None]:
# First operand: labels 0, 1, 2.
a = pd.Series([10, 11, 12], index=[0, 1, 2])

In [None]:
# Second operand: labels 2, 3, 4 (only label 2 is shared with `a`).
b = pd.Series([10, 11, 12], index=[2, 3, 4])

In [None]:
# Arithmetic aligns the operands on their index labels, not on position:
# labels missing from either operand produce NaN in the result.
a + b

## Valori NaN

<img src="imgs/nan_operations.png" width="600">

In [None]:
# Load the GXI_X quotes using the 'Date' column as index.
# Forward slashes are used as path separators: they work on every OS,
# whereas backslashes inside a normal string literal start escape sequences.
df = pd.read_csv('data/FSE/GXI_X.csv', index_col='Date')

In [None]:
# The non-null counts printed here show which columns contain missing values.
df.info()

In [None]:
# to treat infinite values as NaN as well, the dedicated pandas option must be enabled
# NOTE(review): presumably `mode.use_inf_as_na`, which recent pandas versions deprecate — confirm
df.notna()

In [None]:
# Boolean frame that is True where a value is missing.
df.isna()

In [None]:
# Float series with missing values at the start and at the end.
s = pd.Series([np.nan, 2.0, 3.0, 4.0, np.nan, np.nan])

In [None]:
# Replace every NaN with a fixed value (a string here).
s.fillna('missing')

In [None]:
# Forward fill: propagate the last valid value, for at most one consecutive
# NaN (limit=1). ffill() is used because passing method= to fillna() is
# deprecated in recent pandas versions.
s.ffill(limit=1)

In [None]:
# Backward fill: propagate the next valid value backwards, for at most one
# consecutive NaN (limit=1). bfill() is used because passing method= to
# fillna() is deprecated in recent pandas versions.
s.bfill(limit=1)

In [None]:
# Wrap the Series in a single-column DataFrame.
df = pd.DataFrame(s)

In [None]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html#pandas.DataFrame.dropna
# Returns a frame without the rows that contain missing values.
df.dropna()

In [None]:
# Filling the values of a table: a random 10x3 frame with a block of NaN
# written into each column (the row labels are the default 0..9, so the
# label slices below are inclusive on both ends).
dff = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
dff.loc[3:4, 'A'] = np.nan
dff.loc[4:5, 'B'] = np.nan
dff.loc[5:7, 'C'] = np.nan

In [None]:
# Mean of every column.
dff.mean()

In [None]:
# Fill the holes of each column with the mean of that column
# (the fill values are matched to the columns by label).
dff.fillna(dff.mean())

In [None]:
# Same as above, but only the means of columns 'B' and 'C' are passed,
# so column 'A' keeps its NaN.
dff.fillna(dff.mean()['B':'C'])

In [None]:
# fill con interpolazione

## Plot
https://pandas.pydata.org/pandas-docs/stable/user_guide/visualization.html

In [None]:
# Load the GXI_X quotes with 'Date' parsed as datetime and used as index.
# Forward slashes are used as path separators: they work on every OS,
# whereas backslashes inside a normal string literal start escape sequences.
df = pd.read_csv('data/FSE/GXI_X.csv', index_col='Date', parse_dates=['Date'])

In [None]:
# Line plot of the 'Close' column against the index.
df['Close'].plot(figsize=(25,10))

In [None]:
# Plot every column except 'Turnover' and 'Traded Volume'.
df.loc[:, ~df.columns.isin(['Turnover', 'Traded Volume'])].plot(figsize=(25, 10))

## Rolling
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

In [None]:
# Load the GXI_X quotes with 'Date' parsed as datetime and used as index.
# Forward slashes are used as path separators: they work on every OS,
# whereas backslashes inside a normal string literal start escape sequences.
df = pd.read_csv('data/FSE/GXI_X.csv', index_col='Date', parse_dates=['Date'])

In [None]:
# Preview the first rows of the loaded quotes.
df.head()

In [None]:
# Raw 'Close' series, for comparison with the smoothed one.
df['Close'].plot(figsize=(15, 15))

In [None]:
# Rolling windows can also be defined by a time span instead of a number of
# elements. Here: centred 20-element window, averaged; min_periods=1 lets
# windows with fewer available values still produce a result.
smooth_close = (
    df['Close']
    .rolling(window=20, center=True, min_periods=1)
    .mean()
)
smooth_close

In [None]:
# Plot the rolling mean of 'Close'.
smooth_close.plot(figsize=(15, 15))

## Variabili categoriche

In [4]:

# NOTE(review): this URL is a Kaggle dataset page, not a direct CSV download;
# the ParserError reported for this cell is presumably pandas trying to parse
# that page. TODO: download train.csv and read it from a local path instead.
df = pd.read_csv('https://www.kaggle.com/lespin/house-prices-dataset?select=train.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 6, saw 2


## Groupby

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html

In [None]:
# Synthetic dataset for the groupby examples: one row per calendar day with
# a random name, a random number in [1, 10] and a random age in [30, 80].
n = 1000
days = pd.date_range(start='2018-01-01', periods=n, freq='D')

# The three lists are drawn one after the other, one draw per row.
names = [random.choice(['Angelo', 'Marco', 'Vittorio']) for _ in range(n)]
numbers = [random.randint(1, 10) for _ in range(n)]
ages = [random.randint(30, 80) for _ in range(n)]

# Column-oriented construction: each dict key becomes a column.
df = pd.DataFrame({'names': names, 'numbers': numbers, 'ages': ages}, index=days)

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Split the rows into one group per distinct value of 'names'.
grouped = df.groupby(by='names')

# Iterating a groupby yields (group key, sub-frame) pairs.
for name, group in grouped:
    print(name, group, sep='\n')

In [None]:
# Rows belonging to a single group, selected by its key.
grouped.get_group('Angelo')

<img src="imgs/aggregazione.png" width="400">

In [None]:
# Number of rows in each group.
grouped.size()

In [None]:
# Descriptive statistics computed separately for each group.
grouped.describe()

In [None]:
# Standard deviation of each column within each group.
grouped.std()

In [None]:
# Aggregation: collapse every group to a single row by summing each column.
# Naming the aggregation ('sum') selects pandas' built-in implementation
# instead of calling Python's sum() on every group; any callable can still
# be passed to agg() when no built-in aggregation fits.
df.groupby('names').agg('sum')

In [None]:
# Transformation: the result has the same shape as the input. Each value is
# standardized with the mean and standard deviation of its own group.
df.groupby(['names']).transform(lambda x: (x - x.mean()) / x.std())

In [None]:
# Filtering: keep the rows of the groups whose 'numbers' total is below 1940;
# groups that fail the condition are dropped as a whole.
df.groupby(['names']).filter(lambda x: x['numbers'].sum() < 1940)

## Alternative a Pandas

Limiti di Pandas:

* Non parallelizza i calcoli
* Carica l'intero Dataset in memoria

Alternative che affrontano questi limiti: Modin, Dask


# Esercizi

In [None]:
# load AAD_X (Date as index)
# The path is relative to the notebook, like the other data files read in
# this tutorial, so the cell does not depend on one machine's F: drive.
df = pd.read_csv('data/FSE/AAD_X.csv', parse_dates=['Date'], index_col='Date')

In [None]:
# visualizza la colonna open

In [None]:
# conta i nan nella colonna open e riempili con interpolazione lineare

In [None]:
# aggiungi una colonna con i ritorni giornalieri (relativi alla colonna open) (es 0.02 se +2%)

In [None]:
# calcola il ritorno mensile medio