<a href="https://colab.research.google.com/github/agagawrys/data-science-bootcamp/blob/main/02_analiza_danych/01_pandas_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

* @author: krakowiakpawel9@gmail.com  
* @site: e-smartdata.org

### Pandas
>Strona biblioteki: [https://pandas.pydata.org/](https://pandas.pydata.org/)  
>Dokumentacja: [https://pandas.pydata.org/pandas-docs/stable/](https://pandas.pydata.org/pandas-docs/stable/)
>
>Podstawowa biblioteka do analizy danych w języku Python.
>
>Aby zainstalować bibliotekę Pandas, użyj polecenia poniżej:
```
pip install pandas
```
### Spis treści:
1. [Podstawowe struktury danych: pd.Series](#a1)
2. [Podstawowe struktury danych: pd.DataFrame](#a2)
3. [Selekcja kolumn](#a3)



In [18]:
# Import pandas under its conventional alias and display the installed version.
import pandas as pd
pd.__version__

'1.5.3'

### <a name='a1'></a>  Podstawowe struktury danych: pd.Series

In [2]:
# Series built from a plain list; no index is given, so default 0..n-1 integer labels are used.
s = pd.Series([2, 4, 1, 3])
s

0    2
1    4
2    1
3    3
dtype: int64

In [4]:
# Same values, now with explicit string labels and a name for the Series.
s = pd.Series(
    [2, 4, 1, 3],
    index=list('abcd'),
    name='sample',
)
s

a    2
b    4
c    1
d    3
Name: sample, dtype: int64

In [5]:
# A single float in the input is enough for the whole Series to be stored as float64.
s = pd.Series(
    [2.0, 4, 1, 3],
    index=list('abcd'),
    name='sample',
)
s

a    2.0
b    4.0
c    1.0
d    3.0
Name: sample, dtype: float64

In [19]:
# NumPy provides np.nan, the marker used for missing values in the examples that follow.
import numpy as np

np.nan

nan

In [7]:
# np.nan marks the missing entry for label 'b'; the Series is stored as float64.
s = pd.Series(
    [2.0, np.nan, 1, 3],
    index=list('abcd'),
    name='sample',
)
s

a    2.0
b    NaN
c    1.0
d    3.0
Name: sample, dtype: float64

In [8]:
# A list of Python booleans yields a Series of dtype bool.
s = pd.Series([True, False, False])
s

0     True
1    False
2    False
dtype: bool

In [28]:
# Integers 10..19 indexed by ten consecutive calendar days starting on 2020-01-01.
s = pd.Series(
    data=np.arange(10, 20),
    index=pd.date_range(start='20200101', periods=10),
)
s

2020-01-01    10
2020-01-02    11
2020-01-03    12
2020-01-04    13
2020-01-05    14
2020-01-06    15
2020-01-07    16
2020-01-08    17
2020-01-09    18
2020-01-10    19
Freq: D, dtype: int64

In [16]:
# Materialise the index as a plain Python list of Timestamp objects.
list(s.index)

[Timestamp('2020-01-01 00:00:00', freq='D'),
 Timestamp('2020-01-02 00:00:00', freq='D'),
 Timestamp('2020-01-03 00:00:00', freq='D'),
 Timestamp('2020-01-04 00:00:00', freq='D'),
 Timestamp('2020-01-05 00:00:00', freq='D'),
 Timestamp('2020-01-06 00:00:00', freq='D'),
 Timestamp('2020-01-07 00:00:00', freq='D'),
 Timestamp('2020-01-08 00:00:00', freq='D'),
 Timestamp('2020-01-09 00:00:00', freq='D'),
 Timestamp('2020-01-10 00:00:00', freq='D')]

In [29]:
# Element dtype of the Series.
s.dtypes

dtype('int64')

In [20]:
# A Series of strings with a name and the default integer index.
s = pd.Series(['python', 'java', 'sql'], name='languages')
s

0    python
1      java
2       sql
Name: languages, dtype: object

In [None]:
# Check the Python type of `s`.
type(s)

In [23]:
# The index labels; because no explicit index was passed this is a RangeIndex.
s.index

RangeIndex(start=0, stop=3, step=1)

In [25]:
# The underlying data as a NumPy array.
s.values

array(['python', 'java', 'sql'], dtype=object)

In [26]:
# Element dtype of the string Series.
s.dtypes

dtype('O')

In [30]:
# Shape tuple of the Series: one dimension holding the number of elements.
s.shape

(3,)

In [44]:
# A dict input turns keys into index labels; the np.nan entry makes the Series float64.
price = pd.Series(
    {
        'Apple': 200,
        'CD Projekt': 60,
        'Amazon': 1900,
        'KGHM': np.nan,
    }
)
price

Apple          200.0
CD Projekt      60.0
Amazon        1900.0
KGHM             NaN
dtype: float64

In [34]:
# Label-based lookup of a single element.
price['CD Projekt']

60.0

In [35]:
# Positional lookup of the second element. Use .iloc for positions: a bare integer key
# on a string-labelled Series relies on a positional fallback that pandas 2.1+ deprecates.
price.iloc[1]

60.0

In [45]:
# Number of non-missing values (the NaN entry is not counted).
price.count()

3

In [47]:
# Frequency of each distinct value; dropna=False also reports the NaN entry.
price.value_counts(dropna=False)

200.0     1
60.0      1
1900.0    1
NaN       1
dtype: int64

In [38]:
# Sum of the values; the NaN entry is skipped.
price.sum()

2160.0

In [39]:
# Smallest non-missing value.
price.min()

60.0

In [40]:
# Largest non-missing value.
price.max()

1900.0

In [41]:
# Standard deviation of the non-missing values.
price.std()

1024.3046421841502

In [48]:
# Summary statistics: count, mean, std, min, quartiles and max.
price.describe()

count       3.000000
mean      720.000000
std      1024.304642
min        60.000000
25%       130.000000
50%       200.000000
75%      1050.000000
max      1900.000000
dtype: float64

In [43]:
# .T on a Series changes nothing: the result is identical to the plain describe() call.
price.describe().T

count       3.000000
mean      720.000000
std      1024.304642
min        60.000000
25%       130.000000
50%       200.000000
75%      1050.000000
max      1900.000000
dtype: float64

In [50]:
# The two largest values, in descending order.
price.nlargest(2)

Amazon    1900.0
Apple      200.0
dtype: float64

In [52]:
# The two smallest values, in ascending order.
price.nsmallest(2)

CD Projekt     60.0
Apple         200.0
dtype: float64

In [57]:
# Rank of each value in ascending order (1 = smallest); the NaN entry stays unranked.
price.rank()

Apple         2.0
CD Projekt    1.0
Amazon        3.0
KGHM          NaN
dtype: float64

In [54]:
# Ascending sort by value; NaN is placed last.
price.sort_values()

CD Projekt      60.0
Apple          200.0
Amazon        1900.0
KGHM             NaN
dtype: float64

In [55]:
# Descending sort by value; NaN is still placed last.
price.sort_values(ascending=False)

Amazon        1900.0
Apple          200.0
CD Projekt      60.0
KGHM             NaN
dtype: float64

In [61]:
# Multiply every price by a fixed rate. Plain Series arithmetic is vectorised and
# propagates NaN, so no element-wise apply/lambda is needed.
# NOTE(review): 3.98 is presumably an exchange rate into PLN (going by the name price_pln) — confirm.
price_pln = price * 3.98

In [62]:
# Show the scaled values; the NaN entry stays NaN.
price_pln

Apple          796.0
CD Projekt     238.8
Amazon        7562.0
KGHM             NaN
dtype: float64

In [63]:
# The original Series is unchanged: the previous step returned a new Series.
price

Apple          200.0
CD Projekt      60.0
Amazon        1900.0
KGHM             NaN
dtype: float64

### <a name='a2'></a>  Podstawowe struktury danych: pd.DataFrame


In [64]:
# One-dimensional list -> single-column DataFrame with default integer row and column labels.
df = pd.DataFrame([12, 12, 32])
df

Unnamed: 0,0
0,12
1,12
2,32


In [69]:
# Same data with explicit row labels and a named column.
df = pd.DataFrame(
    [12, 12, 32],
    index=['first', 'second', 'third'],
    columns=['col_1'],
)
df

Unnamed: 0,col_1
first,12
second,12
third,32


In [72]:
# Dict of lists: each key becomes a column and each list holds that column's values.
df = pd.DataFrame({
    'WIG20': ['PKN ORLEN', 'PKO BP'],
    'mWIG40': ['Amica', 'Playway'],
})
df

Unnamed: 0,WIG20,mWIG40
0,PKN ORLEN,Amica
1,PKO BP,Playway


In [75]:
# Nested lists: each inner list is one row; labels for both axes are given explicitly.
df = pd.DataFrame(
    [[10, 20, 32], [12, 12, 23]],
    index=['first', 'second'],
    columns=['col_1', 'col_2', 'col_3'],
)
df

Unnamed: 0,col_1,col_2,col_3
first,10,20,32
second,12,12,23


In [76]:
# The column labels as an Index object.
df.columns

Index(['col_1', 'col_2', 'col_3'], dtype='object')

In [77]:
# The underlying data as a two-dimensional NumPy array (one inner array per row).
df.values

array([[10, 20, 32],
       [12, 12, 23]])

In [78]:
# Concise overview: index, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, first to second
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col_1   2 non-null      int64
 1   col_2   2 non-null      int64
 2   col_3   2 non-null      int64
dtypes: int64(3)
memory usage: 172.0+ bytes


In [79]:
# Summary statistics computed separately for each column.
df.describe()

Unnamed: 0,col_1,col_2,col_3
count,2.0,2.0,2.0
mean,11.0,16.0,27.5
std,1.414214,5.656854,6.363961
min,10.0,12.0,23.0
25%,10.5,14.0,25.25
50%,11.0,16.0,27.5
75%,11.5,18.0,29.75
max,12.0,20.0,32.0


In [80]:
# Transposed summary: one row per column, one column per statistic.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col_1,2.0,11.0,1.414214,10.0,10.5,11.0,11.5,12.0
col_2,2.0,16.0,5.656854,12.0,14.0,16.0,18.0,20.0
col_3,2.0,27.5,6.363961,23.0,25.25,27.5,29.75,32.0


### <a name='a3'> </a> Selekcja kolumn

In [81]:
# Current state of df before the column-selection examples.
df

Unnamed: 0,col_1,col_2,col_3
first,10,20,32
second,12,12,23


In [82]:
# Single brackets with one label return the column as a Series.
df['col_1']

first     10
second    12
Name: col_1, dtype: int64

In [83]:
# Check the type of the single-bracket selection.
type(df['col_1'])

In [84]:
# A list of labels (double brackets) returns a DataFrame, even for a single column.
df[['col_1']]

Unnamed: 0,col_1
first,10
second,12


In [85]:
# Check the type of the double-bracket selection.
type(df[['col_1']])

In [93]:
# Rename all columns at once by assigning a new list of labels (this modifies df in place).
df.columns = ['a', 'sprzedaz_grudzien', 'c']
df

Unnamed: 0,a,sprzedaz_grudzien,c
first,10,20,32
second,12,12,23


In [94]:
# Attribute-style column access, equivalent to df['sprzedaz_grudzien']. It only works for labels
# that are valid Python identifiers and do not clash with existing DataFrame attributes.
df.sprzedaz_grudzien

first     20
second    12
Name: sprzedaz_grudzien, dtype: int64

In [95]:
# New column 'd' as the element-wise sum of columns 'a' and 'c'.
df['d'] = df['a'] + df['c']
df

Unnamed: 0,a,sprzedaz_grudzien,c,d
first,10,20,32,42
second,12,12,23,35


In [96]:
# Rebuild the two-row frame with the col_1..col_3 labels for the selection examples below.
df = pd.DataFrame(
    [[10, 20, 32], [12, 12, 23]],
    index=['first', 'second'],
    columns=['col_1', 'col_2', 'col_3'],
)
df

Unnamed: 0,col_1,col_2,col_3
first,10,20,32
second,12,12,23


In [97]:
# Label-based row selection; the row comes back as a Series indexed by column name.
df.loc['first']

col_1    10
col_2    20
col_3    32
Name: first, dtype: int64

In [98]:
# Position-based row selection; position 0 is the same row as label 'first'.
df.iloc[0]

col_1    10
col_2    20
col_3    32
Name: first, dtype: int64

In [99]:
# Single cell addressed by row label and column label.
df.loc['first', 'col_2']

20

In [101]:
# All rows (the ':' slice) of one column, returned as a Series.
df.loc[:, 'col_2']

first     20
second    12
Name: col_2, dtype: int64

In [102]:
# Single cell addressed by integer positions: row 0, column 1.
df.iloc[0, 1]

20