In [1]:
import numpy as np
import pandas as pd


In [2]:
# series, dataframe & index 
# SERIES
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
print(type(data))

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [4]:
print(data.values)
print(type(data.values))
print(data.index)
print(type(data.index))

[0.25 0.5  0.75 1.  ]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>


In [6]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data[0])
print(data[1:3])

0.25
1    0.50
2    0.75
dtype: float64


In [7]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

In [8]:
print(data)

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [12]:
print(data['a'])
print(data['b':'d'])
print(type(data.index))

0.25
b    0.50
c    0.75
d    1.00
dtype: float64
<class 'pandas.core.indexes.base.Index'>


In [16]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 10, 7, 'd'])
print(data[1])
print(data[10:'d'])

0.25
10    0.50
7     0.75
d     1.00
dtype: float64


In [18]:
population_dict = {'city1':1000, 
                   'city2':1001,
                   'city3':1002, 
                   'city4':1003,
                   'city5':1004, 
                   'city6':1005,
                  }
population = pd.Series(population_dict)
print(population)
print(population['city4'])
print(population['city4':'city5'])
# для создания series можно использовать: списки python или массивы numpy,
# cкалярные значения, словари

city1    1000
city2    1001
city3    1002
city4    1003
city5    1004
city6    1005
dtype: int64
1003
city4    1003
city5    1004
dtype: int64


In [20]:
# dataframe - двумерный массив с явно определенными индексами. 
# последовательность согласованных объектов series

area_dict = {'city1':9991, 
                   'city2':9992,
                   'city3':9993, 
                   'city4':9994,
                   'city5':9995, 
                   'city6':9996,
                  }
population = pd.Series(population_dict)
area = pd.Series(area_dict)

states = pd.DataFrame({'population1': population, 'area1':area})
print(states)
print(states.values)
print(states.index)
print(states.columns)

       population1  area1
city1         1000   9991
city2         1001   9992
city3         1002   9993
city4         1003   9994
city5         1004   9995
city6         1005   9996
[[1000 9991]
 [1001 9992]
 [1002 9993]
 [1003 9994]
 [1004 9995]
 [1005 9996]]
Index(['city1', 'city2', 'city3', 'city4', 'city5', 'city6'], dtype='object')
Index(['population1', 'area1'], dtype='object')


In [21]:
print(type(states.values))
print(type(states.index))
print(type(states.columns))

<class 'numpy.ndarray'>
<class 'pandas.core.indexes.base.Index'>
<class 'pandas.core.indexes.base.Index'>


In [24]:
print(states['area1'])
# dataframe способы создания: через series, словари, словари объектов series,
# двумерные массив numpy, структурированный массив numpy

city1    9991
city2    9992
city3    9993
city4    9994
city5    9995
city6    9996
Name: area1, dtype: int64


In [25]:
# index - способ организации ссылки на данные объектов series и dataframe 
# index - неизменяем, упорядочен, яв-ся
# мультимножеством (могут быть повторяющиеся значения)

In [26]:
ind = pd.Index([2, 3, 5, 7, 11])
print(ind[1])
print(ind[::2])
# index - следует соглашениям объекта set

3
Index([2, 5, 11], dtype='int64')


In [27]:
inda = pd.Index([1, 2, 3, 4, 5])
indb = pd.Index([2, 3, 4, 5, 6])
print(inda.intersection(indb))

Index([2, 3, 4, 5], dtype='int64')


In [29]:
# series выборка данных
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print('a' in data)
print('z' in data)

True
False


In [33]:
print(data.keys())
print(list(data.items()))

Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [34]:
data['a'] = 100
data['z'] = 1000
print(data)
# как словарь (выше)

a     100.00
b       0.50
c       0.75
d       1.00
z    1000.00
dtype: float64


In [38]:
# как одномерный массив
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data['a':'c'])
print(data[0:2])
print(data[(data > 0.5) & (data < 1)])
print(data[['a', 'd']])

a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
c    0.75
dtype: float64
a    0.25
d    1.00
dtype: float64


In [39]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 3, 10, 15])
print(data[1])
print(data.loc[1])
print(data.iloc[1])

0.25
0.25
0.5


In [43]:
# dataframe выборка данных 
area = pd.Series({'city1':9991, 
                   'city2':9992,
                   'city3':9993, 
                   'city4':9994,
                   'city5':9995, 
                   'city6':9996,
                  })
pop = pd.Series({'city1':1000, 
                   'city2':1001,
                   'city3':1002, 
                   'city4':1003,
                   'city5':1004, 
                   'city6':1005,
                  })
data = pd.DataFrame({'area':area, 'pop1':pop})
print(data)

       area  pop1
city1  9991  1000
city2  9992  1001
city3  9993  1002
city4  9994  1003
city5  9995  1004
city6  9996  1005


In [41]:
print(data.area)

city1    9991
city2    9992
city3    9993
city4    9994
city5    9995
city6    9996
Name: area, dtype: int64


In [44]:
print(data.pop1 is data['pop1'])
# если бы назвали pop, результат был бы false

True


In [None]:
data['new'] = data['area1']
print(data)

In [47]:
# как двумерный массив
data = pd.DataFrame({"area1":area, "pop1": pop})

In [48]:
print(data)
print(data.values)
print(data.T)
print(data['area1'])

       area1  pop1
city1   9991  1000
city2   9992  1001
city3   9993  1002
city4   9994  1003
city5   9995  1004
city6   9996  1005
[[9991 1000]
 [9992 1001]
 [9993 1002]
 [9994 1003]
 [9995 1004]
 [9996 1005]]
       city1  city2  city3  city4  city5  city6
area1   9991   9992   9993   9994   9995   9996
pop1    1000   1001   1002   1003   1004   1005
city1    9991
city2    9992
city3    9993
city4    9994
city5    9995
city6    9996
Name: area1, dtype: int64


In [49]:
print(data.values[0])

[9991 1000]


In [50]:
print(data.values[0:3])

[[9991 1000]
 [9992 1001]
 [9993 1002]]


In [54]:
# атрибуты индикаторы 
data = pd.DataFrame({"area1":area, "pop1": pop, 'pop':pop})
print(data.iloc[:3, 1:2])
print(data.loc[:'city4', 'pop1':'pop'])


       pop1
city1  1000
city2  1001
city3  1002
       pop1   pop
city1  1000  1000
city2  1001  1001
city3  1002  1002
city4  1003  1003


In [58]:
print(data.loc[data['pop']> 1002, ['area1', 'pop']])

       area1   pop
city4   9994  1003
city5   9995  1004
city6   9996  1005


In [59]:
data.iloc[0, 2] = 999999
print(data)

       area1  pop1     pop
city1   9991  1000  999999
city2   9992  1001    1001
city3   9993  1002    1002
city4   9994  1003    1003
city5   9995  1004    1004
city6   9996  1005    1005


In [61]:
rng = np.random.default_rng()
s = pd.Series(rng.integers(0, 10, 4))
print(s)

0    5
1    0
2    5
3    9
dtype: int64


In [63]:
print(np.exp(s))

0     148.413159
1       1.000000
2     148.413159
3    8103.083928
dtype: float64


In [64]:
area = pd.Series({'city1':9991, 
                   'city2':9992,
                   'city3':9993, 
                   'city41':9994,
                   'city51':9995, 
                   'city6':9996,
                  })
pop = pd.Series({'city1':1000, 
                   'city2':1001,
                   'city3':1002, 
                   'city42':1003,
                   'city52':1004, 
                   'city6':1005,
                  })
data = pd.DataFrame({'area':area, 'pop1':pop})
print(data)

          area    pop1
city1   9991.0  1000.0
city2   9992.0  1001.0
city3   9993.0  1002.0
city41  9994.0     NaN
city42     NaN  1003.0
city51  9995.0     NaN
city52     NaN  1004.0
city6   9996.0  1005.0


In [65]:
dfa = pd.DataFrame(rng.integers(0, 10, (2, 2)), columns=['a', 'b'])
dfb = pd.DataFrame(rng.integers(0, 10, (3, 3)), columns=['a', 'b', 'c'])
print(dfa)
print(dfb)

   a  b
0  5  2
1  7  8
   a  b  c
0  9  8  6
1  2  5  4
2  0  1  7


In [66]:
print(dfa+dfb)

      a     b   c
0  14.0  10.0 NaN
1   9.0  13.0 NaN
2   NaN   NaN NaN


In [67]:
rng = np.random.default_rng(1)
a = rng.integers(0, 10, (3, 4))
print(a)
print(a[0])
print(a - a[0])

[[4 5 7 9]
 [0 1 8 9]
 [2 3 8 4]]
[4 5 7 9]
[[ 0  0  0  0]
 [-4 -4  1  0]
 [-2 -2  1 -5]]


In [68]:
df = pd.DataFrame(a, columns =['a', 'b', 'c', 'd'])
print(df.iloc[0])

a    4
b    5
c    7
d    9
Name: 0, dtype: int64


In [69]:
print(df - df.iloc[0])

   a  b  c  d
0  0  0  0  0
1 -4 -4  1  0
2 -2 -2  1 -5


In [70]:
print(df.iloc[0, ::2])

a    4
c    7
Name: 0, dtype: int64


In [71]:
print(df - df.iloc[0, ::2])

     a   b    c   d
0  0.0 NaN  0.0 NaN
1 -4.0 NaN  1.0 NaN
2 -2.0 NaN  1.0 NaN


In [72]:
# два способа хранить отсутствующие значения: индикаторы NaN, None 
# Null
# none - объект. не работает с sum, min 
val1 = np.array([1, 2, 3])
print(val1.sum())

6


In [75]:
val1 = np.array([1, 2, 3, np.nan])
print(val1.sum())
print(np.nansum(val1))

nan
6.0


In [74]:
# так нельзя:
val1 = np.array([1, 2, 3, None])
print(val1.sum())

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [77]:
x = pd.Series(range(10), dtype=int)
print(x)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32


In [79]:
x[0] = None
x[1] = np.nan
print(x)
x1 = pd.Series(['a', 'b', 'c'])
x1[0] = None
x1[1] = np.nan

0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64


In [80]:
x2 = pd.Series([1, 2, 3, np.nan, None, pd.NA], dtype='Int32')

In [81]:
print(x2)

0       1
1       2
2       3
3    <NA>
4    <NA>
5    <NA>
dtype: Int32


In [86]:
print(x2.isnull())
print(x2[x2.notnull()])

0    False
1    False
2    False
3     True
4     True
5     True
dtype: bool
0    1
1    2
2    3
dtype: Int32


In [87]:
print(x2.dropna())

0    1
1    2
2    3
dtype: Int32


In [90]:
df = pd.DataFrame([
    [1, 2, 3, np.nan, None, pd.NA],
    [1,2, 3, 4, 5, 6], 
    [1, np.nan, 3, 4, np.nan, 6]])

In [91]:
print(df)

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6


In [92]:
print(df.dropna())

   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6


In [93]:
print(df.dropna(axis=0))
print(df.dropna(axis=1))

   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0  2
0  1  3
1  1  3
2  1  3


In [95]:
df = pd.DataFrame([
    [1, 2, 3, np.nan, None, pd.NA],
    [1,2, 3, 4, 5, 6], 
    [1, np.nan, 3, None, np.nan, 6]])
print(df.dropna(axis=1, how='all')) # все значения нан
print(df.dropna(axis=1, how='any')) # хотя бы одно нан
print(df.dropna(axis=1, thresh=2)) # хотя бы x НЕ нан

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  NaN  NaN     6
   0  2
0  1  3
1  1  3
2  1  3
   0    1  2     5
0  1  2.0  3  <NA>
1  1  2.0  3     6
2  1  NaN  3     6
