# Essential Basic functionality

In [3]:
import pandas as pd
import numpy as np

# Row labels for `df`: eight consecutive daily timestamps starting 2000-01-01.
index = pd.date_range(start="1/1/2000", periods=8)

# Five standard-normal draws labelled "a" through "e".
# No seed is set in this cell, so the values change on every run.
s = pd.Series(data=np.random.randn(5), index=list("abcde"))

# 8x3 frame of standard-normal draws: one row per date, columns "A", "B", "C".
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    index=index,
    columns=list("ABC"),
)

df

Unnamed: 0,A,B,C
2000-01-01,0.226107,-0.27299,1.800063
2000-01-02,-0.427044,-0.613192,-1.114922
2000-01-03,0.798684,1.053494,-1.224048
2000-01-04,0.470741,0.305556,1.079824
2000-01-05,1.605109,0.719443,0.311213
2000-01-06,-1.508638,-0.738148,-1.166051
2000-01-07,0.426154,-1.971816,1.116023
2000-01-08,-0.578577,0.849573,-0.649928


## Attributes and Underlying Data

In [4]:
df[:2]

Unnamed: 0,A,B,C
2000-01-01,0.226107,-0.27299,1.800063
2000-01-02,-0.427044,-0.613192,-1.114922


In [7]:
# Lower-case every column label in place ("A" -> "a", ...), then display the frame.
df.columns = df.columns.str.lower()
df

Unnamed: 0,a,b,c
2000-01-01,0.226107,-0.27299,1.800063
2000-01-02,-0.427044,-0.613192,-1.114922
2000-01-03,0.798684,1.053494,-1.224048
2000-01-04,0.470741,0.305556,1.079824
2000-01-05,1.605109,0.719443,0.311213
2000-01-06,-1.508638,-0.738148,-1.166051
2000-01-07,0.426154,-1.971816,1.116023
2000-01-08,-0.578577,0.849573,-0.649928


In [8]:
s.array

<PandasArray>
[  2.616037742147975,  0.1859624174306764, -0.6288443218019035,
  0.5943662320483487, 0.20812108852049346]
Length: 5, dtype: float64

In [9]:
s.index.array

<PandasArray>
['a', 'b', 'c', 'd', 'e']
Length: 5, dtype: object

In [10]:
# Get the Series' values as a NumPy array with Series.to_numpy().

s.to_numpy()

array([ 2.61603774,  0.18596242, -0.62884432,  0.59436623,  0.20812109])

In [11]:
# np.asarray() also converts the Series to a NumPy array.

np.asarray(s)

array([ 2.61603774,  0.18596242, -0.62884432,  0.59436623,  0.20812109])

In [12]:
# Two consecutive days starting 2000-01-01, localized to the CET time zone.
ser = pd.Series(
    data=pd.date_range(start="2000", periods=2, tz="CET"),
)

# Requesting dtype=object yields an array of tz-aware Timestamp objects.
ser.to_numpy(dtype=object)

array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
       Timestamp('2000-01-02 00:00:00+0100', tz='CET')], dtype=object)

## Accelerated Operations

In [None]:
# pandas can accelerate certain types of binary numerical and boolean
# operations using the `numexpr` and `bottleneck` libraries.
#
# Both are enabled by default; each can be switched on or off through the
# corresponding option. The function is `pd.set_option` (singular) --
# `pd.set_options` does not exist and raises AttributeError.

pd.set_option("compute.use_bottleneck", False)
pd.set_option("compute.use_numexpr", False)

## Flexible binary operations

 - Broadcasting behaviour between higher-dimensional (e.g. DataFrame) and lower-dimensional (e.g. Series) objects.
 
  - Missing data in computations.

### Matching / broadcasting behaviour.

 - DataFrame has the methods `add()`, `sub()`, `mul()`, `div()` and the related reversed functions `radd()`, `rsub()`, ... for carrying out binary operations.

In [13]:
# Three partially overlapping Series: "one" has no "d" and "three" has no "a",
# so the aligned frame holds NaN in those two cells.
df = pd.DataFrame(
    data=dict(
        one=pd.Series(np.random.randn(3), index=list("abc")),
        two=pd.Series(np.random.randn(4), index=list("abcd")),
        three=pd.Series(np.random.randn(3), index=list("bcd")),
    )
)
df

Unnamed: 0,one,two,three
a,-0.19704,-0.459026,
b,-0.413446,-1.299196,0.629927
c,-0.494118,-0.804975,0.876716
d,,0.556723,1.096478


In [16]:
# Second row of the frame (label "b"), as a Series indexed by column name.
row = df.iloc[1]

# The "two" column, as a Series indexed by row label.
column = df["two"]

# Subtract `row` from every row of the frame, matching on column labels.
df.sub(row, axis="columns")

Unnamed: 0,one,two,three
a,0.216406,0.84017,
b,0.0,0.0,0.0
c,-0.080672,0.494221,0.246789
d,,1.855919,0.466551


In [17]:
df.sub(row, axis=1)

Unnamed: 0,one,two,three
a,0.216406,0.84017,
b,0.0,0.0,0.0
c,-0.080672,0.494221,0.246789
d,,1.855919,0.466551


In [18]:
df.sub(column, axis="index")

Unnamed: 0,one,two,three
a,0.261986,0.0,
b,0.885749,0.0,1.929123
c,0.310856,0.0,1.681691
d,,0.0,0.539755


In [19]:
df.sub(column, axis=0)

Unnamed: 0,one,two,three
a,0.261986,0.0,
b,0.885749,0.0,1.929123
c,0.310856,0.0,1.681691
d,,0.0,0.539755


In [20]:
# Work on a copy so the flat-indexed `df` is left untouched.
dfmi = df.copy()

# Replace the flat a-d index with a two-level ("first", "second") MultiIndex.
dfmi.index = pd.MultiIndex.from_tuples(
    [(1, "a"), (1, "b"), (1, "c"), (2, "a")], names=["first", "second"]
)

# Align `column` on the "second" level only, so both "a" rows use column["a"].
dfmi.sub(column, axis=0, level="second")

Unnamed: 0_level_0,Unnamed: 1_level_0,one,two,three
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,a,0.261986,0.0,
1,b,0.885749,0.0,1.929123
1,c,0.310856,0.0,1.681691
2,a,,1.015749,1.555504


In [22]:
# divmod() performs floor division and modulo at the same time, returning a two-tuple of the same type as the left-hand side.

s = pd.Series(np.arange(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int32

In [23]:
div, rem = divmod(s, 3)

div

0    0
1    0
2    0
3    1
4    1
5    1
6    2
7    2
8    2
9    3
dtype: int32

In [24]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

In [26]:
idx = pd.Index(np.arange(10))
idx

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int32')

In [27]:
rem

0    0
1    1
2    2
3    0
4    1
5    2
6    0
7    1
8    2
9    0
dtype: int32

### Missing Data / operations with fill values.

In [29]:
# Work on a copy so that `df` keeps its NaN at row "a", column "three".
df2 = df.copy()

# Fill the missing cell with a single .loc call. The chained form
# df2["three"]["a"] = 1.0 assigns into an intermediate object; it is not
# guaranteed to modify df2 and never does under Copy-on-Write.
df2.loc["a", "three"] = 1.0

# `df` is displayed to show that the original frame is unchanged.
df

Unnamed: 0,one,two,three
a,-0.19704,-0.459026,
b,-0.413446,-1.299196,0.629927
c,-0.494118,-0.804975,0.876716
d,,0.556723,1.096478


In [30]:
df2

Unnamed: 0,one,two,three
a,-0.19704,-0.459026,1.0
b,-0.413446,-1.299196,0.629927
c,-0.494118,-0.804975,0.876716
d,,0.556723,1.096478


In [31]:
df + df2

Unnamed: 0,one,two,three
a,-0.394081,-0.918052,
b,-0.826892,-2.598391,1.259854
c,-0.988237,-1.609949,1.753433
d,,1.113446,2.192956


In [32]:
df.add(df2, fill_value=0)

Unnamed: 0,one,two,three
a,-0.394081,-0.918052,1.0
b,-0.826892,-2.598391,1.259854
c,-0.988237,-1.609949,1.753433
d,,1.113446,2.192956


### Flexible Comparisons

In [33]:
# Series and DataFrame have the binary comparison methods `eq`, `ne`, `lt`, `gt`, `le` and `ge`,
# whose behaviour is analogous to the binary arithmetic operations described above:

df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [34]:
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


### Boolean Reductions

In [35]:
# You can apply the reductions `empty`, `any()`, `all()` and `bool` to summarize a boolean result.

(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [36]:
(df > 0).any()

one      False
two       True
three     True
dtype: bool

In [37]:
#You can reduce to a final boolean value.

(df > 0).any().any()

True

In [38]:
# Test whether a pandas object is empty via the `empty` property.
df.empty

False

In [39]:
pd.DataFrame(columns=list("ABC")).empty

True

### Comparing if Objects are Equivalent

In [40]:
# Test whether df + df and df * 2 produce the same result.

# A first idea is (df + df == df * 2).all(), but that comes out False for the columns containing NaN.

df + df == df * 2

Unnamed: 0,one,two,three
a,True,True,False
b,True,True,True
c,True,True,True
d,False,True,True


In [41]:
(df + df == df * 2).all()

# NaNs do not compare as equal.

one      False
two       True
three    False
dtype: bool

In [42]:
np.nan == np.nan

False

In [43]:
# NDFrames (Series and DataFrames) have an equals() method for testing equality, with NaNs in corresponding locations treated as equal.

(df + df).equals(df * 2)

True

In [46]:
# The Series or DataFrame index needs to be in the same order for equals() to return True:

df1 = pd.DataFrame({"col": ["foo", 0, np.nan]})

# Same label -> value pairs as df1, but stored in reverse row order.
df2 = pd.DataFrame({"col": [np.nan, 0, "foo"]}, index=[2, 1, 0])

df1.equals(df2)

False

In [47]:
df1.equals(df2.sort_index())

True

### Comparing array-like objects.

In [48]:
# You can conveniently perform element-wise comparisons when comparing a pandas data structure with scalar value:

pd.Series(["foo", "bar", "baz"]) == "foo"

0     True
1    False
2    False
dtype: bool

In [49]:
pd.Index(["foo", "bar", "baz"]) == "foo"

array([ True, False, False])

In [50]:
# pandas also handles element-wise comparisons between different array-like objects of the same length:

pd.Series(["foo", "bar", "baz"]) == pd.Index(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

In [51]:
pd.Series(["foo", "bar", "baz"]) == np.array(["foo", "bar", "qux"])

0     True
1     True
2    False
dtype: bool

- Trying to compare `Index` or `Series` objects of different lengths will raise a ValueError.

### Combining Overlapping DataSets.

 - The function implementing this operation is `combine_first()`

In [56]:
# Five-row frame with missing values scattered through both columns;
# these are the gaps that combine_first() will try to fill.
df1 = pd.DataFrame(
    {"A": [1.0, np.nan, 3.0, 5.0, np.nan], "B": [np.nan, 2.0, 3.0, np.nan, 6.0]}
)


df1

Unnamed: 0,A,B
0,1.0,
1,,2.0
2,3.0,3.0
3,5.0,
4,,6.0


In [57]:
# Six-row frame (one more row than df1) with its own missing values.
df2 = pd.DataFrame(
    {
        "A": [5.0, 2.0, 4.0, np.nan, 3.0, 7.0],
        "B": [np.nan, np.nan, 3.0, 4.0, 6.0, 8.0],
    }
)

df2

Unnamed: 0,A,B
0,5.0,
1,2.0,
2,4.0,3.0
3,,4.0
4,3.0,6.0
5,7.0,8.0


In [58]:
df1.combine_first(df2)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


### General DataFrame combine

 - The `combine_first()` method calls the more general `DataFrame.combine()`, which takes another DataFrame and a combiner function, aligns the two input DataFrames, and then passes the combiner function pairs of Series (i.e. columns whose names are the same).

In [59]:
# To reproduce combine_first() with the more general combine():

def combiner(x, y):
    """Return the values of ``x``, taking the value from ``y`` wherever ``x`` is missing."""
    x_is_missing = pd.isna(x)
    return np.where(x_is_missing, y, x)

df1.combine(df2, combiner)

Unnamed: 0,A,B
0,1.0,
1,2.0,2.0
2,3.0,3.0
3,5.0,4.0
4,3.0,6.0
5,7.0,8.0


## Descriptive Statistics

In [60]:
df

Unnamed: 0,one,two,three
a,-0.19704,-0.459026,
b,-0.413446,-1.299196,0.629927
c,-0.494118,-0.804975,0.876716
d,,0.556723,1.096478


In [61]:
df.mean(0)

one     -0.368202
two     -0.501618
three    0.867707
dtype: float64

In [62]:
# The `skipna` option signals whether to exclude missing data.

df.sum(0, skipna=False)

one           NaN
two     -2.006473
three         NaN
dtype: float64

In [63]:
df.sum(axis=1, skipna=True)

a   -0.656067
b   -1.082715
c   -0.422377
d    1.653201
dtype: float64

In [64]:
# Standardize each column: subtract the column mean, then divide by the column standard deviation.
ts_stand = (df - df.mean()) / df.std()

# Check: every standardized column has a standard deviation of 1.
ts_stand.std()

one      1.0
two      1.0
three    1.0
dtype: float64

In [65]:
# Standardize each row instead: axis=0 makes sub()/div() match the per-row mean and std on the row labels.
xs_stand = df.sub(df.mean(1), axis=0).div(df.std(1), axis=0)

# Check: every standardized row has a standard deviation of 1.
xs_stand.std(1)

a    1.0
b    1.0
c    1.0
d    1.0
dtype: float64

- Methods like `cumsum()` and `cumprod()` preserve the location of NaN values.
- This is somewhat different from `expanding()` and `rolling()`, where NaN behaviour is additionally governed by a `min_periods` parameter.

In [66]:
df.cumsum()

Unnamed: 0,one,two,three
a,-0.19704,-0.459026,
b,-0.610487,-1.758222,0.629927
c,-1.104605,-2.563196,1.506643
d,,-2.006473,2.603121


In [67]:
# Series.nunique() returns the number of unique non-NA values in a Series.

series = pd.Series(np.random.randn(500))

# Blank out positions 20..499, then overwrite positions 10..19 with the
# constant 5: ten random values plus one repeated value remain.
series.iloc[20:500] = np.nan
series.iloc[10:20] = 5

series.nunique()

11

### Summarizing data: describe

In [68]:
series = pd.Series(data=np.random.randn(1000))

# Set every other element (positions 0, 2, 4, ...) to NaN.
series.iloc[::2] = np.nan

# The reported count covers only the remaining non-NaN values.
series.describe()

count    500.000000
mean       0.035907
std        1.013158
min       -3.195468
25%       -0.659830
50%        0.077289
75%        0.716128
max        3.233550
dtype: float64

In [69]:
# 1000x5 frame of standard-normal draws with columns "a" through "e".
frame = pd.DataFrame(
    data=np.random.randn(1000, 5),
    columns=list("abcde"),
)

# Set every other row (positions 0, 2, 4, ...) to NaN across all columns.
frame.iloc[::2] = np.nan

frame.describe()

Unnamed: 0,a,b,c,d,e
count,500.0,500.0,500.0,500.0,500.0
mean,-0.11943,-0.027934,0.044898,0.009818,-0.005304
std,0.999534,0.962786,1.030733,1.031117,1.040974
min,-3.734683,-2.772225,-2.47537,-2.639059,-3.249207
25%,-0.777562,-0.633826,-0.667784,-0.75807,-0.769768
50%,-0.139996,-0.058921,0.020977,-0.002111,0.004859
75%,0.537493,0.578812,0.708664,0.727161,0.730049
max,4.286973,2.821411,3.12731,3.229645,2.916921
