
# 4: Einführung in pandas: Grundlegende Methoden



### Loading Packages

In [42]:
import pandas as pd
import numpy as np

---
## 1. pandas: Basic Functionality

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

---
### 1.1 Object Creation
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#object-creation

In [43]:
# Build a Series from a plain list. No index is given, so pandas assigns the
# default integer index 0..5; the np.nan entry makes the dtype float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [44]:
# Build a DataFrame from a mapping of column name -> values. The scalar entries
# ("A", "B", "F") are broadcast to the length of the array-like columns (4 rows),
# and each column keeps its own dtype.
data_frame = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.array([3, 3, 3, 3], dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
data_frame

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [45]:
# Build a DataFrame from a 6x4 NumPy array of standard-normal draws, indexed by
# six consecutive days and with columns labeled A-D. Seeding the global NumPy RNG
# first makes the draws reproducible.
np.random.seed(0)
dates = pd.date_range(start="20130101", periods=6)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,-0.205158,0.313068,-0.854096
2013-01-06,-2.55299,0.653619,0.864436,-0.742165


---
### 1.2 Selecting Data
- https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#selection
- https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html



In [46]:
# Selecting a single column with [] yields a Series (attribute access, df.A, is equivalent):
df["A"]

2013-01-01    1.764052
2013-01-02    1.867558
2013-01-03   -0.103219
2013-01-04    0.761038
2013-01-05    1.494079
2013-01-06   -2.552990
Freq: D, Name: A, dtype: float64

In [47]:
# Check the result type: a single-column selection is a Series, not a DataFrame.
type(df["A"])

pandas.core.series.Series

In [48]:
# Slicing inside [] works on rows, by position; the stop position is exclusive,
# so 0:1 keeps only the first row:
df[0:1]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893


In [49]:
# A row slice stays a DataFrame, even when it contains a single row.
type(df[0:1])

pandas.core.frame.DataFrame

In [50]:
# Row slicing also accepts index labels (here date strings); unlike positional
# slices, both endpoints are included:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-03,-0.103219,0.410599,0.144044,1.454274
2013-01-04,0.761038,0.121675,0.443863,0.333674


#### Selection by label

In [51]:
# Getting a cross section (one row, returned as a Series) using an index label:
df.loc[dates[0]]  # dates[0] is 2013-01-01

A    1.764052
B    0.400157
C    0.978738
D    2.240893
Name: 2013-01-01 00:00:00, dtype: float64

In [52]:
# The cross section is a Series; its index holds the column labels A-D.
type(df.loc[dates[0]])

pandas.core.series.Series

In [53]:
# Selecting on both axes by label: all rows (:) and the columns "A" and "B":
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,1.764052,0.400157
2013-01-02,1.867558,-0.977278
2013-01-03,-0.103219,0.410599
2013-01-04,0.761038,0.121675
2013-01-05,1.494079,-0.205158
2013-01-06,-2.55299,0.653619


#### Selection by position

In [54]:
# Select a row by integer position (zero-based, so 3 is the fourth row, 2013-01-04):
df.iloc[3]

A    0.761038
B    0.121675
C    0.443863
D    0.333674
Name: 2013-01-04 00:00:00, dtype: float64

#### Boolean indexing

In [55]:
# Boolean mask built from a single column: keep only the rows where "A" is positive:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2013-01-01,1.764052,0.400157,0.978738,2.240893
2013-01-02,1.867558,-0.977278,0.950088,-0.151357
2013-01-04,0.761038,0.121675,0.443863,0.333674
2013-01-05,1.494079,-0.205158,0.313068,-0.854096


---
### 1.3 Setting Data

https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#setting

In [56]:
# A Series indexed by six consecutive days starting 2013-01-02, i.e. shifted by one
# day relative to df's index. It is assigned as a new column in the next cell to
# show that pandas aligns data on the index.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range(start="20130102", periods=6),
)
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [57]:
# Assigning a Series as a new column aligns on the index: 2013-01-01 has no
# counterpart in s1 and becomes NaN, while s1's extra label 2013-01-07 is dropped.
# Note that this modifies df in place.
df["F"] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.764052,0.400157,0.978738,2.240893,
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,1.0
2013-01-03,-0.103219,0.410599,0.144044,1.454274,2.0
2013-01-04,0.761038,0.121675,0.443863,0.333674,3.0
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,4.0
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,5.0


In [58]:
# Setting a single value by (row label, column label) with .at; modifies df in place:
df.at[dates[0], "A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.400157,0.978738,2.240893,
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,1.0
2013-01-03,-0.103219,0.410599,0.144044,1.454274,2.0
2013-01-04,0.761038,0.121675,0.443863,0.333674,3.0
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,4.0
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,5.0


In [59]:
#Setting values by position:
df.iat[0, 1] = 0 
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.978738,2.240893,
2013-01-02,1.867558,-0.977278,0.950088,-0.151357,1.0
2013-01-03,-0.103219,0.410599,0.144044,1.454274,2.0
2013-01-04,0.761038,0.121675,0.443863,0.333674,3.0
2013-01-05,1.494079,-0.205158,0.313068,-0.854096,4.0
2013-01-06,-2.55299,0.653619,0.864436,-0.742165,5.0


In [60]:
# Setting a whole column by assigning a NumPy array of matching length.
# NOTE(review): the output recorded below shows D as integers. Newer pandas
# versions (2.0+) appear to write `.loc[:, col] = ...` in place and keep the
# column's existing float64 dtype, so a re-run may display 5.0 instead — confirm
# against the pandas version used for this course.
df.loc[:, "D"] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.978738,5,
2013-01-02,1.867558,-0.977278,0.950088,5,1.0
2013-01-03,-0.103219,0.410599,0.144044,5,2.0
2013-01-04,0.761038,0.121675,0.443863,5,3.0
2013-01-05,1.494079,-0.205158,0.313068,5,4.0
2013-01-06,-2.55299,0.653619,0.864436,5,5.0


---
### 1.4 Viewing data
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#viewing-data

In [61]:
# View the top rows of the frame; n is the number of rows to show
# (the next cell shows the bottom rows with tail):
df.head(n=2)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.978738,5,
2013-01-02,1.867558,-0.977278,0.950088,5,1.0


In [62]:
# View the last three rows of the frame:
df.tail(n=3)

Unnamed: 0,A,B,C,D,F
2013-01-04,0.761038,0.121675,0.443863,5,3.0
2013-01-05,1.494079,-0.205158,0.313068,5,4.0
2013-01-06,-2.55299,0.653619,0.864436,5,5.0


In [63]:
# Display the row index (the column labels follow in the next cell):
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [64]:
# Column labels, including the "F" column added in section 1.3:
df.columns

Index(['A', 'B', 'C', 'D', 'F'], dtype='object')

In [65]:
# DataFrame.to_numpy() gives a NumPy representation of the underlying data
# (here a single float array, with the missing value shown as nan):
df.to_numpy()

array([[ 0.        ,  0.        ,  0.97873798,  5.        ,         nan],
       [ 1.86755799, -0.97727788,  0.95008842,  5.        ,  1.        ],
       [-0.10321885,  0.4105985 ,  0.14404357,  5.        ,  2.        ],
       [ 0.76103773,  0.12167502,  0.44386323,  5.        ,  3.        ],
       [ 1.49407907, -0.20515826,  0.3130677 ,  5.        ,  4.        ],
       [-2.55298982,  0.6536186 ,  0.8644362 ,  5.        ,  5.        ]])

In [66]:
# describe() shows a quick statistical summary per column; count excludes
# missing values (hence 5 for "F"):
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,5.0
mean,0.244411,0.000576,0.615706,5.0,3.0
std,1.579191,0.567058,0.360293,0.0,1.581139
min,-2.55299,-0.977278,0.144044,5.0,1.0
25%,-0.077414,-0.153869,0.345767,5.0,2.0
50%,0.380519,0.060838,0.65415,5.0,3.0
75%,1.310819,0.338368,0.928675,5.0,4.0
max,1.867558,0.653619,0.978738,5.0,5.0


In [67]:
# Transposing swaps rows and columns: the dates become columns, A-F become the index:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.0,1.867558,-0.103219,0.761038,1.494079,-2.55299
B,0.0,-0.977278,0.410599,0.121675,-0.205158,0.653619
C,0.978738,0.950088,0.144044,0.443863,0.313068,0.864436
D,5.0,5.0,5.0,5.0,5.0,5.0
F,,1.0,2.0,3.0,4.0,5.0


In [68]:
# Sorting by axis labels: order the columns by name, descending (F, D, C, B, A):
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,F,D,C,B,A
2013-01-01,,5,0.978738,0.0,0.0
2013-01-02,1.0,5,0.950088,-0.977278,1.867558
2013-01-03,2.0,5,0.144044,0.410599,-0.103219
2013-01-04,3.0,5,0.443863,0.121675,0.761038
2013-01-05,4.0,5,0.313068,-0.205158,1.494079
2013-01-06,5.0,5,0.864436,0.653619,-2.55299


In [69]:
# Sorting rows by the values of column "B" (ascending by default):
df.sort_values(by="B")

Unnamed: 0,A,B,C,D,F
2013-01-02,1.867558,-0.977278,0.950088,5,1.0
2013-01-05,1.494079,-0.205158,0.313068,5,4.0
2013-01-01,0.0,0.0,0.978738,5,
2013-01-04,0.761038,0.121675,0.443863,5,3.0
2013-01-03,-0.103219,0.410599,0.144044,5,2.0
2013-01-06,-2.55299,0.653619,0.864436,5,5.0


---
### 1.5 Operations
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#operations

#### Stats

In [70]:
# Descriptive statistic per column; missing values are skipped
# ("F" averages its five non-NaN entries to 3.0):
df.mean()

A    0.244411
B    0.000576
C    0.615706
D    5.000000
F    3.000000
dtype: float64

In [71]:
# Same statistic along the other axis: one mean per row, computed across the columns.
df.mean(axis=1)

2013-01-01    1.494684
2013-01-02    1.568074
2013-01-03    1.490285
2013-01-04    1.865315
2013-01-05    2.120398
2013-01-06    1.793013
Freq: D, dtype: float64


#### Apply
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.apply.html

In [72]:
# apply calls the given function on each column; np.cumsum yields running totals.
# In the recorded output the leading NaN in "F" stays NaN and the sum continues after it.
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.978738,5,
2013-01-02,1.867558,-0.977278,1.928826,10,1.0
2013-01-03,1.764339,-0.566679,2.07287,15,3.0
2013-01-04,2.525377,-0.445004,2.516733,20,6.0
2013-01-05,4.019456,-0.650163,2.829801,25,10.0
2013-01-06,1.466466,0.003456,3.694237,30,15.0


In [73]:
# apply also accepts any callable: here it receives each column as a Series and
# returns that column's range (max - min).
df.apply(lambda column: column.max() - column.min())

A    4.420548
B    1.630896
C    0.834694
D    0.000000
F    4.000000
dtype: float64

#### Value counts (Histogramming)

https://pandas.pydata.org/pandas-docs/stable/user_guide/basics.html#basics-discretization

In [74]:
# value_counts() is available both as a Series method and as a top-level pandas
# function; it computes a histogram of a 1D array of values and can also be used
# on regular arrays.
# Draw ten random integers from 0..6 (the upper bound 7 is exclusive) to count:
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    4
1    2
2    0
3    0
4    4
5    5
6    5
7    6
8    0
9    4
dtype: int64

In [75]:
# Frequency of each distinct value, most frequent first:
s.value_counts()

4    3
0    3
5    2
6    1
2    1
dtype: int64

In [76]:
# Count values in DataFrame columns; the NaN in "F" does not appear in the result:
df['F'].value_counts()

5.0    1
4.0    1
3.0    1
2.0    1
1.0    1
Name: F, dtype: int64

In [77]:
# "D" holds a single distinct value, so there is one bucket containing all six rows.
df['D'].value_counts()

5    6
Name: D, dtype: int64

---
### 1.6 Missing data
https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html#missing-data

pandas primarily uses the value `np.nan` to represent missing data. It is by default not included in computations.

In [78]:
# Reindexing lets you change/add/delete labels on an axis and returns a copy of
# the data: keep only the first four dates and append a new column "E", which
# starts out entirely missing.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Fill "E" for the first two dates only (label slices include both endpoints),
# leaving missing values in the remaining rows.
df1.loc[dates[0] : dates[1], "E"] = 1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.978738,5,,1.0
2013-01-02,1.867558,-0.977278,0.950088,5,1.0,1.0
2013-01-03,-0.103219,0.410599,0.144044,5,2.0,
2013-01-04,0.761038,0.121675,0.443863,5,3.0,


In [79]:
# Drop every row that contains at least one missing value. The result is a new
# frame; df1 itself still has all four rows (see the next cell's output).
df1.dropna(how="any")

Unnamed: 0,A,B,C,D,F,E
2013-01-02,1.867558,-0.977278,0.950088,5,1.0,1.0


In [80]:
# Replace every missing value with the constant 5 (returns a new frame):
df1.fillna(5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.0,0.978738,5,5.0,1.0
2013-01-02,1.867558,-0.977278,0.950088,5,1.0,1.0
2013-01-03,-0.103219,0.410599,0.144044,5,2.0,5.0
2013-01-04,0.761038,0.121675,0.443863,5,3.0,5.0


---
## 2. Working with Text Data: String Methods, Splitting, Concat, Substrings, Indexing
- see https://pandas.pydata.org/pandas-docs/stable/user_guide/text.html#method-summary


#### Various string methods, for example:

In [81]:
# Element-wise string methods live under the .str accessor; the missing value
# (NaN) passes through unchanged.
s = pd.Series(
    data=["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"],
)
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

---
## 3. Input/Output 

https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html

- Read and output CSV, XML, HTML, SQL, JSON, and LaTeX data
- e.g. `read_csv()`, `to_csv()`


In [82]:
# Load the novel as a one-column DataFrame with one text line per row.
#
# The file is read directly instead of via pd.read_csv(..., sep="\n"): pandas 1.3+
# rejects the line terminator as a column separator (ValueError), and the CSV
# parser would also apply quote handling to prose, which can merge or split lines.
#
# NOTE: this rebinds `df`, replacing the numeric demo frame used in sections 1.x.
with open('../wahlverwandschaften.txt', encoding='utf-8') as text_file:
    # Strip the trailing newline and skip blank lines, mirroring read_csv's
    # default skip_blank_lines=True.
    text_lines = [line.rstrip('\n') for line in text_file if line.strip()]

df = pd.DataFrame({'lines': text_lines})
df.head()

Unnamed: 0,lines
0,Die Wahlverwandtschaften
1,Ein Roman
2,von Johann Wolfgang von Goethe
3,Erster Teil
4,Erstes Kapitel
