# General

## Pandas

### Import

In [1]:
import numpy as np
import pandas as pd

*Set of data used*

In [2]:
# Sample frame used by every cell below: three rows labelled 10/20/30 and
# three columns holding text, integers and floats respectively.
df = pd.DataFrame(
    {
        'col1': ['A', 'B', 'C'],
        'col2': [1, 2, 3],
        'col3': [1.1, 2.2, 3.3],
    },
    columns=['col1', 'col2', 'col3'],
    index=[10, 20, 30],
)

# Persist it in both formats so the loading cells have files to read.
df.to_csv('data.csv')
df.to_pickle('data.pickle')

### Get data

From CSV

In [3]:
# data.csv was written with the index as its first column, so read that
# column back as the index; otherwise it shows up as an extra 'Unnamed: 0'
# data column and the rows get a fresh 0..n-1 index.
df = pd.read_csv('data.csv', index_col=0)

From pickle

In [4]:
# Load the frame back from the pickle file written in the setup cell.
# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.
df = pd.read_pickle('data.pickle')

### See data

#### Head

In [5]:
# First 2 rows of the frame.
df.head(2)

Unnamed: 0,col1,col2,col3
10,A,1,1.1
20,B,2,2.2


#### Tail

In [6]:
# Last row of the frame.
df.tail(1)

Unnamed: 0,col1,col2,col3
30,C,3,3.3


#### A column

In [7]:
# A single column name in brackets returns that column as a Series.
df['col1']

10    A
20    B
30    C
Name: col1, dtype: object

#### Multiple rows (slice by position)

In [8]:
# A plain slice on the frame selects rows by position (here the first three).
df[0:3]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
20,B,2,2.2
30,C,3,3.3


In [9]:
# Every second row, starting from the first one.
df[::2]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
30,C,3,3.3


In [10]:
# All rows from position 1 onwards (the first row is skipped).
df[1:]

Unnamed: 0,col1,col2,col3
20,B,2,2.2
30,C,3,3.3


#### Row at one index

In [11]:
# A single label returns the row as a Series named after that label.
df.loc[10]

col1      A
col2      1
col3    1.1
Name: 10, dtype: object

Keep dataframe structure

In [12]:
# Wrapping the label in a list keeps the result a one-row DataFrame.
df.loc[[10]]

Unnamed: 0,col1,col2,col3
10,A,1,1.1


#### Rows at multiple index labels

In [13]:
# One row per label in the list.
df.loc[[10, 30]]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
30,C,3,3.3


#### Rows in a range of index labels

In [14]:
# Label slice: unlike positional slices, both end labels (10 and 20) are included.
df.loc[10:20]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
20,B,2,2.2


In [15]:
# Open-ended label slice: every row from label 20 to the end.
df.loc[20:]

Unnamed: 0,col1,col2,col3
20,B,2,2.2
30,C,3,3.3


#### Rows by index label and columns

In [16]:
# Row labels come first, column names second.
df.loc[[10, 30], ['col2', 'col3']]

Unnamed: 0,col2,col3
10,1,1.1
30,3,3.3


In [17]:
# Label slices on both axes: rows from label 10 onwards, columns from 'col2' onwards.
df.loc[10:, 'col2':]

Unnamed: 0,col2,col3
10,1,1.1
20,2,2.2
30,3,3.3


#### Row by position

In [18]:
# Position 2 is the third row (label 30); it is returned as a Series.
df.iloc[2]

col1      C
col2      3
col3    3.3
Name: 30, dtype: object

#### Rows by position

In [19]:
# Every row from position 1 to the end.
df.iloc[1:]

Unnamed: 0,col1,col2,col3
20,B,2,2.2
30,C,3,3.3


In [20]:
# Rows at positions 0 and 2.
df.iloc[[0, 2]]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
30,C,3,3.3


#### Rows by position and columns

In [21]:
# Row at position 2, columns at positions 1 and 2; a scalar row position
# makes the result a Series.
df.iloc[2, [1, 2]]

col2      3
col3    3.3
Name: 30, dtype: object

In [22]:
# Single cell: row position 2, column position 2 -> a scalar value.
df.iloc[2, 2]

3.2999999999999998

In [23]:
# Lists on both axes keep the result a DataFrame, even for a single row.
df.iloc[[2], [1, 2]]

Unnamed: 0,col2,col3
30,3,3.3


In [24]:
# Positional slices on both axes: every second row, columns from position 1 onwards.
df.iloc[::2, 1:]

Unnamed: 0,col2,col3
10,1,1.1
30,3,3.3


#### Values of a Series

In [41]:
# The column's data as a NumPy array, without the index.
df['col1'].values

array(['A', 'B', 'C'], dtype=object)

#### Index of a Series

In [42]:
# The index labels attached to the column (the same labels as the frame's rows).
df['col1'].index

Int64Index([10, 20, 30], dtype='int64')

### Iterate over data

#### Row by row (as (index, Series) pairs)

In [25]:
# iterrows() yields one (index label, row Series) tuple per row; here the
# tuple is kept whole and its two parts are read by position.
for pair in df.iterrows():
    print('First row index: ' + str(pair[0]))
    print('First row content: ')
    print(pair[1])
    print('First row col1 content: ' + pair[1]['col1'])
    break  # the first row is enough for the demonstration

First row index: 10
First row content: 
col1      A
col2      1
col3    1.1
Name: 10, dtype: object
First row col1 content: A


In [26]:
# Same iteration, with the (index label, row Series) tuple unpacked in the
# loop header.
for label, content in df.iterrows():
    print('First row index: ' + str(label))
    print('First row content: ')
    print(content)
    print('First row col1 content: ' + content['col1'])
    break  # the first row is enough for the demonstration

First row index: 10
First row content: 
col1      A
col2      1
col3    1.1
Name: 10, dtype: object
First row col1 content: A


#### Row by row (with tuples)

In [27]:
# itertuples() yields one named tuple per row: the index label is exposed as
# .Index and each column as an attribute of the same name.
for record in df.itertuples():
    print('First row index: ' + str(record.Index))
    print('First row col1 content: ' + record.col1)
    print('First row col2 content: ' + str(record.col2))
    print('First row col3 content: ' + str(record.col3))
    break  # the first row is enough for the demonstration

First row index: 10
First row col1 content: A
First row col2 content: 1
First row col3 content: 1.1


In [28]:
# Same iteration, unpacking each row tuple into the index label followed by
# the three column values.
for label, first, second, third in df.itertuples():
    print('First row index: ' + str(label))
    print('First row col1 content: ' + first)
    print('First row col2 content: ' + str(second))
    print('First row col3 content: ' + str(third))
    break  # the first row is enough for the demonstration

First row index: 10
First row col1 content: A
First row col2 content: 1
First row col3 content: 1.1


#### Column by column

In [29]:
# items() yields one (column name, column Series) tuple per column. It is the
# supported spelling: the older iteritems() alias is no longer available in
# pandas 2.x.
for item in df.items():
    print('First column name: ' + item[0])
    print('First column content:')
    print(item[1])
    # item[1] is the column Series, so [10] looks up the value at index label 10.
    print('First column, first value at index 10: ' + item[1][10])
    break  # the first column is enough for the demonstration

First column name: col1
First column content:
10    A
20    B
30    C
Name: col1, dtype: object
First column, first value at index 10: A


### Select data

#### Select with function and loc

In [30]:
# .loc calls the function with the frame itself and uses the returned boolean
# Series as the row filter.
df.loc[lambda frame: frame.col2 > 1]

Unnamed: 0,col1,col2,col3
20,B,2,2.2
30,C,3,3.3


In [31]:
def select_A(df):
    """Return a boolean Series that is True where ``col1`` equals "A"."""
    is_a = df['col1'] == "A"
    return is_a

# A named function works like a lambda here: .loc calls it with the frame
# and filters the rows with the boolean Series it returns.
df.loc[select_A]

Unnamed: 0,col1,col2,col3
10,A,1,1.1


#### Select with boolean condition

In [32]:
# Boolean mask: True for the rows whose col2 is greater than 2.
# (The mask can also be written inline: df[df['col2'] > 2].)
selected_index = df.col2 > 2
df.loc[selected_index]

Unnamed: 0,col1,col2,col3
30,C,3,3.3


In [33]:
# Build the mask one row at a time with an arbitrary Python condition: keep a
# row when col1 starts with "A" and col2 equals 1, or when col2 equals 3.
# (The list can also be written inline inside df[...].)
selected_index = [
    (text.startswith("A") and number == 1) or number == 3
    for _, text, number in df[['col1', 'col2']].itertuples()
]
df[selected_index]

Unnamed: 0,col1,col2,col3
10,A,1,1.1
30,C,3,3.3


### Change values

In [36]:
# New column computed element-wise from two existing columns.
df['col4'] = df['col2'] + df['col3']
df.head()

Unnamed: 0,col1,col2,col3,col4
10,A,1,1.1,2.1
20,B,2,2.2,4.2
30,C,3,3.3,6.3


In [40]:
# axis=1 makes apply() call the function once per row; each call receives the
# row as a Series and its return value becomes that row's col4.
df['col4'] = df.apply(lambda r: r['col2'] + r['col3'] + 1, axis=1)
df.head()

Unnamed: 0,col1,col2,col3,col4
10,A,1,1.1,3.1
20,B,2,2.2,5.2
30,C,3,3.3,7.3


In [44]:
# Overwrite col4 with ten times its current values.
# Re-running this cell multiplies by 10 again, because it reads its own output.
df['col4'] = 10 * df.col4
df.head()

Unnamed: 0,col1,col2,col3,col4
10,A,1,1.1,310.0
20,B,2,2.2,520.0
30,C,3,3.3,730.0


In [52]:
def set_some_values(row):
    """Return 'Ok' when ``row.col2`` is greater than 1, otherwise NaN.

    Intended for ``DataFrame.apply(..., axis=1)``: ``row`` is one row of the
    frame and must expose a ``col2`` attribute.
    """
    if row.col2 > 1:
        return 'Ok'
    # np.nan is the canonical spelling; the upper-case legacy aliases are not
    # guaranteed to exist on newer NumPy releases.
    return np.nan

# Call set_some_values once per row (axis=1) and store the results in col5.
df['col5'] = df.apply(set_some_values, axis=1)
df.head()

Unnamed: 0,col1,col2,col3,col4,col5
10,A,1,1.1,310.0,
20,B,2,2.2,520.0,Ok
30,C,3,3.3,730.0,Ok


### Check values

#### Count of each unique value in a Series

In [54]:
# Number of occurrences of each distinct value; dropna=False also counts NaN.
df['col5'].value_counts(dropna=False)

Ok     2
NaN    1
Name: col5, dtype: int64

#### Check which values of a column are null

In [55]:
# Boolean Series: True where the value is missing, False elsewhere.
df['col5'].isnull()

10     True
20    False
30    False
Name: col5, dtype: bool

#### Size of the dataframe

*(Number of rows, Number of columns)*

In [57]:
# Tuple of (number of rows, number of columns).
df.shape

(3, 5)

#### Check whether a column's values are unique

In [62]:
# True when no value appears more than once in the column.
df['col1'].is_unique

True

### Extra

In [64]:
# Walk the groups of col5, numbering them from 1. enumerate(..., start=1)
# provides the counter directly, so no extra import is needed here.
# groupby() leaves out rows whose key is NaN by default, which is why only
# the 'Ok' group is produced.
for i, (name, group) in enumerate(df.groupby('col5'), start=1):
    print(i)
    print(name)
    print(group)

1
Ok
   col1  col2  col3   col4 col5
20    B     2   2.2  520.0   Ok
30    C     3   3.3  730.0   Ok
