In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#### [df to numpy array](#0)
#### [return implicit index by the explicit one](#1)
#### [return index of the value in a Series obj](#2)
#### [return explicit index of the value in a Series obj](#3)
#### [rename columns](#4)
#### [rename rows](#5)
#### [transpose df](#6)
#### [add a column to the df](#6.1)
#### [delete a row or column](#7)
#### [create list obj out of the column or a row](#8)
#### [apply a function to a row, column or DataFrame obj](#9)
#### [fill missing values](#10)
#### [descriptive statistics on the DataFrame](#11)
#### [descriptive statistics on the DataFrame column](#12)
#### [descriptive statistics on the features which contain string values](#13)
#### [count value frequency in a column](#14)
#### [central tendency measures (mean, median, mode, standard deviation) on a column](#15)
#### [delete a column from the DataFrame](#16)
#### [delete a row from the DataFrame](#17)
#### [check up if DataFrame has null values](#18)


### <a id="0">df to numpy array</a>


In [2]:
# Small demo table: four named columns, three rows on the default 0..2 index.
data = pd.DataFrame({
    "bacteria": [100, 50, 12],
    "archaea": [95, 78, 23],
    "viruses": [75, 89, 34],
    "fungi": [44, 100, 45],
})

In [3]:
# .values exposes the frame's data as a NumPy ndarray; row and column labels are not part of it.
# On pandas >= 0.24 the documented, preferred spelling is data.to_numpy().
data.values

array([[100,  95,  75,  44],
       [ 50,  78,  89, 100],
       [ 12,  23,  34,  45]], dtype=int64)



### <a id="1">return implicit index by explicit one</a>

In [4]:
data.columns.get_loc("fungi")

3

In [5]:
data.index.get_loc(0)

0

### <a id="2">return index of the value in a Series obj</a>

- return implicit index

In [6]:
# Ten evenly spaced floats (100.0 ... 1000.0) labelled with the integers 0..9.
s1 = pd.Series(
    data=np.linspace(start=100, stop=1000, num=10),
    index=[*range(10)],
)

In [7]:
# Wrapping the Series in pd.Index turns its *values* into index labels, so get_loc()
# searches the values and returns the integer position of 700 (KeyError if it is absent).
pd.Index(s1).get_loc(700)

6

In [8]:
s1

0     100.0
1     200.0
2     300.0
3     400.0
4     500.0
5     600.0
6     700.0
7     800.0
8     900.0
9    1000.0
dtype: float64



### <a id="3">return explicit index in a Series obj</a>

In [9]:
# Same ten values as before, now labelled with the letters 'a'..'j'.
s1 = pd.Series(
    np.linspace(100, 1000, 10),
    index=list("abcdefghij"),
)

In [10]:
s1[s1 == 1000].index

Index(['j'], dtype='object')

**or immediately get a single plain value: the first element of the list returned by tolist()**


In [11]:
s1[s1 == 1000].index.tolist()[0]  

'j'

**but if the value you are looking for may occur more than once in the Series, drop the `[0]` and keep the whole list returned by tolist()**

In [12]:
# Integer Series in which the values 1, 2 and 3 each occur twice, labelled 'a'..'j'.
s1 = pd.Series(
    [1, 2, 3] * 2 + [4, 5, 6, 7],
    index=list("abcdefghij"),
)

- select all the entries whose value == 3
- then take their indices
- and convert them to a list


In [13]:

s1[s1 == 3].index.tolist()

['c', 'f']

### <a id="4">rename columns</a>

- create dictionary old_col : new_col
- pass it to rename() method of df
- use inplace=True

In [14]:
# Mapping of current column name -> replacement column name.
d = dict(zip(
    ["bacteria", "archaea", "viruses", "fungi"],
    ["kingdom_1", "kingdom_2", "kingdom_3", "kingdom_4"],
))

In [15]:
# `d` already maps old column names to new ones, so it is passed to rename() as-is;
# rebuilding an identical dict with a comprehension adds nothing.
# inplace=True renames the columns on `data` itself (the call returns None).
data.rename(columns=d, inplace=True)

In [16]:
data

Unnamed: 0,kingdom_1,kingdom_2,kingdom_3,kingdom_4
0,100,95,75,44
1,50,78,89,100
2,12,23,34,45


**another way: just assign new values to the `columns` attribute of the df:**

In [17]:
data.columns = ["bacteria", "archae", "viruses", "fungi"]

In [18]:
data

Unnamed: 0,bacteria,archae,viruses,fungi
0,100,95,75,44
1,50,78,89,100
2,12,23,34,45


### <a id="5">rename rows</a>

In [19]:
data.rename(index={0:"row_1", 1:"row_2", 2:"row_3"}, inplace=True)

In [20]:
data

Unnamed: 0,bacteria,archae,viruses,fungi
row_1,100,95,75,44
row_2,50,78,89,100
row_3,12,23,34,45


**another way: assign new values to the `index` attribute of the df:**

In [21]:
data.index = ["row_1", "row_2", "row_3"]

In [22]:
data

Unnamed: 0,bacteria,archae,viruses,fungi
row_1,100,95,75,44
row_2,50,78,89,100
row_3,12,23,34,45


### <a id="6">transpose df</a>

In [23]:
data.T

Unnamed: 0,row_1,row_2,row_3
bacteria,100,50,12
archae,95,78,23
viruses,75,89,34
fungi,44,100,45


### <a id="6.1">add a column to the df</a>

In [24]:
data['protozoa'] = [10, 15, 20]
data

Unnamed: 0,bacteria,archae,viruses,fungi,protozoa
row_1,100,95,75,44,10
row_2,50,78,89,100,15
row_3,12,23,34,45,20


### <a id="7">delete a row or column</a>

In [25]:
data.drop(["fungi"], axis=1, inplace=True)

In [26]:
data

Unnamed: 0,bacteria,archae,viruses,protozoa
row_1,100,95,75,10
row_2,50,78,89,15
row_3,12,23,34,20


In [27]:
data.drop(["row_1"], axis=0, inplace=True)

In [28]:
data

Unnamed: 0,bacteria,archae,viruses,protozoa
row_2,50,78,89,15
row_3,12,23,34,20


### <a id="8">create list obj out of the column or a row</a>

In [29]:
data.loc[: , "bacteria"].tolist()

[50, 12]

In [30]:
data.loc["row_2"].tolist()

[50, 78, 89, 15]

### <a id="9">apply a function to a row, column or DataFrame obj</a>

In [31]:
data.loc['row_2'].apply(np.log2)

bacteria    5.643856
archae      6.285402
viruses     6.475733
protozoa    3.906891
Name: row_2, dtype: float64

In [32]:
data.loc[ : , 'bacteria'].apply(np.log2)

row_2    5.643856
row_3    3.584963
Name: bacteria, dtype: float64

In [33]:
# 2 x 5 frame holding the integers 1..10 in row-major order, with default labels.
df1 = pd.DataFrame([
    list(range(1, 6)),
    list(range(6, 11)),
])
df1

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,6,7,8,9,10


In [34]:
# With the default axis=0, apply() hands each column to the function as a Series;
# the multiplication is element-wise, so every cell ends up scaled by 100.
df1.apply(lambda x: x * 100)

Unnamed: 0,0,1,2,3,4
0,100,200,300,400,500
1,600,700,800,900,1000


### <a id="10">fill missing values</a>

In [35]:
# Columns 0 and 2 are fully populated; columns 1 and 3 contain only NaN.
df_nan = pd.DataFrame(
    [
        [first, np.nan, third, np.nan]
        for first, third in ((100, 0.2), (150, 0.7), (99, 0.8))
    ]
)

In [36]:
df_nan

Unnamed: 0,0,1,2,3
0,100,,0.2,
1,150,,0.7,
2,99,,0.8,


In [37]:
# Replace every NaN with 0. inplace=True modifies df_nan itself (the call returns None),
# so the frame shown by the previous cell no longer reflects what df_nan holds.
df_nan.fillna(0, inplace=True)
df_nan

Unnamed: 0,0,1,2,3
0,100,0.0,0.2,0.0
1,150,0.0,0.7,0.0
2,99,0.0,0.8,0.0


In [38]:
sns.get_dataset_names()



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


['anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'exercise',
 'flights',
 'fmri',
 'gammas',
 'iris',
 'mpg',
 'planets',
 'tips',
 'titanic']

In [39]:
titanic = sns.load_dataset('titanic')

### <a id="11">descriptive statistics on the DataFrame</a>

In [40]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [41]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### <a id="12">descriptive statistics on the DataFrame column</a>

In [42]:
titanic['age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: age, dtype: float64

### <a id="13">descriptive statistics on the features which contain string values</a>

In [43]:
titanic.describe(include=['O'])

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


### <a id="14">count value frequency in a column</a>

In [44]:
titanic['who'].value_counts()

man      537
woman    271
child     83
Name: who, dtype: int64

### <a id="15">central tendency measures on a column</a>

In [45]:
titanic['fare'].mean()

32.2042079685746

In [46]:
titanic['fare'].mode()

0    8.05
dtype: float64

In [47]:
titanic['fare'].median()

14.4542

In [48]:
titanic['fare'].std()

49.693428597180905

### <a id="16">delete a column from the DataFrame</a>

In [49]:
# axis=1 means the label 'who' is looked up among the columns.
# inplace=True mutates `titanic`, so running this cell a second time raises KeyError
# because the column has already been removed.
titanic.drop(['who'], axis=1, inplace=True)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,True,,Southampton,no,True


### <a id="17">delete a row from the DataFrame</a>

In [50]:
data

Unnamed: 0,bacteria,archae,viruses,protozoa
row_2,50,78,89,15
row_3,12,23,34,20


In [52]:
# axis defaults to 0, so 'row_2' is looked up in the row index.
# Without inplace=True, drop() returns a new DataFrame and leaves `data` unchanged.
data.drop(['row_2'])

Unnamed: 0,bacteria,archae,viruses,protozoa
row_3,12,23,34,20


### <a id="18">check up if DataFrame has null values</a>

```isnull()```
**returns a DataFrame of boolean values:**

In [53]:
titanic.isnull().head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,adult_male,deck,embark_town,alive,alone
0,False,False,False,False,False,False,False,False,False,False,True,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,True,False,False,False


```df.isnull().any()``` **tells, for each column, whether it contains any null values:**

In [54]:
titanic.isnull().any()

survived       False
pclass         False
sex            False
age             True
sibsp          False
parch          False
fare           False
embarked        True
class          False
adult_male     False
deck            True
embark_town     True
alive          False
alone          False
dtype: bool

In [55]:
titanic['survived'].isnull().any()

False

In [56]:
titanic['age'].isnull().any()

True

```df.isnull().sum()``` **gives the number of nulls in each column**

In [57]:
titanic.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [58]:
titanic['age'].isnull().sum()

177

```df.isnull().sum().sum()``` **gives the total number of nulls in the whole DataFrame**

In [59]:
titanic.isnull().sum().sum()

869

@@@@@@@@@@@@@@@@@@@@@@@

normalize data
1. import preprocessing
2. use scale function - the easiest way
3. to rescale every column into the range 0-1 (the column minimum becomes 0, the maximum becomes 1), use a `MinMaxScaler` object

In [None]:
from sklearn import preprocessing

In [None]:
# Three columns on very different scales (fractions, tens/hundreds, thousands);
# the frame is assembled row by row from the three column lists.
data_raw = pd.DataFrame(
    list(zip(
        [1, 0.5, 0.8, 0.1],
        [25, 29, 77, 150],
        [1500, 1200, 1000, 1987],
    ))
)

In [None]:
data_raw

In [None]:
data_raw.plot()
plt.show()

In [None]:
# preprocessing.scale standardizes each column (zero mean, unit variance by default) and
# returns a NumPy array, so the result is wrapped in a new DataFrame with default labels.
data_scaled = pd.DataFrame(preprocessing.scale(data_raw))

In [None]:
data_scaled

In [None]:
data_scaled.plot()
plt.show()

In [None]:
# MinMaxScaler with its default feature_range of (0, 1): per column, the minimum maps to 0
# and the maximum maps to 1.
min_max_scaler = preprocessing.MinMaxScaler()
# fit_transform learns each column's min/max from data_raw and applies the scaling in one step;
# the returned array is wrapped in a new DataFrame.
X_data = pd.DataFrame(min_max_scaler.fit_transform(data_raw))

In [None]:
X_data

In [None]:
X_data.plot()
plt.show()

@@@@@@@@@@@@@@@@@@@@@@

df from Series objs
1. create the Series objects
2. put them in a list
3. pass the list to the DataFrame constructor

so, the Series' names become the row labels (index) of the df

and the Series' indices become the column names of the df


In [None]:
# Name every Series when it is created: the name is what identifies it later,
# so setting it here saves relabelling afterwards.

ser1 = pd.Series(name="data_1", data=np.linspace(1, 5, num=5), index=[0, 1, 2, 3, 4])
ser2 = pd.Series(name="data_2", data=np.linspace(100, 1000, num=5), index=[0, 1, 2, 3, 4])
ser3 = pd.Series(name="data_3", data=np.linspace(-5, 0, num=5), index=[0, 1, 2, 3, 4])

In [None]:
some_series = [ser1, ser2, ser3]

In [None]:
data_123 = pd.DataFrame(some_series)

In [None]:
data_123

@@@@@@@@@@@@@@@@@@@@@@

mean and standard deviation of a column or a row

In [None]:
# Three rows holding 1..5 multiplied by 1, 10 and 100 respectively.
data_stat = pd.DataFrame(
    [[unit * factor for unit in range(1, 6)] for factor in (1, 10, 100)]
)

In [None]:
# The default axis=0 aggregates down the rows, producing one mean per column.
data_stat.mean()

In [None]:
# axis=1 is called 'columns' in the docs, which is easy to misread: the mean is taken
# across the columns of each row, producing one value per row -- as if the frame
# were squeezed into a single column.
data_stat.mean(axis=1)