In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão de Python Neste Jupyter Notebook:', python_version())

# usaremos o filtro 'warning' para deixar mais limpo.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

Versão de Python Neste Jupyter Notebook: 3.10.5


#### 61. How to know the maximum possible correlation value of each column against other columns?

In [2]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), 
                  columns = list('pqrstuvwxy'), 
                  index = list('abcdefgh'))

df

Unnamed: 0,p,q,r,s,t,u,v,w,x,y
a,30,63,26,23,93,60,50,38,81,85
b,31,54,61,97,2,8,89,54,94,43
c,95,79,42,85,5,62,67,96,86,92
d,7,90,56,94,97,60,16,19,86,77
e,38,29,15,97,53,65,48,9,85,4
f,85,95,46,2,97,51,41,37,67,76
g,37,77,87,20,63,57,36,7,47,97
h,12,47,6,9,51,52,23,9,48,86


In [3]:
# Solution
abs_corrmat = np.abs(df.corr())
max_corr = abs_corrmat.apply(lambda x: sorted(x)[-2])
print('Maximum Correlation possible for each column: ',
      np.round(max_corr.tolist(), 2))

Maximum Correlation possible for each column:  [0.69 0.67 0.59 0.8  0.74 0.63 0.74 0.69 0.8  0.67]


#### 62. How to create a column containing the minimum by maximum of each row?

In [4]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [5]:
# Solution 1
min_by_max = df.apply(lambda x: np.min(x)/np.max(x), 
                      axis = 1)

In [6]:
# Solution 2
min_by_max = np.min(df, axis=1)/np.max(df, axis = 1)

#### 63. How to create a column that contains the penultimate value in each row?

In [7]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [8]:
# Solution
out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
df['penultimate'] = out
print(df)

    0   1   2   3   4   5   6   7   8   9  penultimate
0  60  11  18  63  72  41  31  46  46  31           63
1  16   9  67  63  30  88   5  32  12  35           67
2  38  10   5  88  97  68  94  69  85  39           94
3  82  48  42  33  28  61  34  26  90  64           82
4  21  16  75  66  65  60  14  40  82  82           75
5  71  61  85  40  51  80  68  94  57  95           94
6  30  31  34  64  92  22  22  80  48  95           92
7  73  56  49  16  13  56  97  18  24  51           73


#### 64. How to normalize all columns in a dataframe?

In [9]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [10]:
# Solution Q1
out1 = df.apply(lambda x: ((x - x.mean())/x.std()).round(2))

print('Solution Q1\n',out1)

Solution Q1
       0     1     2     3     4     5     6     7     8     9
0 -1.40  0.25  0.51  0.45  0.66  0.72 -1.44  0.45  0.08 -0.06
1 -1.16 -1.67 -1.13 -1.03  0.08 -0.56  0.57 -1.05  0.11 -1.20
2  0.88  0.93 -1.09  1.03 -0.20  0.89  0.76 -1.25 -0.60  0.25
3 -0.95  0.10  1.07  0.64 -0.75 -1.40 -0.91  0.76 -1.95 -1.60
4  0.46 -0.35 -0.61 -1.54 -0.91  0.89 -0.53 -0.22 -0.19  1.33
5  0.55 -1.03 -0.77  1.09  1.40 -1.45  0.63  1.18  0.79 -0.21
6  0.46  0.40  1.27  0.18  1.11  0.20 -0.56 -0.97  1.43  0.90
7  1.15  1.38  0.75 -0.82 -1.39  0.70  1.48  1.09  0.34  0.59


In [11]:
# Solution Q2
out2 = df.apply(lambda x: ((x.max() - x)/(x.max() - x.min())).round(2))

print('Solution Q2\n', out2)

Solution Q2
       0     1     2     3     4     5     6     7     8     9
0  1.00  0.37  0.32  0.24  0.26  0.07  1.00  0.30  0.40  0.47
1  0.91  1.00  1.00  0.80  0.47  0.62  0.31  0.92  0.39  0.86
2  0.11  0.15  0.98  0.02  0.57  0.00  0.25  1.00  0.60  0.37
3  0.82  0.42  0.08  0.17  0.77  0.98  0.82  0.17  1.00  1.00
4  0.27  0.57  0.78  1.00  0.83  0.00  0.69  0.57  0.48  0.00
5  0.24  0.79  0.85  0.00  0.00  1.00  0.29  0.00  0.19  0.53
6  0.27  0.32  0.00  0.34  0.10  0.29  0.70  0.89  0.00  0.15
7  0.00  0.00  0.22  0.72  1.00  0.08  0.00  0.03  0.32  0.25


#### 65. How to compute the correlation of each row with the suceeding row?

In [12]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))

In [13]:
# Solution
[df.iloc[i].corr(df.iloc[i+1]).round(2) for i in range(df.shape[0])[:-1]]

[-0.18, 0.14, 0.29, 0.27, 0.8, -0.3, 0.15]

#### 66. How to replace both the diagonals of dataframe with 0?

In [14]:
# Input
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))

In [15]:
# Solution
for i in range(df.shape[0]):
    df.iat[i, i] = 0
    df.iat[df.shape[0]-i-1, i] = 0

#### 67. How to get the particular group of a groupby dataframe by key?

In [16]:
# Input
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3, 
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})

df_grouped = df.groupby(['col1'])


In [17]:
# Solution 1
df_grouped.get_group('apple')

Unnamed: 0,col1,col2,col3
0,apple,0.885077,2
3,apple,0.482532,8
6,apple,0.479548,2


In [18]:
# Solution 2
for i, dff in df_grouped:
    if i == 'apple':
        print(dff)

    col1      col2  col3
0  apple  0.885077     2
3  apple  0.482532     8
6  apple  0.479548     2


68. How to get the n’th largest value of a column when grouped
by another column?

In [19]:
# Input
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'taste': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

print(df)

    fruit     taste  price
0   apple  0.119937      2
1  banana  0.486014     12
2  orange  0.563867      7
3   apple  0.234303     12
4  banana  0.074819      4
5  orange  0.665385      7
6   apple  0.750449      3
7  banana  0.364837      3
8  orange  0.681713      6


In [20]:
# Solution
df_grpd = df['taste'].groupby(df.fruit)
df_grpd.get_group('banana').sort_values().iloc[-2]

0.3648371950951611

#### 69. How to compute grouped mean on pandas dataframe and keep the grouped column as another column (not index)?

In [21]:
# Input
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

In [22]:
# Solution
out = df.groupby('fruit', as_index=False)['price'].mean()
print(out)

    fruit      price
0   apple   7.000000
1  banana  10.666667
2  orange   4.000000


#### 70. How to join two dataframes by 2 columns so they have only the common rows?

In [23]:
# Input
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})


In [24]:
# Solution
pd.merge(df1, df2, how='inner', 
         left_on=['fruit', 'weight'], 
         right_on=['pazham', 'kilo'],
         suffixes=['_left', '_right'])

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,8,apple,high,12
1,apple,high,0,apple,high,12
2,apple,high,4,apple,high,12
3,orange,low,2,orange,low,1
4,orange,low,7,orange,low,1
5,orange,low,8,orange,low,1


#### 71. How to remove rows from a dataframe that are present in another dataframe?

In [25]:
# Input
df1 = pd.DataFrame({'fruit': ['apple', 'orange', 'banana'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.arange(9)})

df2 = pd.DataFrame({'fruit': ['apple', 'orange', 'pine'] * 2,
                    'weight': ['high', 'medium'] * 3,
                    'price': np.arange(6)})

In [26]:
# Solution
print(df1[~df1.isin(df2).all(1)])

    fruit  weight  price
2  banana     low      2
3   apple    high      3
4  orange  medium      4
5  banana     low      5
6   apple    high      6
7  orange  medium      7
8  banana     low      8


#### 72. How to get the positions where values of two columns match?

In [27]:
# Input
df = pd.DataFrame({'fruit1': np.random.choice(['apple', 'orange', 'banana'], 10),
                   'fruit2': np.random.choice(['apple', 'orange', 'banana'], 10)})

In [28]:
# Solution
np.where(df.fruit1 == df.fruit2)

(array([0, 3, 7], dtype=int64),)

#### 73. How to create lags and leads of a column in a dataframFeeed?back

In [29]:
# Input
df = pd.DataFrame(np.random.randint(1, 100, 20).reshape(-1, 4), 
                  columns = list('abcd'))

In [30]:
# Solution
df['a_lag1'] = df['a'].shift(1)
df['b_lead1'] = df['b'].shift(-1)

print(df)

    a   b   c   d  a_lag1  b_lead1
0  33  98  71  23     NaN     26.0
1  45  26  25  11    33.0      4.0
2  85   4  20  73    45.0     35.0
3  73  35  20  39    85.0     64.0
4  79  64  90  31    73.0      NaN


#### 74. How to get the frequency of unique values in the entire dataframe?

In [31]:
# Input
df = pd.DataFrame(np.random.randint(1, 10, 20).reshape(-1, 4), 
                  columns = list('abcd'))

In [32]:
# Solution
pd.value_counts(df.values.ravel())

7    6
2    3
1    2
4    2
3    2
9    2
6    1
5    1
8    1
dtype: int64

#### 75. How to split a text column into two separate columns?

In [33]:
# Input
df = pd.DataFrame(["STD, City State",
                   "33, Kolkata West Bengal",
                   "44, Chennai Tamil Nadu",
                   "40, Hyderabad Telengana",
                   "80, Bangalore Karnataka"],
                  columns = ['row'])

In [34]:
# Solution
df_out = df.row.str.split(',|\t', expand=True)

In [35]:
# Make first row as header
new_header = df_out.iloc[0]
df_out = df_out[1:]
df_out.columns = new_header

print(df_out)

0 STD            City State
1  33   Kolkata West Bengal
2  44    Chennai Tamil Nadu
3  40   Hyderabad Telengana
4  80   Bangalore Karnataka


In [36]:
%reload_ext watermark
%watermark -a "Caique Miranda" -gu "caiquemiranda" -iv

Author: Caique Miranda

Github username: caiquemiranda

pandas: 1.4.3
numpy : 1.23.0

