In [1]:
# Versão da Linguagem Python
from platform import python_version
print('Versão de Python Neste Jupyter Notebook:', python_version())

# usaremos o filtro 'warning' para deixar mais limpo.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

Versão de Python Neste Jupyter Notebook: 3.10.5


In [2]:
link = 'https://raw.githubusercontent.com/caiquemiranda/pandas-exercises/main/data/cars.csv'
dir = 'data/cars.csv'

#### 41. How to count the number of missing values in each column?

In [3]:
cars = pd.read_csv(link)

In [4]:
n_missings_each_col = cars.apply(lambda x: x.isnull().sum())
n_missings_each_col.argmax()

23

#### 42. How to replace missing values of multiple numeric columns with the mean?

In [5]:
df_out = cars[['Min.Price', 
             'Max.Price']] = cars[['Min.Price',
                                 'Max.Price']].apply(lambda x: x.fillna(x.mean()))

print(df_out.head())

   Min.Price  Max.Price
0  12.900000  18.800000
1  29.200000  38.700000
2  25.900000  32.300000
3  17.118605  44.600000
4  17.118605  21.459091


#### 43. How to use apply function on existing columns with global variables as additional arguments?

In [6]:
d = {'Min.Price': np.nanmean,
     'Max.Price': np.nanmedian}

cars[['Min.Price', 'Max.Price']] = cars[['Min.Price', 'Max.Price']].apply(lambda x, d: x.fillna(d[x.name](x)), args = (d, ))

#### 44. How to select a specific column from a dataframe as a dataframe instead of a series?

In [7]:
rnd = pd.DataFrame(np.arange(20).reshape(-1, 5), 
                  columns = list('aeiou'))

In [8]:
type(rnd[['a']])
type(rnd.loc[:, ['a']])
type(rnd.iloc[:, [0]])

pandas.core.frame.DataFrame

In [9]:
type(rnd.a)
type(rnd['a'])
type(rnd.loc[:, 'a'])
type(rnd.iloc[:, 1])

pandas.core.series.Series

#### 45. How to change the order of columns of a dataframe?

In [10]:
# 01
rnd[list('uiaoe')]

Unnamed: 0,u,i,a,o,e
0,4,2,0,3,1
1,9,7,5,8,6
2,14,12,10,13,11
3,19,17,15,18,16


In [11]:
# 02
def switch_columns(df, col1 = None, col2 = None):
    colnames = df.columns.tolist()
    i1, i2 = colnames.index(col1), colnames.index(col2)
    colnames[i2], colnames[i1] = colnames[i1], colnames[i2]
    
    return df[colnames]

df1 = switch_columns(rnd, 'a', 'i')

In [12]:
# 03
rnd[sorted(rnd.columns)]

Unnamed: 0,a,e,i,o,u
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [13]:
# 04
rnd.sort_index(axis = 1, ascending = False, inplace = True)

#### 46. How to set the number of rows and columns displayed in the output?

In [14]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

cars

pd.describe_option()

compute.use_bottleneck : bool
    Use the bottleneck library to accelerate if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
compute.use_numba : bool
    Use the numba engine option for select operations if it is installed,
    the default is False
    Valid values: False,True
    [default: False] [currently: False]
compute.use_numexpr : bool
    Use the numexpr library to accelerate computation if it is installed,
    the default is True
    Valid values: False,True
    [default: True] [currently: True]
display.chop_threshold : float or None
    if set to a float value, all float values smaller then the given threshold
    will be displayed as exactly 0 by repr and friends.
    [default: None] [currently: None]
display.colheader_justify : 'left'/'right'
    Controls the justification of column headers. used by DataFrameFormatter.
    [default: right] [currently: right]
display.column_space No description available.
    [defa

#### 47. How to format or suppress scientific notations in a pandas dataframe?

In [15]:
rnd_2 = pd.DataFrame(np.random.random(4) ** 10, columns = ['random'])

In [16]:
rnd_2.round(4)

Unnamed: 0,random
0,0.4225
1,0.0004
2,0.0036
3,0.0171


In [17]:
rnd_2.apply(lambda x: '%.4f' % x, axis=1)

0    0.4225
1    0.0004
2    0.0036
3    0.0171
dtype: object

In [18]:
rnd_2.applymap(lambda x: '%.4f' % x)

Unnamed: 0,random
0,0.4225
1,0.0004
2,0.0036
3,0.0171


In [19]:
pd.set_option('display.float_format', lambda x: '%.4f' % x)

In [20]:
pd.options.display.float_format = '{:.4f}'.format
print(rnd_2)

   random
0  0.4225
1  0.0004
2  0.0036
3  0.0171


In [21]:
# Reset formatting
pd.options.display.float_format = None

#### 48. How to format all the values in a dataframe as percentages?

In [22]:
out = rnd_2.style.format({'random': '{0:.2%}'.format,})
out

Unnamed: 0,random
0,42.25%
1,0.04%
2,0.36%
3,1.71%


#### 49. How to filter every nth row in a dataframe?

In [23]:
print(cars.iloc[::20, :][['Manufacturer', 'Model', 'Type']])

   Manufacturer    Model     Type
0         Acura  Integra    Small
20     Chrysler  LeBaron  Compact
40        Honda  Prelude   Sporty
60      Mercury   Cougar  Midsize
80       Subaru   Loyale    Small


#### 50. How to create a primary key index by combining relevant columns?

In [24]:
cars_2 = pd.read_csv(link, usecols=[0, 1, 2, 3, 5])

In [25]:
cars_2[['Manufacturer', 'Model', 'Type']] = cars_2[['Manufacturer', 
                                                    'Model', 
                                                    'Type']].fillna('missing')

cars_2.index = cars_2.Manufacturer + '_' + cars_2.Model + '_' + cars_2.Type

print(cars_2.index.is_unique)

True


#### 51. How to get the row number of the nth largest value in a column?

In [26]:
rnd_3 = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), 
                     columns = list('abc'))

In [27]:
n = 5
rnd_3['a'].argsort()[::-1][n]

9

#### 52. How to find the position of the nth largest value greater than a given value?

In [28]:
ser = pd.Series(np.random.randint(1, 100, 15))

In [29]:
print('ser: ', ser.tolist(),
      'mean: ', round(ser.mean()))

np.argwhere(ser > ser.mean())[1]

ser:  [57, 28, 68, 59, 30, 2, 13, 3, 60, 98, 16, 81, 40, 66, 87] mean:  47


ValueError: Length of values (1) does not match length of index (15)

#### 53. How to get the last n rows of a dataframe with row sum > 100?

In [30]:
rnd_4 = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))

In [31]:
rowsums = rnd_4.apply(np.sum, 
                      axis = 1)

last_two_rows = rnd_4.iloc[np.where(rowsums > 100)[0][-2:], :]

#### 54. How to find and cap outliers from a series or dataframe column?

In [32]:
ser_2 = pd.Series(np.logspace(-2, 2, 30))

In [33]:
def cap_outliers(ser, low_perc, high_perc):
    
    low, high = ser.quantile([low_perc, high_perc])
    
    print(low_perc, '%ile: ', 
          low, '|', high_perc, '%ile: ', 
          high)
    
    ser[ser < low] = low
    ser[ser > high] = high
    return(ser)

capped_ser = cap_outliers(ser, .05, .95)

0.05 %ile:  2.7 | 0.95 %ile:  90.29999999999998


#### 55. How to reshape a dataframe to the largest possible square after removing the negative values?

In [34]:
rnd_5 = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10, -1))
rnd_5

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-10,35,5,3,21,0,-8,-10,4,41
1,13,38,33,43,-17,3,44,8,-7,-12
2,22,38,35,-1,-2,33,31,38,47,-15
3,42,29,49,-6,24,24,-6,21,1,1
4,7,38,-14,26,22,23,-19,41,31,40
5,1,1,-8,-16,43,-20,46,-20,13,46
6,-2,9,15,-14,-8,40,-4,7,0,0
7,37,-10,-15,19,-11,2,-3,8,-7,37
8,33,40,32,-20,43,39,26,-17,-4,33
9,17,0,38,-17,17,1,41,-13,27,17


In [35]:
arr = rnd_5[rnd_5 > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]

n = int(np.floor(arr_qualified.shape[0]**.5))

top_indexes = np.argsort(arr_qualified)[::-1]
output = np.take(arr_qualified,
                 sorted(top_indexes[:n**2])).reshape(n, -1)

print(output)

[[35.  5.  3. 21.  4. 41. 13. 38.]
 [33. 43.  3. 44.  8. 22. 38. 35.]
 [33. 31. 38. 47. 42. 29. 49. 24.]
 [24. 21.  1.  1.  7. 38. 26. 22.]
 [23. 41. 31. 40.  1. 43. 46. 13.]
 [46.  9. 15. 40.  7. 37. 19.  2.]
 [ 8. 37. 33. 40. 32. 43. 39. 26.]
 [33. 17. 38. 17.  1. 41. 27. 17.]]


#### 56. How to swap two rows of a dataframe?

In [36]:
rnd_6 = pd.DataFrame(np.arange(25).reshape(5, -1))

In [37]:
def swap_rows(df, i1, i2):
    a, b = df.iloc[i1, :].copy(), df.iloc[i2, :].copy()
    df.iloc[i1, :], df.iloc[i2, :] = b, a
    return df

print(swap_rows(rnd_6, 1, 2))

    0   1   2   3   4
0   0   1   2   3   4
1  10  11  12  13  14
2   5   6   7   8   9
3  15  16  17  18  19
4  20  21  22  23  24


#### 57. How to reverse the rows of a dataframe?

In [38]:
rnd_6.iloc[::-1, :]

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,5,6,7,8,9
1,10,11,12,13,14
0,0,1,2,3,4


In [39]:
print(rnd_6.loc[rnd_6.index[::-1], :])

    0   1   2   3   4
4  20  21  22  23  24
3  15  16  17  18  19
2   5   6   7   8   9
1  10  11  12  13  14
0   0   1   2   3   4


#### 58. How to create one-hot encodings of a categorical variable (dummy variables)?

In [40]:
rnd_7 = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))

In [42]:
onehot = pd.concat([pd.get_dummies(rnd_7['a']), 
                   rnd_7[list('bcde')]],
                   axis = 1)

print(onehot)

   0  5  10  15  20   b   c   d   e
0  1  0   0   0   0   1   2   3   4
1  0  1   0   0   0   6   7   8   9
2  0  0   1   0   0  11  12  13  14
3  0  0   0   1   0  16  17  18  19
4  0  0   0   0   1  21  22  23  24


#### 59. Which column contains the highest number of row-wise maximum values?

In [43]:
rnd_8 = pd.DataFrame(np.random.randint(1, 100, 40).reshape(10, -1))

In [44]:
print('Column with highest row maxes: ', 
      rnd_8.apply(np.argmax, 
               axis = 1).value_counts().index[0])

Column with highest row maxes:  0


#### 60. How to create a new column that contains the row number of nearest column by euclidean distance?

In [45]:
rnd_9 = pd.DataFrame(np.random.randint(1, 100, 40).reshape(10, -1),
                  columns = list('pqrs'), 
                  index = list('abcdefghij'))

In [46]:
nearest_rows = []
nearest_distance = []

for i, row in rnd_9.iterrows():
    curr = row
    rest = rnd_9.drop(i)
    e_dists = {} 
    
    for j, contestant in rest.iterrows():
        e_dists.update({j: round(np.linalg.norm(curr.values - contestant.values))})

    nearest_rows.append(max(e_dists, key=e_dists.get))
    nearest_distance.append(max(e_dists.values()))

rnd_9['nearest_row'] = nearest_rows
rnd_9['dist'] = nearest_distance

In [47]:
rnd_9

Unnamed: 0,p,q,r,s,nearest_row,dist
a,66,95,71,12,c,129
b,56,11,47,63,g,105
c,90,4,4,69,a,129
d,42,72,70,39,c,110
e,66,92,49,30,c,109
f,29,28,46,68,a,98
g,3,83,6,25,c,125
h,73,13,66,35,g,116
i,84,32,44,74,g,114
j,38,10,87,13,g,115


In [48]:
%reload_ext watermark
%watermark -a "Caique Miranda" -gu "caiquemiranda" -iv

Author: Caique Miranda

Github username: caiquemiranda

numpy : 1.23.0
pandas: 1.4.3

