# Playing with Pandas


Exercises questions are taken from https://www.machinelearningplus.com/python/101-pandas-exercises-python/

Solutions are my own unless specified.

## Basics

In [1]:
##importing pandas and checking version
import pandas as pd
pd.__version__

'0.24.2'

In [15]:
##creating series from a list, numpy array and dictionary objects 
import numpy as np
import string
mylist = list(string.ascii_lowercase)
pd.Series(mylist)

# from numpy array
np.arange(26)
pd.Series(np.arange(26))


## from dictionary

pd.Series(dict(zip(mylist, np.arange(26)))).head(5)



a    0
b    1
c    2
d    3
e    4
dtype: int64

### Converting index of a series to a column of data frame

In [20]:
myarr  = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)
print(ser.index)

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'],
      dtype='object')


In [30]:
## adding it as column

pd.DataFrame({'col1' : ser.index, 'col2' : ser}).head()

Unnamed: 0,col1,col2
a,a,0
b,b,1
c,c,2
d,d,3
e,e,4


In [29]:
## or as per https://www.machinelearningplus.com/python/101-pandas-exercises-python/

ser.to_frame().reset_index().head()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,d,3
4,e,4


### Combining multiple series to a dataframe

In [32]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))


In [35]:
pd.concat([ser1, ser2], axis =1)

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


### Assigning name to series index

In [40]:
ser1.index.name = "my_index"

In [45]:
ser1.head()
##Assigining name to series

ser1.name = 'test_name'


In [47]:
ser1.head()

my_index
0    a
1    b
2    c
3    e
4    d
Name: test_name, dtype: object

AttributeError: 'Series' object has no attribute 'test_name'

### Get the items of series A not present in series B

In [52]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
s

In [60]:
ser1[~(ser1.isin(ser2))]

0    1
1    2
2    3
dtype: int64

### Get the items not common to both series A and series B

In [62]:
pd.concat([ser1[~(ser1.isin(ser2))] ,ser2[~(ser2.isin(ser1))]], axis =0)

0    1
1    2
2    3
2    6
3    7
4    8
dtype: int64

### Get the minimum, 25th percentile, median, 75th, and max of a numeric series?

In [68]:
ser = pd.Series(np.random.normal(10, 5, 25))
ser.describe()


count    25.000000
mean      9.192722
std       5.040666
min      -2.381215
25%       6.351771
50%       9.141433
75%      12.738921
max      20.466051
dtype: float64

In [70]:
## from https://www.machinelearningplus.com/python/101-pandas-exercises-python/
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([-2.38121511,  6.35177087,  9.14143347, 12.73892095, 20.46605133])

### frequency counts of unique items of a series

In [72]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

In [76]:
np.unique( ser,return_counts = True)

(array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype=object),
 array([5, 3, 7, 3, 2, 6, 2, 2]))

In [78]:
#### from https://www.machinelearningplus.com/python/101-pandas-exercises-python/

ser.value_counts()

c    7
f    6
a    5
b    3
d    3
e    2
g    2
h    2
dtype: int64

### Categorizing numeric series

In [108]:
np.random.seed(100)
ser = pd.Series(np.random.random(20))
print(ser)
## from #### from https://www.machinelearningplus.com/python/101-pandas-exercises-python/
pd.qcut(ser, q=20, labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th','11th','12th', '13th', '14th', '15th', '16th', '17th', '18th', '19th', '20th'])
        

0     0.543405
1     0.278369
2     0.424518
3     0.844776
4     0.004719
5     0.121569
6     0.670749
7     0.825853
8     0.136707
9     0.575093
10    0.891322
11    0.209202
12    0.185328
13    0.108377
14    0.219697
15    0.978624
16    0.811683
17    0.171941
18    0.816225
19    0.274074
dtype: float64


0     12th
1     10th
2     11th
3     18th
4      1st
5      3rd
6     14th
7     17th
8      4th
9     13th
10    19th
11     7th
12     6th
13     2nd
14     8th
15    20th
16    15th
17     5th
18    16th
19     9th
dtype: category
Categories (20, object): [1st < 2nd < 3rd < 4th ... 17th < 18th < 19th < 20th]

### convert a numpy array to a dataframe of given shape

In [110]:
ser = pd.Series(np.random.randint(1, 10, 35))

In [112]:
##reshape series to df 7 rows and 5 cols
pd.DataFrame(ser.to_numpy().reshape(7,5))



Unnamed: 0,0,1,2,3,4
0,4,8,2,2,8
1,8,1,3,4,3
2,6,9,2,1,8
3,7,3,1,9,3
4,6,2,9,2,6
5,5,3,9,4,6
6,1,4,7,4,5


In [113]:
## or as per rom https://www.machinelearningplus.com/python/101-pandas-exercises-python/
df = pd.DataFrame(ser.values.reshape(7,5))

### Find the positions of numbers that are multiples of 3 from a series

In [116]:
np.random.seed(13)
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    3
1    1
2    1
3    7
4    3
5    5
6    4
dtype: int64

In [119]:
ser.index[ser%3==0]

Int64Index([0, 4], dtype='int64')

In [121]:
##or 
np.argwhere(ser%3==0)

  return getattr(obj, method)(*args, **kwds)


array([[0],
       [4]])

### Extract items at given position in Series

In [124]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

ser

0     a
1     b
2     c
3     d
4     e
5     f
6     g
7     h
8     i
9     j
10    k
11    l
12    m
13    n
14    o
15    p
16    q
17    r
18    s
19    t
20    u
21    v
22    w
23    x
24    y
25    z
dtype: object

In [126]:
ser.iloc[pos]

0     a
4     e
8     i
14    o
20    u
dtype: object

In [127]:
## or use take function
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

### Stack  two series vertically and horizontally

In [128]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [129]:
pd.concat([ser1,ser2],axis =0)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [131]:
##or use append function

ser1.append(ser2)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [130]:
pd.concat([ser1,ser2],axis =1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


### Get the positions of items of series A in another series B

In [132]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [140]:
ser1[ser1.isin(ser2)].index.to_list()

[0, 4, 5, 8]

###  Compute mean squared error on a actual and predicted series?

In [144]:
actual = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
print(actual.head(),pred.head())

0    0
1    1
2    2
3    3
4    4
dtype: int64 0    0.689778
1    1.288766
2    2.052837
3    3.787670
4    4.525844
dtype: float64


In [148]:
((actual - pred)**2).mean()

0.2612415103189283

### convert the first character of each element in a series to uppercase?

In [169]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

In [171]:
#from https://www.machinelearningplus.com/python/101-pandas-exercises-python/
ser.map(lambda x : x[0].upper() +x[1:])

0     How
1      To
2    Kick
3    Ass?
dtype: object

### Ccalculate the number of characters in each word in a series

In [172]:
ser

0     how
1      to
2    kick
3    ass?
dtype: object

In [173]:
ser.map(lambda x : len(x))

0    3
1    2
2    4
3    4
dtype: int64

### Compute difference of differences between consequtive numbers of a series

In [175]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
ser.diff()

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    6.0
7    8.0
dtype: float64

In [176]:
ser.diff().diff()

0    NaN
1    NaN
2    1.0
3    1.0
4    1.0
5    1.0
6    0.0
7    2.0
dtype: float64

### Convert a series of date-strings to a timeseries

In [185]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])


In [186]:
ser.astype('datetime64')


0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [187]:
## get the day of month, week number, day of year and day of week from a series of date string
### https://www.machinelearningplus.com/python/101-pandas-exercises-python/
from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))
print(ser_ts)
# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number
print("Week number: ", ser_ts.dt.weekofyear.tolist())

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week
print("Day of week: ", ser_ts.dt.weekday_name.tolist())


0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]
Date:  [1, 2, 3, 4, 5, 6]
Week number:  [53, 5, 9, 14, 19, 23]
Day number of year:  [1, 33, 63, 94, 125, 157]
Day of week:  ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday']


#### convert year-month string to dates corresponding to the nth(4th) day of the month

In [190]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
ser.astype('datetime64')

0   2010-01-01
1   2011-02-01
2   2012-03-01
dtype: datetime64[ns]

In [205]:
## but we want to change it to 4th date
ser.map(lambda x: parse('04' + x))
## referred from ### https://www.machinelearningplus.com/python/101-pandas-exercises-python/


0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

### filter words that contain atleast 2 vowels from a series

In [206]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

In [209]:
test_list = []
for word in ser:
    count =0
    for letter in 'aeiou':
        if letter in word.lower():
            count = word.lower().count(letter)+count
    if count >=2:
        test_list.append(word)
pd.Series(test_list)

0     Apple
1    Orange
2     Money
dtype: object

### filter valid emails from a series?

In [210]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])

In [217]:

## from referred from ### https://www.machinelearningplus.com/python/101-pandas-exercises-python/

import re
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
mask = emails.map(lambda x: bool(re.match(pattern, x)))
print(mask)
emails[mask]

0    False
1     True
2     True
3     True
dtype: bool


1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

### mean of a series grouped together by other series


In [220]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(fruit, weights)

0     apple
1    banana
2    banana
3    banana
4    banana
5    banana
6    banana
7     apple
8    carrot
9     apple
dtype: object 0     1.0
1     2.0
2     3.0
3     4.0
4     5.0
5     6.0
6     7.0
7     8.0
8     9.0
9    10.0
dtype: float64


In [239]:
combined_series = pd.concat([fruit,weights], axis =1)
combined_series

Unnamed: 0,0,1
0,apple,1.0
1,banana,2.0
2,banana,3.0
3,banana,4.0
4,banana,5.0
5,banana,6.0
6,banana,7.0
7,apple,8.0
8,carrot,9.0
9,apple,10.0


In [244]:
combined_series.groupby(combined_series[0]).mean()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
apple,6.333333
banana,4.5
carrot,9.0


In [234]:
ser1 = pd.Series([1,2,34,5])
ser2 = pd.Series([1,2,34,5])

In [238]:
print(np.vstack((fruit,weights)))

[['apple' 'banana' 'banana' 'banana' 'banana' 'banana' 'banana' 'apple'
  'carrot' 'apple']
 [1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0]]


### compute the euclidean distance between two series

In [245]:
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

In [246]:
np.linalg.norm(p-q)

18.16590212458495

### create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values


In [259]:
## referred from machine learning plus
ser = pd.Series(np.random.randint(1,10,10), pd.date_range('2000-01-01', periods=10, freq='W-SAT'))
ser

2000-01-01    3
2000-01-08    1
2000-01-15    3
2000-01-22    3
2000-01-29    8
2000-02-05    9
2000-02-12    7
2000-02-19    2
2000-02-26    8
2000-03-04    9
Freq: W-SAT, dtype: int64

###  fill an intermittent time series so all missing dates show up with values of previous non-missing date

In [260]:
ser = pd.Series([1,10,3,np.nan], index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']))
print(ser)

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64


In [261]:
# Solution from machine learning plus website
ser.resample('D').ffill()  # fill with previous value



2000-01-01     1.0
2000-01-02     1.0
2000-01-03    10.0
2000-01-04    10.0
2000-01-05    10.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     NaN
Freq: D, dtype: float64

In [262]:
# Alternatives
ser.resample('D').bfill()  # fill with next value
ser.resample('D').bfill().ffill()  # fill next else prev value

2000-01-01     1.0
2000-01-02    10.0
2000-01-03    10.0
2000-01-04     3.0
2000-01-05     3.0
2000-01-06     3.0
2000-01-07     3.0
2000-01-08     3.0
Freq: D, dtype: float64

### Compute the autocorrelations of a numeric series

In [271]:
np.random.seed(15)
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))
ser

0     -2.123285
1      5.392847
2      1.440915
3     -1.017897
4      7.355689
5    -11.636053
6     -3.958620
7     -2.877657
8      5.948299
9      5.262516
10     8.994055
11    15.551968
12    19.895177
13    18.105897
14     9.350216
15    21.993907
16    15.370637
17    34.002145
18    25.816272
19    20.148801
dtype: float64

In [275]:
autocorrelations = [ser.autocorr(i).round(2) for i in range(11)]
autocorrelations
print('Lag having highest correlation: ', np.argmax(np.abs(autocorrelations[1:]))+1)

Lag having highest correlation:  2


### Change column values when importing csv to a dataframe

In [24]:
## importin boston housing data set
df = pd.read_csv("https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv")
print(df.columns)
df.head()

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [25]:
## now change the 'medv' (median house value) column so that values < 25 becomes ‘Low’ and > 25 becomes ‘High’.
print(df.dtypes)
df['medv'] =df['medv'].map(lambda x  : 'Low' if x < 25 else 'High')
df.head()

crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv       float64
dtype: object


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,Low
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,Low
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,High
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,High
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,High


In [27]:
df.dtypes

crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv        object
dtype: object

In [28]:
## or solution from machinelearningplus website
# Solution 1: Using converter parameter
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv', 
                 converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'})


In [29]:
## we can also import specifed columnts from a csv file using usecols argumnet


##pd.read_csv("link", usecols=[list of column names] )

### extract the row and column number of a particular cell with given criterion

In [34]:
#hich manufacturer, model and type has the highest Price? What is the row and column number of the cell 
#with the highest Price value?W

In [50]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()
np.where(df['Price']==df.Price.max())

(array([0]),)

In [61]:

df.iloc[np.where(df['Price']==df.Price.max())][['Manufacturer', 'Model','Type']]

Unnamed: 0,Manufacturer,Model,Type
58,Mercedes-Benz,300E,Midsize


In [65]:
##other way
df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]

Unnamed: 0,Manufacturer,Model,Type
58,Mercedes-Benz,300E,Midsize


### Replacing column names 

In [80]:
df.columns.values[0] ='Manuf'

In [81]:
df.columns


Index(['Manuf', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price', 'MPG.city',
       'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders', 'EngineSize',
       'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')

## Working with null values

In [98]:
## checking if df has any null values
any(df.isna())

Unnamed: 0,Manuf,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False,False,True,...,False,False,False,True,False,False,False,False,False,False
4,False,False,False,True,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,True,False,False
6,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [113]:
## count of null values for each column
df.apply(lambda x : (x.isna()).sum())

Manuf                  4
Model                  1
Type                   3
Min.Price              7
Price                  2
Max.Price              5
MPG.city               9
MPG.highway            2
AirBags                6
DriveTrain             7
Cylinders              5
EngineSize             2
Horsepower             7
RPM                    3
Rev.per.mile           6
Man.trans.avail        5
Fuel.tank.capacity     8
Passengers             2
Length                 4
Wheelbase              1
Width                  6
Turn.circle            5
Rear.seat.room         4
Luggage.room          19
Weight                 7
Origin                 5
Make                   3
dtype: int64

In [114]:
##replacing missing values of multiple columns with mean
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

In [115]:
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i


In [134]:
print(df[['Min.Price', 'Max.Price']].fillna(df.mean()))

## or from machinelearning plus
df[['Min.Price', 'Max.Price']].apply(lambda x: x.fillna(x.mean()))


    Min.Price  Max.Price
0   12.900000  18.800000
1   29.200000  38.700000
2   25.900000  32.300000
3   17.118605  44.600000
4   17.118605  21.459091
5   14.200000  17.300000
6   19.900000  21.459091
7   22.600000  24.900000
8   26.300000  26.300000
9   33.000000  36.300000
10  37.500000  42.700000
11   8.500000  18.300000
12  11.400000  11.400000
13  13.400000  16.800000
14  13.400000  18.400000
15  14.700000  18.000000
16  14.700000  18.600000
17  18.000000  19.600000
18  34.600000  41.500000
19  18.400000  18.400000
20  14.500000  17.100000
21  29.500000  29.500000
22   7.900000  10.600000
23   8.400000  14.200000
24  11.900000  14.700000
25  17.118605  24.400000
26  14.800000  16.400000
27  18.500000  33.100000
28   7.900000  16.500000
29  17.118605  21.200000
..        ...        ...
63   8.700000  14.900000
64  13.000000  18.300000
65  16.700000  21.500000
66  21.000000  22.000000
67  13.000000  14.000000
68  14.200000  18.400000
69  19.500000  19.500000
70  19.500000  21.900000


Unnamed: 0,Min.Price,Max.Price
0,12.900000,18.800000
1,29.200000,38.700000
2,25.900000,32.300000
3,17.118605,44.600000
4,17.118605,21.459091
5,14.200000,17.300000
6,19.900000,21.459091
7,22.600000,24.900000
8,26.300000,26.300000
9,33.000000,36.300000


In [138]:
## replacing missing values in one column with mean and other column by median
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
d = {'Min.Price': np.nanmean, 'Max.Price': np.nanmedian}
df[['Min.Price', 'Max.Price']].apply(lambda x : x.fillna(x.mean()) if x.name == 'Min.Price' else x.fillna(x.median()))


Unnamed: 0,Min.Price,Max.Price
0,12.900000,18.80
1,29.200000,38.70
2,25.900000,32.30
3,17.118605,44.60
4,17.118605,19.15
5,14.200000,17.30
6,19.900000,19.15
7,22.600000,24.90
8,26.300000,26.30
9,33.000000,36.30


### change the order of columns of a dataframe


In [139]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [145]:
## interchanging a and c
df[ ['c','b', 'a','d','e']]


0     0
1     5
2    10
3    15
Name: a, dtype: int64

In [149]:
## sorting columsn in alphabetical order
df[sorted(df.columns, reverse=True)]

Unnamed: 0,e,d,c,b,a
0,4,3,2,1,0
1,9,8,7,6,5
2,14,13,12,11,10
3,19,18,17,16,15


In [167]:
## changing two column positions without hard coding column names
def switch_columns(df, col1=None, col2=None):
    colnames = list(df.columns)
    a, b = colnames.index(col1), colnames.index(col2)
    colnames[b], colnames[a] = colnames[a], colnames[b]
    return df[colnames]

In [169]:
switch_columns(df,'a','c')

Unnamed: 0,c,b,a,d,e
0,2,1,0,3,4
1,7,6,5,8,9
2,12,11,10,13,14
3,17,16,15,18,19


In [166]:
swapped_order('a','c')

(0, 2)

### Formatting in pandas

Set the number of rows and columns displayed in the output

In [170]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

Format or supress scientic notations in dataframe, 

In [3]:
df = pd.DataFrame(np.random.random(4)**10, columns=['random'])


Unnamed: 0,random
0,0.213133
1,0.000354
2,0.022061
3,0.632873


In [4]:
df.round(2)

Unnamed: 0,random
0,0.21
1,0.0
2,0.02
3,0.63


In [5]:
## or 
pd.set_option('display.float_format', lambda x: '%.4f' % x)


format all the values in a dataframe as percentages

In [15]:
df = pd.DataFrame(np.random.random(4), columns=['random'])
df

Unnamed: 0,random
0,0.2906
1,0.626
2,0.5368
3,0.0725


In [10]:
## from machinelearningplus website
out = df.style.format({
    'random': '{0:.2%}'.format,
})
out

Unnamed: 0,random
0,56.00%
1,50.97%
2,4.19%
3,67.69%


In [18]:
out = df.style.format({
    'random' : '{0:.5%}'.format,
})
out

Unnamed: 0,random
0,29.06233%
1,62.59717%
2,53.68351%
3,7.24812%


### Filter every nth row of a dataframe

In [20]:
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
1,,Legend,Midsize,29.2,33.9,38.7,18.0,25.0,Driver & Passenger,Front,...,5.0,195.0,115.0,71.0,38.0,30.0,15.0,3560.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20.0,26.0,Driver only,Front,...,5.0,180.0,102.0,67.0,37.0,28.0,14.0,3375.0,non-USA,Audi 90
3,Audi,100,Midsize,,37.7,44.6,19.0,26.0,Driver & Passenger,,...,6.0,193.0,106.0,,37.0,31.0,17.0,3405.0,non-USA,Audi 100
4,BMW,535i,Midsize,,30.0,,22.0,30.0,,Rear,...,4.0,186.0,109.0,69.0,39.0,27.0,13.0,3640.0,non-USA,BMW 535i


In [36]:
 df.loc[pd.RangeIndex(start=0, stop =93, step=10),]

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
10,Cadillac,Seville,Midsize,37.5,40.1,42.7,16.0,25.0,Driver & Passenger,Front,...,5.0,204.0,111.0,74.0,44.0,31.0,,3935.0,USA,Cadillac Seville
20,Chrysler,LeBaron,Compact,14.5,15.8,17.1,23.0,28.0,Driver & Passenger,Front,...,6.0,183.0,104.0,68.0,41.0,30.5,14.0,3085.0,USA,Chrysler LeBaron
30,Ford,Festiva,Small,6.9,7.4,7.9,31.0,33.0,,Front,...,4.0,141.0,90.0,63.0,33.0,26.0,12.0,1845.0,USA,Ford Festiva
40,Honda,Prelude,Sporty,17.0,19.8,22.7,24.0,31.0,Driver & Passenger,Front,...,4.0,175.0,100.0,70.0,39.0,23.5,8.0,2865.0,non-USA,Honda Prelude
50,Lincoln,Continental,Midsize,33.3,34.3,35.3,17.0,26.0,Driver & Passenger,,...,6.0,205.0,109.0,73.0,42.0,30.0,19.0,3695.0,USA,Lincoln Continental
60,Mercury,Cougar,Midsize,14.9,14.9,14.9,19.0,26.0,,Rear,...,5.0,199.0,113.0,73.0,38.0,28.0,15.0,3610.0,USA,Mercury Cougar
70,Oldsmobile,Eighty-Eight,Large,19.5,20.7,21.9,,28.0,Driver only,Front,...,6.0,201.0,111.0,74.0,42.0,31.5,17.0,3470.0,USA,Oldsmobile Eighty-Eight
80,Subaru,Loyale,Small,10.5,10.9,11.3,25.0,30.0,,4WD,...,5.0,175.0,97.0,65.0,35.0,27.5,15.0,2490.0,non-USA,Subaru Loyale
90,Volkswagen,Corrado,Sporty,22.9,23.3,23.7,18.0,25.0,,Front,...,4.0,159.0,97.0,66.0,36.0,26.0,15.0,2810.0,non-USA,Volkswagen Corrado


In [38]:
##from machinelearning plus other way
df.iloc[::10, :]

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25.0,31.0,,Front,...,5.0,177.0,102.0,68.0,37.0,26.5,,2705.0,non-USA,Acura Integra
10,Cadillac,Seville,Midsize,37.5,40.1,42.7,16.0,25.0,Driver & Passenger,Front,...,5.0,204.0,111.0,74.0,44.0,31.0,,3935.0,USA,Cadillac Seville
20,Chrysler,LeBaron,Compact,14.5,15.8,17.1,23.0,28.0,Driver & Passenger,Front,...,6.0,183.0,104.0,68.0,41.0,30.5,14.0,3085.0,USA,Chrysler LeBaron
30,Ford,Festiva,Small,6.9,7.4,7.9,31.0,33.0,,Front,...,4.0,141.0,90.0,63.0,33.0,26.0,12.0,1845.0,USA,Ford Festiva
40,Honda,Prelude,Sporty,17.0,19.8,22.7,24.0,31.0,Driver & Passenger,Front,...,4.0,175.0,100.0,70.0,39.0,23.5,8.0,2865.0,non-USA,Honda Prelude
50,Lincoln,Continental,Midsize,33.3,34.3,35.3,17.0,26.0,Driver & Passenger,,...,6.0,205.0,109.0,73.0,42.0,30.0,19.0,3695.0,USA,Lincoln Continental
60,Mercury,Cougar,Midsize,14.9,14.9,14.9,19.0,26.0,,Rear,...,5.0,199.0,113.0,73.0,38.0,28.0,15.0,3610.0,USA,Mercury Cougar
70,Oldsmobile,Eighty-Eight,Large,19.5,20.7,21.9,,28.0,Driver only,Front,...,6.0,201.0,111.0,74.0,42.0,31.5,17.0,3470.0,USA,Oldsmobile Eighty-Eight
80,Subaru,Loyale,Small,10.5,10.9,11.3,25.0,30.0,,4WD,...,5.0,175.0,97.0,65.0,35.0,27.5,15.0,2490.0,non-USA,Subaru Loyale
90,Volkswagen,Corrado,Sporty,22.9,23.3,23.7,18.0,25.0,,Front,...,4.0,159.0,97.0,66.0,36.0,26.0,15.0,2810.0,non-USA,Volkswagen Corrado


### Create create a primary key index by combining relevant columns

In [39]:
#Replace NaNs with ‘missing’ in columns 'Manufacturer', 'Model' and 'Type' and create a index as a combination of 
#these three columns and check if the index is a primary key.

df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv', usecols=[0,1,2,3,5])
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
0,Acura,Integra,Small,12.9,18.8
1,,Legend,Midsize,29.2,38.7
2,Audi,90,Compact,25.9,32.3
3,Audi,100,Midsize,,44.6
4,BMW,535i,Midsize,,


In [56]:
any(df[['Manufacturer','Model','Type']].isna())
## replacing nas with missing
df[['Manufacturer','Model','Type']]=df[['Manufacturer','Model','Type']].fillna('missing')
np.unique(df[['Manufacturer','Model','Type']].isna())

array([False])

In [65]:
df.index= df['Manufacturer']+'_'+df['Model'] + '_'+df['Type']
df.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7
Audi_90_Compact,Audi,90,Compact,25.9,32.3
Audi_100_Midsize,Audi,100,Midsize,,44.6
BMW_535i_Midsize,BMW,535i,Midsize,,


### Get the row number of the nth largest value in a column

In [67]:
##Find the row position of the 5th largest value of column 'a' in df
df = pd.DataFrame(np.random.randint(1, 30, 30).reshape(10,-1), columns=list('abc'))
df.head()

Unnamed: 0,a,b,c
0,26,4,3
1,9,15,12
2,19,29,15
3,8,11,25
4,24,27,17


In [80]:
print(df.loc[np.argsort(df['a'])])
df.loc[np.argsort(df['a'])].index[-5]

    a   b   c
3   8  11  25
1   9  15  12
5  10  10   3
8  17  12  10
2  19  29  15
6  21   4  11
4  24  27  17
0  26   4   3
9  28  24  13
7  29  17  14


6

### find the position of the nth largest value greater than a given value

In [82]:
##In ser, find the position of the 2nd largest value greater than the mean.



ser = pd.Series(np.random.randint(1, 100, 15))
ser

0     37
1     27
2     12
3     63
4     76
5     66
6     40
7     51
8     69
9     50
10    82
11    74
12    12
13    62
14    21
dtype: int64

In [106]:
ser_greater_than_mean =ser[ser>ser.mean()]
print(ser.mean())
print(np.argwhere(ser > ser.mean()))
np.where(ser>ser.mean())


49.46666666666667
[[ 3]
 [ 4]
 [ 5]
 [ 7]
 [ 8]
 [ 9]
 [10]
 [11]
 [13]]


(array([ 3,  4,  5,  7,  8,  9, 10, 11, 13]),)

### get the last n rows of a dataframe with row sum > 100

In [108]:
### Get the last two rows of df whose row sum is greater than 100.

df = pd.DataFrame(np.random.randint(10, 40, 60).reshape(-1, 4))
df

Unnamed: 0,0,1,2,3
0,26,33,28,27
1,39,33,30,29
2,31,22,10,37
3,17,30,19,27
4,25,17,22,37
5,26,18,13,39
6,38,30,18,20
7,27,13,24,29
8,12,39,19,17
9,23,22,17,14


In [128]:
print(np.sum(df,axis=1))

df.loc[np.where(np.sum(df , axis =1) >100)[0][[-1,-2]], ]

0     114
1     131
2     100
3      93
4     101
5      96
6     106
7      93
8      87
9      76
10     85
11    117
12    129
13     85
14     82
dtype: int64


Unnamed: 0,0,1,2,3
12,33,29,36,31
11,38,28,14,37


### Swap two rows of a dataframe

In [2]:
df = pd.DataFrame(np.arange(25).reshape(5, -1))
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [3]:
a = df.iloc[1,:].copy()
b= df.iloc[2,:].copy()
df.iloc[1,:] = b
df.iloc[2,:] = a

In [4]:
df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,10,11,12,13,14
2,5,6,7,8,9
3,15,16,17,18,19
4,20,21,22,23,24


### reverse the rows of a dataframe

In [7]:

df

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [10]:
df.iloc[::-1,]

Unnamed: 0,0,1,2,3,4
4,20,21,22,23,24
3,15,16,17,18,19
2,10,11,12,13,14
1,5,6,7,8,9
0,0,1,2,3,4


### Create reate one-hot encodings of a categorical variable 

In [12]:
df = pd.DataFrame(np.arange(25).reshape(5,-1), columns=list('abcde'))
df


Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [16]:
##Get one-hot encodings for column 'a' in the dataframe df and append it as columns.
pd.concat( [ pd.get_dummies(df['a']), df[['b','c','d','e']]], axis =1)

Unnamed: 0,0,5,10,15,20,b,c,d,e
0,1,0,0,0,0,1,2,3,4
1,0,1,0,0,0,6,7,8,9
2,0,0,1,0,0,11,12,13,14
3,0,0,0,1,0,16,17,18,19
4,0,0,0,0,1,21,22,23,24


### column contains the highest number of row-wise maximum values

In [33]:
df = pd.DataFrame(np.random.randint(1,100, 40).reshape(10, -1))
df

Unnamed: 0,0,1,2,3
0,10,57,75,79
1,94,27,23,26
2,12,45,59,62
3,38,89,38,47
4,38,35,14,31
5,53,18,71,93
6,66,26,27,72
7,2,64,61,30
8,12,11,6,49
9,76,82,46,58


In [34]:
print(np.argmax(np.array(df),axis =1))
counts = np.unique(np.argmax(np.array(df), axis =1), return_counts = True)
counts

[3 0 3 1 0 3 3 1 3 1]


(array([0, 1, 3]), array([2, 3, 5]))

In [35]:
counts[0][np.argmax((counts[1]))]

3

###  maximum possible correlation value of each column against other columns

In [36]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1), columns=list('pqrstuvwxy'), index=list('abcdefgh'))

In [37]:
df


Unnamed: 0,p,q,r,s,t,u,v,w,x,y
a,62,84,16,51,76,20,36,52,49,94
b,93,60,78,98,91,55,44,59,72,65
c,69,1,94,63,65,9,86,94,32,39
d,80,45,11,18,68,75,81,39,24,59
e,60,2,46,24,73,16,98,53,64,27
f,98,25,27,42,97,55,54,53,45,43
g,94,18,11,15,26,82,48,33,76,91
h,65,66,41,53,14,39,52,63,9,84


In [41]:
abs(df.corr())


Unnamed: 0,p,q,r,s,t,u,v,w,x,y
p,1.0,0.105362,0.126778,0.072429,0.234203,0.773795,0.375987,0.364846,0.400128,0.059292
q,0.105362,1.0,0.282717,0.371044,0.040586,0.103829,0.723025,0.212266,0.232823,0.716294
r,0.126778,0.282717,1.0,0.73338,0.211573,0.536443,0.28272,0.861255,0.002021,0.469349
s,0.072429,0.371044,0.73338,1.0,0.330493,0.312774,0.362814,0.59319,0.036101,0.023175
t,0.234203,0.040586,0.211573,0.330493,1.0,0.17538,0.085008,0.106547,0.302747,0.523178
u,0.773795,0.103829,0.536443,0.312774,0.17538,1.0,0.322788,0.735275,0.182484,0.358802
v,0.375987,0.723025,0.28272,0.362814,0.085008,0.322788,1.0,0.277817,0.184362,0.805735
w,0.364846,0.212266,0.861255,0.59319,0.106547,0.735275,0.277817,1.0,0.357549,0.39785
x,0.400128,0.232823,0.002021,0.036101,0.302747,0.182484,0.184362,0.357549,1.0,0.001128
y,0.059292,0.716294,0.469349,0.023175,0.523178,0.358802,0.805735,0.39785,0.001128,1.0


In [42]:
np.abs(df.corr()).apply(lambda x: sorted(x)[-2])

p    0.773795
q    0.723025
r    0.861255
s    0.733380
t    0.523178
u    0.773795
v    0.805735
w    0.861255
x    0.400128
y    0.805735
dtype: float64

### create a column containing the minimum by maximum of each row

In [44]:
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,21,16,37,38,41,40,51,70,96,72
1,20,91,61,32,57,19,46,59,16,79
2,92,65,7,99,11,41,79,15,95,27
3,55,92,38,48,24,58,1,7,49,89
4,64,25,28,45,7,49,20,16,96,23
5,38,45,32,1,93,17,82,37,65,7
6,73,97,96,87,38,26,80,96,55,64
7,61,24,27,97,86,77,27,97,77,33


In [46]:
df['ratio'] =df.max( axis=1)/df.min( axis=1)

In [47]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,ratio
0,21,16,37,38,41,40,51,70,96,72,6.0
1,20,91,61,32,57,19,46,59,16,79,5.6875
2,92,65,7,99,11,41,79,15,95,27,14.142857
3,55,92,38,48,24,58,1,7,49,89,92.0
4,64,25,28,45,7,49,20,16,96,23,13.714286
5,38,45,32,1,93,17,82,37,65,7,93.0
6,73,97,96,87,38,26,80,96,55,64,3.730769
7,61,24,27,97,86,77,27,97,77,33,4.041667


### create a column that contains the penultimate value in each row

In [49]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,ratio
0,21,16,37,38,41,40,51,70,96,72,6.0
1,20,91,61,32,57,19,46,59,16,79,5.6875
2,92,65,7,99,11,41,79,15,95,27,14.142857
3,55,92,38,48,24,58,1,7,49,89,92.0
4,64,25,28,45,7,49,20,16,96,23,13.714286
5,38,45,32,1,93,17,82,37,65,7,93.0
6,73,97,96,87,38,26,80,96,55,64,3.730769
7,61,24,27,97,86,77,27,97,77,33,4.041667


In [56]:
## Create a new column 'penultimate' which has the second largest value of each row of df

my_list =[]

for i in range(df.shape[0]):
    my_list.append(np.sort(np.array(df) , axis =1)[i][-2])
    
my_list

df['penultimate'] = my_list

In [57]:
df

## or from machinelearning plus website
#out = df.apply(lambda x: x.sort_values().unique()[-2], axis=1)
#df['penultimate'] = out
#print(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,ratio,penultimate
0,21,16,37,38,41,40,51,70,96,72,6.0,72.0
1,20,91,61,32,57,19,46,59,16,79,5.6875,79.0
2,92,65,7,99,11,41,79,15,95,27,14.142857,95.0
3,55,92,38,48,24,58,1,7,49,89,92.0,92.0
4,64,25,28,45,7,49,20,16,96,23,13.714286,64.0
5,38,45,32,1,93,17,82,37,65,7,93.0,93.0
6,73,97,96,87,38,26,80,96,55,64,3.730769,96.0
7,61,24,27,97,86,77,27,97,77,33,4.041667,97.0


### Normalize all columns in data frame

In [58]:
## Normalize all columns of df by subtracting the column mean and divide by standard deviation
df = pd.DataFrame(np.random.randint(1,100, 80).reshape(8, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,44,4,60,51,34,61,12,1,11,38
1,79,44,78,66,23,21,59,56,2,50
2,29,79,58,61,22,64,67,84,98,53
3,25,92,5,57,19,16,68,90,26,67
4,41,84,64,27,49,8,69,5,38,88
5,34,19,32,48,62,61,23,50,79,34
6,53,16,31,2,82,76,72,15,48,64
7,77,45,52,54,37,88,20,48,84,39


In [62]:
df.apply(lambda x: (x-x.mean())/x.std() ,axis =0)



Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.181825,-1.296312,0.535933,0.248059,-0.31669,0.387013,-1.433676,-1.251369,-1.049073,-0.885189
1,1.515211,-0.114489,1.307677,0.956799,-0.814345,-0.944644,0.399869,0.363301,-1.30254,-0.226444
2,-0.909127,0.919606,0.450184,0.720552,-0.859586,0.486887,0.711962,1.185314,1.401111,-0.061757
3,-1.103074,1.303698,-1.822172,0.531555,-0.995311,-1.111101,0.750973,1.36146,-0.626627,0.706779
4,-0.327286,1.067334,0.707432,-0.885925,0.361931,-1.377432,0.789985,-1.133938,-0.288671,1.859582
5,-0.666693,-0.853128,-0.664557,0.106311,0.950069,0.387013,-1.004549,0.187155,0.866013,-1.10477
6,0.254555,-0.941765,-0.707432,-2.067158,1.854897,0.886384,0.90702,-0.840362,-0.007041,0.542092
7,1.418238,-0.084943,0.192936,0.389807,-0.180966,1.285881,-1.121583,0.12844,1.006828,-0.830293


### replacing diagnols of a dataframe by 0s

In [66]:
df = pd.DataFrame(np.random.randint(1,100, 100).reshape(10, -1))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,56,69,31,18,69,51,45,96,94,54
1,79,26,6,20,15,92,44,26,58,84
2,4,52,80,13,95,10,93,26,24,56
3,72,57,34,27,40,88,6,69,12,97
4,46,32,58,13,17,32,31,57,25,69
5,82,36,81,46,48,21,94,43,86,77
6,14,37,33,3,35,33,12,97,96,76
7,19,2,71,48,28,76,51,2,77,15
8,1,2,29,37,74,14,7,3,26,12
9,65,72,29,13,62,73,98,8,54,33


In [80]:
for i in range(df.shape[1]):
    df.iloc[i,i]= 0
    df.iloc[i,df.shape[1]-i-1] =0

df

   

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,69,31,18,69,51,45,96,94,0
1,79,0,6,20,15,92,44,26,0,84
2,4,52,0,13,95,10,93,0,24,56
3,72,57,34,0,40,88,0,69,12,97
4,46,32,58,13,0,0,31,57,25,69
5,82,36,81,46,0,0,94,43,86,77
6,14,37,33,0,35,33,0,97,96,76
7,19,2,0,48,28,76,51,0,77,15
8,1,0,29,37,74,14,7,3,0,12
9,0,72,29,13,62,73,98,8,54,0


###  get the particular group of a groupby dataframe by key

In [82]:
df = pd.DataFrame({'col1': ['apple', 'banana', 'orange'] * 3,
                   'col2': np.random.rand(9),
                   'col3': np.random.randint(0, 15, 9)})
df

Unnamed: 0,col1,col2,col3
0,apple,0.533539,12
1,banana,0.25387,7
2,orange,0.645774,10
3,apple,0.764036,14
4,banana,0.209721,14
5,orange,0.965496,9
6,apple,0.03905,10
7,banana,0.188692,13
8,orange,0.079088,0


In [84]:
grouped = df.groupby('col1')
grouped.get_group('banana')


Unnamed: 0,col1,col2,col3
1,banana,0.25387,7
4,banana,0.209721,14
7,banana,0.188692,13


In [89]:
grouped['col2'].count()

col1
apple     3
banana    3
orange    3
Name: col2, dtype: int64

### get the n’th largest value of a column when grouped by another column

In [90]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})

df

Unnamed: 0,fruit,rating,price
0,apple,0.789798,10
1,banana,0.280112,13
2,orange,0.387373,13
3,apple,0.114644,2
4,banana,0.449384,12
5,orange,0.675102,14
6,apple,0.583387,6
7,banana,0.468891,6
8,orange,0.745345,6


In [99]:

##find the second largest value of 'taste' for 'banana'

df_grouped  = df.groupby('fruit')

sorted(df_grouped['rating'].get_group('banana'))[-2]



0.4493838864182501

In [100]:
## other way from machinelearningplus
df_grpd = df['rating'].groupby(df.fruit)
df_grpd.get_group('banana').sort_values().iloc[-2]

0.4493838864182501

### Compute grouped mean on pandas dataframe and keep the grouped column as another column (not index)

In [101]:
df = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                   'rating': np.random.rand(9),
                   'price': np.random.randint(0, 15, 9)})
df

Unnamed: 0,fruit,rating,price
0,apple,0.606268,4
1,banana,0.846716,8
2,orange,0.5631,1
3,apple,0.466973,5
4,banana,0.289952,3
5,orange,0.416465,9
6,apple,0.091591,5
7,banana,0.638587,1
8,orange,0.10881,9


In [106]:
##Compute the mean price of every fruit, while keeping the fruit as another column instead of an index.

df[['fruit','price']].groupby('fruit',as_index=False).mean()

Unnamed: 0,fruit,price
0,apple,4.666667
1,banana,4.0
2,orange,6.333333


### join two dataframes by 2 columns so they have only the common rows

In [107]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})


df1,df2

(    fruit  weight  price
 0   apple    high      8
 1  banana  medium      6
 2  orange     low      0
 3   apple    high     14
 4  banana  medium     13
 5  orange     low     10
 6   apple    high      7
 7  banana  medium      9
 8  orange     low      6,    pazham  kilo  price
 0   apple  high      4
 1  orange   low      3
 2    pine  high     10
 3   apple   low      7
 4  orange  high     12
 5    pine   low      5)

In [113]:
df1.join(df2, how ='inner', on = ['fruit','pazham'])

ValueError: len(left_on) must equal the number of levels in the index of "right"

In [116]:
## from machinelearning plus
pd.merge(df1, df2, how='inner', left_on=['fruit', 'weight'], right_on=['pazham', 'kilo'], suffixes=['_left', '_right'])

Unnamed: 0,fruit,weight,price_left,pazham,kilo,price_right
0,apple,high,8,apple,high,4
1,apple,high,14,apple,high,4
2,apple,high,7,apple,high,4
3,orange,low,0,orange,low,3
4,orange,low,10,orange,low,3
5,orange,low,6,orange,low,3


### remove rows from a dataframe that are present in another dataframe

In [125]:
df1 = pd.DataFrame({'fruit': ['apple', 'banana', 'orange'] * 3,
                    'weight': ['high', 'medium', 'low'] * 3,
                    'price': np.random.randint(0, 15, 9)})

df2 = pd.DataFrame({'pazham': ['apple', 'orange', 'pine'] * 2,
                    'kilo': ['high', 'low'] * 3,
                    'price': np.random.randint(0, 15, 6)})

In [128]:
df1[~df1.isin(df2).all(1)]

Unnamed: 0,fruit,weight,price
0,apple,high,7
1,banana,medium,0
2,orange,low,1
3,apple,high,9
4,banana,medium,5
5,orange,low,10
6,apple,high,13
7,banana,medium,10
8,orange,low,3


In [127]:
print(df1)
df2

    fruit  weight  price
0   apple    high      7
1  banana  medium      0
2  orange     low      1
3   apple    high      9
4  banana  medium      5
5  orange     low     10
6   apple    high     13
7  banana  medium     10
8  orange     low      3


Unnamed: 0,pazham,kilo,price
0,apple,high,1
1,orange,low,8
2,pine,high,11
3,apple,low,8
4,orange,high,4
5,pine,low,11


### split a text column into two separate columns

In [129]:
df = pd.DataFrame(["STD, City    State",
"33, Kolkata    West Bengal",
"44, Chennai    Tamil Nadu",
"40, Hyderabad    Telengana",
"80, Bangalore    Karnataka"], columns=['row'])
df

Unnamed: 0,row
0,"STD, City State"
1,"33, Kolkata West Bengal"
2,"44, Chennai Tamil Nadu"
3,"40, Hyderabad Telengana"
4,"80, Bangalore Karnataka"


0    [row    STD,  City    State\nName: 0,  dtype: ...
1    [row    33,  Kolkata    West Bengal\nName: 1, ...
2    [row    44,  Chennai    Tamil Nadu\nName: 2,  ...
3    [row    40,  Hyderabad    Telengana\nName: 3, ...
4    [row    80,  Bangalore    Karnataka\nName: 4, ...
dtype: object

In [133]:
## from machine learning plus
df_out = df.row.str.split(',|\t', expand=True)

# Make first row as header
new_header = df_out.iloc[0]
df_out = df_out[1:]
df_out.columns = new_header
print(df_out)

0 STD            City    State
1  33   Kolkata    West Bengal
2  44    Chennai    Tamil Nadu
3  40   Hyderabad    Telengana
4  80   Bangalore    Karnataka
