In [1]:
#https://www.machinelearningplus.com/python/101-pandas-exercises-python/

#1. How to import pandas and check the version?

#Answer
import numpy as np  # optional
import pandas as pd
print(pd.__version__)
print(pd.show_versions(as_json=True))

1.4.4




{
  "system": {
    "commit": "ca60aab7340d9989d9428e11a51467658190bb6b",
    "python": "3.9.13.final.0",
    "python-bits": 64,
    "OS": "Windows",
    "OS-release": "10",
    "Version": "10.0.22621",
    "machine": "AMD64",
    "processor": "Intel64 Family 6 Model 154 Stepping 3, GenuineIntel",
    "byteorder": "little",
    "LC_ALL": null,
    "LANG": null,
    "LOCALE": {
      "language-code": "English_United States",
      "encoding": "936"
    }
  },
  "dependencies": {
    "pandas": "1.4.4",
    "numpy": "1.21.5",
    "pytz": "2022.1",
    "dateutil": "2.8.2",
    "setuptools": "63.4.1",
    "pip": "22.2.2",
    "Cython": "0.29.32",
    "pytest": "7.1.2",
    "hypothesis": null,
    "sphinx": "5.0.2",
    "blosc": null,
    "feather": null,
    "xlsxwriter": "3.0.3",
    "lxml.etree": "4.9.1",
    "html5lib": null,
    "pymysql": null,
    "psycopg2": null,
    "jinja2": "2.11.3",
    "IPython": "7.31.1",
    "pandas_datareader": null,
    "bs4": "4.11.1",
    "bottleneck": "1

In [13]:
# 2. How to create a series from a list, numpy array and dict?
# Create a pandas series from each of the items below: a list, numpy and a dictionary

# Input
import numpy as np
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

#Answer
ser1 = pd.Series(mylist)
ser2 = pd.Series(myarr)
ser3 = pd.Series(mydict)

df = ser1.to_frame()
df

Unnamed: 0,0
0,a
1,b
2,c
3,e
4,d
5,f
6,g
7,h
8,i
9,j


In [11]:
# 3. How to convert the index of a series into a column of a dataframe?

# Input
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)

# Solution
df = ser.to_frame().reset_index()
df

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


In [14]:
# 4. How to combine many series to form a dataframe?

# Input
import numpy as np
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

# Solution 1
df = pd.concat([ser1, ser2], axis=1)
df

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4
5,f,5
6,g,6
7,h,7
8,i,8
9,j,9


In [15]:
# 5. How to assign name to the series’ index?

# Input
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))

# Solution
ser.name = 'alphabets'
ser.head()

0    a
1    b
2    c
3    e
4    d
Name: alphabets, dtype: object

In [17]:
# 6. How to get the items of series A not present in series B?

# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser1[~ser1.isin(ser2)]

3    4
4    5
dtype: int64

In [18]:
# 7. How to get the items not common to both series A and series B?

# Input
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution
ser_u = pd.Series(np.union1d(ser1, ser2))  # union
ser_i = pd.Series(np.intersect1d(ser1, ser2))  # intersect
ser_u[~ser_u.isin(ser_i)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

In [19]:
# 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?

# Input
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10, 5, 25))

# Solution
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 1.25117263,  7.70986507, 10.92259345, 13.36360403, 18.0949083 ])

In [20]:
# 9. How to get frequency counts of unique items of a series?

# Input
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))

# Solution
ser.value_counts()

f    5
d    5
b    4
h    4
c    4
a    3
e    3
g    2
dtype: int64

In [21]:
# 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?

# Input
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))

# Solution
print("Top 2 Freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'
ser


Top 2 Freq: 3    4
1    4
2    3
4    1
dtype: int64


0     Other
1     Other
2     Other
3         3
4         1
5         1
6     Other
7         1
8         3
9         1
10        3
11        3
dtype: object

In [22]:
# 11. How to bin a numeric series to 10 groups of equal size?

# Input
ser = pd.Series(np.random.random(20))
print(ser.head())

# Solution
pd.qcut(ser, q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
        labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']).head()

0    0.724626
1    0.783831
2    0.434524
3    0.420961
4    0.091998
dtype: float64


0    7th
1    8th
2    4th
3    3rd
4    1st
dtype: category
Categories (10, object): ['1st' < '2nd' < '3rd' < '4th' ... '7th' < '8th' < '9th' < '10th']

In [24]:
# 12. How to convert a numpy array to a dataframe of given shape?

# Input
ser = pd.Series(np.random.randint(1, 10, 35))

# Solution
df = pd.DataFrame(ser.values.reshape(7,5))

df

Unnamed: 0,0,1,2,3,4
0,6,7,4,7,6
1,6,1,2,2,6
2,9,8,7,7,4
3,7,3,5,8,8
4,9,7,9,4,2
5,7,7,5,7,6
6,3,2,6,9,9


In [28]:
# 13. How to find the positions of numbers that are multiples of 3 from a series?

# Input
ser = pd.Series(np.random.randint(1, 10, 7))

# Solution
print(ser)
np.argwhere(ser % 3==0)

0    7
1    7
2    5
3    5
4    7
5    5
6    9
dtype: int32


ValueError: Length of values (1) does not match length of index (7)

In [29]:
# 14. How to extract items at given positions from a series

# Input
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]

# Solution
ser.take(pos)

0     a
4     e
8     i
14    o
20    u
dtype: object

In [31]:
# 15. How to stack two series vertically and horizontally ?

# Input
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Output
# Vertical
ser1.append(ser2)

# Horizontal
df = pd.concat([ser1, ser2], axis=1)
print(df)

   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


  ser1.append(ser2)


In [32]:
# 16. How to get the positions of items of series A in another series B?

# Input
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

# Solution 1
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

# Solution 2
[pd.Index(ser1).get_loc(i) for i in ser2]

[5, 4, 0, 8]

In [33]:
# 17. How to compute the mean squared error on a truth and predicted series?

# Input
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

# Solution
np.mean((truth-pred)**2)

0.385337732823815

In [34]:
# 18. How to convert the first character of each element in a series to uppercase?

# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution 1
ser.map(lambda x: x.title())

# Solution 2
ser.map(lambda x: x[0].upper() + x[1:])

# Solution 3
pd.Series([i.title() for i in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [37]:
# 19. How to calculate the number of characters in each word in a series?

# Input
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

In [38]:
# 20. How to compute difference of differences between consequtive numbers of a series?

# Input
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

# Solution
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


In [39]:
# 21. How to convert a series of date-strings to a timeseries?

# Input
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution 1
from dateutil.parser import parse
ser.map(lambda x: parse(x))

# Solution 2
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]