In [None]:
import pandas as pd
import numpy as np
print(pd.show_versions(as_json=True))

In [None]:
#01 create pandas series from list, numpy array and dictionary
mylist = list('abcdefjklmnoeqeustlstxzy')
myarr = np.arange(28)
mydict = dict(zip(mylist, myarr))

ser1 = pd.Series(mylist)
print(ser1.head())
ser2 = pd.Series(myarr)
print(ser2)
ser3 = pd.Series(mydict)
print(ser3.head())

In [None]:
#02 convert the index of a series into a column of a dateframe
mylist2 = list('abcdefjklmnoeqeustlstxzyxxxxccc')
myarr2 = np.arange(28)
mydict2 = dict(zip(mylist2, myarr2))
ser = pd.Series(mydict2)

df2 = ser.to_frame().reset_index()
print(df2.head())

In [None]:
#03 combine many series to form a dataframe
# method 1
ser1 = pd.Series(list('abcdefjklmnoeqeustlstxzyxxxxccc'))
ser2 = pd.Series(np.arange(28))
df3 = pd.concat([ser1, ser2], axis=1)
print(df3.head())
# method 2
df33 = pd.DataFrame({'col1':ser1,'col2':ser2})
print(df33.head())

In [None]:
#04 assign name to the series index
ser3 = pd.Series(list('abcdefjklmnoeqeustlstxzyxxxxccc'))
ser3.name = 'alphabets'
ser3.head()

In [None]:
#05 get the items of series A not present in series B
serA = pd.Series([1,2,3,4,5])
serB = pd.Series([4,5,6,7,8])
print(serA[~serA.isin(serB)])
print(serB[~serB.isin(serA)])

In [None]:
#06 get the items not common to both series A and series B
serA = pd.Series([1,2,3,4,5])
serB = pd.Series([4,5,6,7,8])

ser_u = pd.Series(np.union1d(serA, serB))     # Union
ser_i = pd.Series(np.intersect1d(serA, serB)) #Intersect 
ser_u[~ser_u.isin(ser_i)]

In [None]:
#07 get the minimum, 25th percentile, median, 75th and max of a numeric series
state = np.random.RandomState(100)
ser = pd.Series(state.normal(10,5,25))

np.percentile(ser, q=[0,25, 50, 75,100])

In [None]:
#08 get the frequency counts of unique items of a series
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8,size=30)))
ser.value_counts()

In [None]:
#09 keep only top 2 most frequent values as it is and replace everything else as 'other'
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1,5,[12]))
print(ser)
print(ser.value_counts)
print("Top 2 Freq:", ser.value_counts())
ser[~ser.isin(ser.value_counts().index[:2])] = 'other'
ser

In [None]:
#10 bin a numeric series to 10 groups of equal size
ser = pd.Series(np.random.random(20))
print(ser.head())
pd.qcut(ser, q=[0,0.10,0.20,0.30,0.40,0.50,0.60,0.70,0.80,0.90,1],
        labels=['1st','2nd','3rd','4th','5th','6th','7th','8th','9th','10th']).head()

In [None]:
#11 convert a numpy array to a dataframe of given shape> reshape a series into a dataframe with 7 rows and 5 columns
ser = pd.Series(np.random.randint(1,10,35))
df = pd.DataFrame(ser.values.reshape(7,5))
print(df)

In [None]:
#12 find the positions of numbers that are multiples of 3 from a series
ser11 = pd.Series(np.random.randint(1,10,7))
ser11
print(ser11)
np.argwhere(ser11 %3==0)

In [34]:
#13 extract items at given positions from a series 
ser12 = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 6]
ser.take(pos)

0    8
4    8
6    6
dtype: int32

In [36]:
#14 stack two series vertically and horizontally
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
print(ser1.append(ser2))              # vertically
df = pd.concat([ser1,ser2],axis=1)    # horizontally
print(df)

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object
   0  1
0  0  a
1  1  b
2  2  c
3  3  d
4  4  e


In [38]:
#15 get the positions of items of series A in another series B
ser1 = pd.Series([10,9,6,5,3,1,12,8,13])
ser2 = pd.Series([1,3,10,13])
# method 1
print([np.where(i==ser1)[0].tolist()[0] for i in ser2])
# mehtod 2
print([pd.Index(ser1).get_loc(i) for i in ser2])

[5, 4, 0, 8]
[5, 4, 0, 8]


In [39]:
#16 compute the mean squared error on a truth and predicted series
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)
np.mean((truth - pred)**2)

0.29721678126088025

In [42]:
#17 convert the first character of each element in a series to upercase
ser = pd.Series(['im','the','best','of','best','in','the','whole','damn','usa'])
# method 1
ser.map(lambda x: x.title())
# method 2
ser.map(lambda x: x[0].upper() + x[1:])
# method 3
pd.Series([i.title() for i in ser])

0       Im
1      The
2     Best
3       Of
4     Best
5       In
6      The
7    Whole
8     Damn
9      Usa
dtype: object

In [43]:
#18 calcualte the number of characters in each word in a series
ser = pd.Series(['im','the','best','of','best','in','the','whole','damn','usa'])
ser.map(lambda x: len(x))

0    2
1    3
2    4
3    2
4    4
5    2
6    3
7    5
8    4
9    3
dtype: int64

In [44]:
#19 compute difference of differences between consequtive numbers of a series
ser = pd.Series([1,3,6,10,15,21,27,35])
print(ser.diff().tolist())
print(ser.diff().diff().tolist())

[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
[nan, nan, 1.0, 1.0, 1.0, 1.0, 0.0, 2.0]


In [47]:
#20 convert a series of date strings to a timeseries 
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
# method 1
print(pd.to_datetime(ser))
# method2 
from dateutil.parser import parse
ser.map(lambda x: parse(x))

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]


0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

In [53]:
#21 get the day of month, week number, day of year and day of week from a series of date strings
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

from dateutil.parser import parse
ser_ts = ser.map(lambda x: parse(x))
# day of month
print("Date:", ser_ts.dt.day.tolist())
# week number
print("Week number:", ser_ts.dt.weekofyear.tolist())
# day of year
print("Day number of year:", ser_ts.dt.dayofyear.tolist())
# day of week
print("Day of week:", ser_ts.dt.dayofweek.tolist())

Date: [1, 2, 3, 4, 5, 6]
Week number: [53, 5, 9, 14, 19, 23]
Day number of year: [1, 33, 63, 94, 125, 157]
Day of week: [4, 2, 5, 3, 0, 5]


  print("Week number:", ser_ts.dt.weekofyear.tolist())


In [58]:
#22 convert year-month string to dates corresponding to the 4th day of the month
ser = pd.Series(['Jan 2010','Feb 2011','Mar 2012'])
# method 1
from dateutil.parser import parse
ser.map(lambda x:parse('04'+ x))
# method 2
ser_ts = ser.map(lambda x:parse(x))
# construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
# format it
[parse(i).strftime('%Y-%m-%d') for i in ser_datestr]

['2010-01-04', '2011-02-04', '2012-03-04']

In [60]:
#23 filter words that contain atleast 2 vowels from a series 
ser = pd.Series(['Apple','Orange','Plan','Python','Money','Women'])
from collections import Counter
mask = ser.map(lambda x: sum([Counter(x.lower()).get(i,0) for i in list('aeiou')]) >=2)
ser[mask]

0     Apple
1    Orange
4     Money
5     Women
dtype: object

In [61]:
#24 filter valid emails from a series 
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])

# method 1 as series of strings
import re
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}' # {2,4} meaning?
mask = emails.map(lambda x: bool(re.match(pattern, x)))
emails[mask]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

In [62]:
# method 2 as series of list
emails.str.findall(pattern,flags=re.IGNORECASE)

0                     []
1    [rameses@egypt.com]
2            [matt@t.co]
3    [narendra@modi.com]
dtype: object

In [63]:
# method 3 as list
[x[0] for x in [re.findall(pattern, email) for email in emails] if len(x)>0]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

In [65]:
#25 get the mean of a series grouped by another series> compute the mean of weights of each fruit
fruit = pd.Series(np.random.choice(['apple','banana','carrot'],10))
weight = pd.Series(np.linspace(1,10,10))
print(weight.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'carrot', 'carrot', 'apple', 'apple', 'apple', 'apple', 'banana', 'banana', 'apple']


In [66]:
weight.groupby(fruit).mean()

apple     6.4
banana    6.0
carrot    2.5
dtype: float64