In [1]:
import pandas as pd
import numpy as np

This is a numpy exercise. They come from the [Machine Learning Plus](https://www.machinelearningplus.com/python/101-pandas-exercises-python/)
<br> There is 75 question, so I will paste a question, and do the exercise.

#### 1.How to import pandas and check the version?

In [2]:
pd.__version__

'0.25.2'

#### 2. Create a pandas series from each of the items below: a list, numpy and a dictionary

In [3]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))

In [4]:
pd.Series(mylist)[:3]

0    a
1    b
2    c
dtype: object

In [5]:
pd.Series(myarr)[:3]

0    0
1    1
2    2
dtype: int32

In [6]:
pd.Series(mydict)[:3]

a    0
b    1
c    2
dtype: int64

#### 3.Convert the series ser into a dataframe with its index as another column on the dataframe.

In [7]:
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
ser = pd.Series(mydict)
ser[:5]

a    0
b    1
c    2
e    3
d    4
dtype: int64

In [8]:
ser.to_frame().reset_index().head()

Unnamed: 0,index,0
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


#### 4.Combine ser1 and ser2 to form a dataframe.

In [9]:
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))

In [10]:
pd.DataFrame({'ser1':ser1,'ser2': ser2}).head()

Unnamed: 0,ser1,ser2
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


In [11]:
pd.concat([ser1, ser2], axis=1).head()

Unnamed: 0,0,1
0,a,0
1,b,1
2,c,2
3,e,3
4,d,4


#### 5. Give a name to the series ser calling it ‘alphabets’.

In [12]:
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser[:3]

0    a
1    b
2    c
dtype: object

In [13]:
ser.name = 'alphabet'

In [14]:
ser[:3]

0    a
1    b
2    c
Name: alphabet, dtype: object

#### 6. From ser1 remove items present in ser2.

In [15]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [16]:
ser1[~ser1.isin(ser2)]

0    1
1    2
2    3
dtype: int64

#### 7. Get all items of ser1 and ser2 not common to both.

In [17]:
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

In [18]:
intersect = pd.Series(np.intersect1d(ser1, ser2)) 
intersect

0    4
1    5
dtype: int64

In [19]:
union = pd.Series(np.union1d(ser1, ser2)) 
union

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
dtype: int64

In [20]:
union[~union.isin(intersect)]

0    1
1    2
2    3
5    6
6    7
7    8
dtype: int64

#### 8. Compute the minimum, 25th percentile, median, 75th, and maximum of ser.

In [21]:
ser = pd.Series(np.random.normal(10, 5, 25))
ser[:3]

0    12.759384
1     9.928015
2     7.588495
dtype: float64

In [22]:
ser.min()

3.3704335396842398

In [23]:
ser.quantile(.25)

7.184664348818881

In [24]:
ser.mean()

10.197098289513848

In [25]:
ser.quantile(.75)

12.759384325255114

In [26]:
ser.max()

21.160884273145214

In [27]:
np.percentile(ser, q=[0, 25, 50, 75, 100])

array([ 3.37043354,  7.18466435,  9.49151034, 12.75938433, 21.16088427])

#### 9. Calculte the frequency counts of each unique value ser.

In [28]:
ser = pd.Series(np.take(list('abcdefgh'), np.random.randint(8, size=30)))
ser[:3]

0    c
1    f
2    c
dtype: object

In [29]:
import collections

In [30]:
collections.Counter(ser)

Counter({'c': 6, 'f': 6, 'h': 3, 'g': 5, 'e': 4, 'b': 1, 'd': 2, 'a': 3})

In [31]:
ser.value_counts()

c    6
f    6
g    5
e    4
h    3
a    3
d    2
b    1
dtype: int64

#### 10. From ser, keep the top 2 most frequent items as it is and replace everything else as ‘Other’.

In [32]:
np.random.RandomState(100)
ser = pd.Series(np.random.randint(1, 5, [12]))
ser

0     4
1     3
2     1
3     2
4     2
5     1
6     1
7     3
8     4
9     3
10    3
11    2
dtype: int32

In [33]:
ser.value_counts()

3    4
2    3
1    3
4    2
dtype: int64

In [34]:
~ser.isin(ser.value_counts().index[:2])

0      True
1     False
2      True
3     False
4     False
5      True
6      True
7     False
8      True
9     False
10    False
11    False
dtype: bool

In [35]:
ser[~ser.isin(ser.value_counts().index[:2])] = 'Other'

In [36]:
ser

0     Other
1         3
2     Other
3         2
4         2
5     Other
6     Other
7         3
8     Other
9         3
10        3
11        2
dtype: object

#### 11. Bin the series ser into 10 equal deciles and replace the values with the bin name.

In [37]:
ser = pd.Series(np.random.random(20))
ser[:3]

0    0.993302
1    0.599823
2    0.931294
dtype: float64

In [38]:
labels=['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']

In [39]:
pd.cut(ser,bins = 10, labels=labels)

0     10th
1      7th
2     10th
3      1st
4      3rd
5      2nd
6     10th
7      5th
8      4th
9      1st
10     3rd
11    10th
12     8th
13     5th
14     6th
15     1st
16     5th
17     9th
18    10th
19     4th
dtype: category
Categories (10, object): [1st < 2nd < 3rd < 4th ... 7th < 8th < 9th < 10th]

#### 12. Reshape the series ser into a dataframe with 7 rows and 5 columns

In [40]:
ser = pd.Series(np.random.randint(1, 10, 35))
ser[:3]

0    4
1    8
2    9
dtype: int32

In [41]:
pd.DataFrame(ser.values.reshape(7,5))

Unnamed: 0,0,1,2,3,4
0,4,8,9,8,3
1,1,4,4,7,5
2,4,9,7,6,2
3,7,5,7,5,6
4,7,9,1,4,3
5,3,1,9,2,2
6,9,1,8,2,7


#### 13. Find the positions of numbers that are multiples of 3 from ser.

In [42]:
ser = pd.Series(np.random.randint(1, 10, 7))
ser

0    9
1    5
2    8
3    6
4    3
5    6
6    7
dtype: int32

In [43]:
ser % 3 == 0

0     True
1    False
2    False
3     True
4     True
5     True
6    False
dtype: bool

In [44]:
ser[ser % 3 == 0].index

Int64Index([0, 3, 4, 5], dtype='int64')

#### 14. From ser, extract the items at positions in list pos.

In [45]:
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
ser[:5]

0    a
1    b
2    c
3    d
4    e
dtype: object

#### 15.Stack ser1 and ser2 vertically and horizontally (to form a dataframe).

In [46]:
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

In [47]:
ser1

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [48]:
ser2

0    a
1    b
2    c
3    d
4    e
dtype: object

In [49]:
pd.concat([ser1,ser2])

0    0
1    1
2    2
3    3
4    4
0    a
1    b
2    c
3    d
4    e
dtype: object

In [50]:
pd.concat([ser1,ser2], axis=1)

Unnamed: 0,0,1
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


#### 16. Get the positions of items of ser2 in ser1 as a list.

In [51]:
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])

In [52]:
ser1.isin(ser2)

0     True
1    False
2    False
3    False
4     True
5     True
6    False
7    False
8     True
dtype: bool

In [53]:
ser1[ser1.isin(ser2)].index

Int64Index([0, 4, 5, 8], dtype='int64')

In [54]:
[np.where(i == ser1)[0].tolist()[0] for i in ser2]

[5, 4, 0, 8]

#### 17. Compute the mean squared error of truth and pred series.

In [55]:
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

In [56]:
np.mean((truth - pred)**2)

0.2103176613983775

#### 18. Change the first character of each word to upper case in each word of ser.

In [57]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

0     how
1      to
2    kick
3    ass?
dtype: object

In [58]:
ser.str.capitalize()

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [59]:
ser.map(lambda x: x.title())

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [60]:
ser.map(lambda x: x[0].upper() + x[1:])

0     How
1      To
2    Kick
3    Ass?
dtype: object

In [61]:
pd.Series([i.title() for i in ser])

0     How
1      To
2    Kick
3    Ass?
dtype: object

#### 19. Calculate the number of characters in each word in a series

In [62]:
ser = pd.Series(['how', 'to', 'kick', 'ass?'])
ser

0     how
1      to
2    kick
3    ass?
dtype: object

In [63]:
ser.str.len()

0    3
1    2
2    4
3    4
dtype: int64

In [64]:
ser.map(lambda x: len(x))

0    3
1    2
2    4
3    4
dtype: int64

#### 20.Difference of differences between the consequtive numbers of ser.

In [65]:
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])

In [66]:
ser.diff()

0    NaN
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    6.0
7    8.0
dtype: float64

In [67]:
ser.diff().diff()

0    NaN
1    NaN
2    1.0
3    1.0
4    1.0
5    1.0
6    0.0
7    2.0
dtype: float64

#### 21. Convert a series of date-strings to a timeseries

In [68]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser

0         01 Jan 2010
1          02-02-2011
2            20120303
3          2013/04/04
4          2014-05-05
5    2015-06-06T12:20
dtype: object

In [69]:
pd.to_datetime(ser)

0   2010-01-01 00:00:00
1   2011-02-02 00:00:00
2   2012-03-03 00:00:00
3   2013-04-04 00:00:00
4   2014-05-05 00:00:00
5   2015-06-06 12:20:00
dtype: datetime64[ns]

#### 22. Get the day of month, week number, day of year and day of week from ser.

In [70]:
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])
ser

0         01 Jan 2010
1          02-02-2011
2            20120303
3          2013/04/04
4          2014-05-05
5    2015-06-06T12:20
dtype: object

In [71]:
pd.DatetimeIndex(ser).month

Int64Index([1, 2, 3, 4, 5, 6], dtype='int64')

In [72]:
pd.DatetimeIndex(ser).week

Int64Index([53, 5, 9, 14, 19, 23], dtype='int64')

In [73]:
pd.DatetimeIndex(ser).day_name()

Index(['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Monday', 'Saturday'], dtype='object')

In [74]:
pd.DatetimeIndex(ser).dayofyear

Int64Index([1, 33, 63, 94, 125, 157], dtype='int64')

#### 23. Change ser to dates that start with 4th of the respective months.

In [75]:
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
ser

0    Jan 2010
1    Feb 2011
2    Mar 2012
dtype: object

In [76]:
from dateutil.parser import parse
ser.map(lambda x: parse('04 ' + x))

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]

#### 24. From ser, extract words that contain atleast 2 vowels.

In [77]:
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
ser

0     Apple
1    Orange
2      Plan
3    Python
4     Money
dtype: object

In [78]:
vovels = 'aeiou'

In [79]:
collections.Counter(ser)

Counter({'Apple': 1, 'Orange': 1, 'Plan': 1, 'Python': 1, 'Money': 1})

In [80]:
ser.str.count(r'[aeiou]')

0    1
1    2
2    1
3    1
4    2
dtype: int64

In [81]:
ser[ser.str.lower().str.count(r'[aeiouy]') >= 2]

0     Apple
1    Orange
3    Python
4     Money
dtype: object

In [82]:
mask = ser.map(lambda x: sum([collections.Counter(x.lower()).get(i, 0) for i in list('aeiouy')]) >= 2)
ser[mask]

0     Apple
1    Orange
3    Python
4     Money
dtype: object

#### 25. Extract the valid emails from the series emails. The regex pattern for valid emails is provided as reference.

In [83]:
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

In [84]:
emails.str.contains(pattern)

0    False
1     True
2     True
3     True
dtype: bool

In [85]:
emails[emails.str.contains(pattern)]

1    rameses@egypt.com
2            matt@t.co
3    narendra@modi.com
dtype: object

#### 26. Compute the mean of weights of each fruit.

In [86]:
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['carrot', 'apple', 'carrot', 'carrot', 'apple', 'apple', 'carrot', 'apple', 'banana', 'carrot']


In [87]:
fruits = pd.concat([fruit, weights], axis=1)
fruits

Unnamed: 0,0,1
0,carrot,1.0
1,apple,2.0
2,carrot,3.0
3,carrot,4.0
4,apple,5.0
5,apple,6.0
6,carrot,7.0
7,apple,8.0
8,banana,9.0
9,carrot,10.0


In [88]:
fruits.groupby(fruits.columns[0]).mean()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
apple,5.25
banana,9.0
carrot,5.0


In [89]:
weights.groupby(fruit).mean()

apple     5.25
banana    9.00
carrot    5.00
dtype: float64