In [None]:
import pandas as pd
import numpy as np
import datetime
from dateutil.parser import parse

# Convert 'Mon YYYY' strings to the 4th day of that month.
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# An explicit format pins the missing day to the 1st. dateutil's parse() fills
# a missing day from today's date, so its result changed from run to run.
ser_ts = pd.to_datetime(ser, format='%b %Y')
print(ser_ts)

# Format year and month zero-padded and hard-code the day, instead of gluing
# unpadded pieces together and parsing the strings a second time.
ser_datestr = ser_ts.dt.strftime('%Y-%m-04')
ser_datestr.tolist()


0   2010-01-18
1   2011-02-18
2   2012-03-18
dtype: datetime64[ns]


['2010-01-04', '2011-02-04', '2012-03-04']

In [None]:
def myfunc(a, b):
    """Return ``a + b``."""
    total = a + b
    return total


# map() with two iterables calls myfunc with one element taken from each
left, right = (1, 2, 3), (4, 5, 6)
x = map(myfunc, left, right)
for i in x:
    print(i)

5
7
9


In [None]:
# Parse 'Mon YYYY' strings as the 4th day of that month.
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
# Prefix the day and parse the whole Series in one vectorized call; the explicit
# format removes any guessing about the order of the date fields.
ser_ts = pd.to_datetime('04 ' + ser, format='%d %b %Y')
print(ser_ts)

0   2010-01-04
1   2011-02-04
2   2012-03-04
dtype: datetime64[ns]


In [None]:
# Keep only the words that contain at least two vowels
from collections import Counter

ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])


def _vowel_count(word):
    """Return how many characters of ``word`` are vowels, ignoring case."""
    letter_counts = Counter(word.lower())
    # A Counter yields 0 for letters that never occurred
    return sum(letter_counts[vowel] for vowel in 'aeiou')


mask = ser.map(_vowel_count) >= 2
ser[mask]

0     Apple
1    Orange
4     Money
dtype: object

In [None]:
# Pull the first e-mail address out of each string; strings without one are skipped
import re

emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# re.search stops at the first hit, which is the only one that is kept
first_hits = (re.search(pattern, text) for text in emails)
[hit.group(0) for hit in first_hits if hit is not None]

['rameses@egypt.com', 'matt@t.co', 'narendra@modi.com']

In [None]:
# Mean weight per fruit.
# A seeded generator keeps the random fruit labels, and therefore the group
# means shown below, identical on every Restart & Run All.
rng = np.random.default_rng(42)
fruit = pd.Series(rng.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))
print(weights.tolist())
print(fruit.tolist())
# Each weight is grouped under the fruit label found at the same position
weights.groupby(fruit).mean()

[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]
['banana', 'apple', 'banana', 'apple', 'apple', 'carrot', 'apple', 'carrot', 'apple', 'banana']


apple     5.400000
banana    4.666667
carrot    7.000000
dtype: float64

In [None]:
# Euclidean distance between p and q
p = pd.Series(range(1, 11))
q = pd.Series(range(10, 0, -1))
# Square root of the summed squared element-wise differences
squared_diffs = (p - q).pow(2)
squared_diffs.sum() ** .5

18.16590212458495

In [None]:
 # Positions of the local maxima (peaks) of the series
 ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])
 # Sign of every step: +1 where the series rises, -1 where it falls
 slope_sign = np.sign(np.diff(ser))
 # A rise immediately followed by a fall shows up as a change of -2
 dd = np.diff(slope_sign)
 # +1 because position k in dd describes element k + 1 of ser
 np.flatnonzero(dd == -2) + 1

array([1, 5, 7])

In [None]:
# Fill the spaces in my_str with its least frequent character
my_str = 'dbc deb abed gade'
ser = pd.Series([ch for ch in my_str])
freq = ser.value_counts()
print(freq)
# value_counts() sorts by descending count, so the last label is the rarest one
least_freq = freq.dropna().tail(1).index[0]
"".join(least_freq if ch == ' ' else ch for ch in ser)

d    4
b    3
e    3
     3
a    2
g    1
c    1
dtype: int64


'dbccdebcabedcgade'

In [None]:
# Random integers from 1 to 9 indexed by ten consecutive Saturdays.
# The generator is seeded so the displayed values are the same on every run.
rng = np.random.default_rng(42)
saturdays = pd.date_range('2000-01-01', periods=10, freq='W-SAT')
weekly = pd.Series(rng.integers(1, 10, 10), index=saturdays)
weekly

2000-01-01    6
2000-01-08    5
2000-01-15    8
2000-01-22    9
2000-01-29    3
2000-02-05    5
2000-02-12    2
2000-02-19    6
2000-02-26    3
2000-03-04    2
Freq: W-SAT, dtype: int64

In [None]:
# Series on an irregularly spaced DatetimeIndex, with the last value missing
dates = pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08'])
ser = pd.Series([1, 10, 3, np.nan], index=dates)
ser

2000-01-01     1.0
2000-01-03    10.0
2000-01-06     3.0
2000-01-08     NaN
dtype: float64

In [None]:
# Autocorrelation of a noisy upward trend for lags 0 to 17.
# The noise comes from a seeded generator so the list below is reproducible.
rng = np.random.default_rng(42)
ser = pd.Series(np.arange(20) + rng.normal(1, 10, 20))
# round(float(...)) works whatever scalar type autocorr() hands back and
# yields plain Python floats for display.
autocorrelations = [round(float(ser.autocorr(lag)), 2) for lag in range(18)]
autocorrelations

[1.0,
 0.25,
 -0.05,
 -0.17,
 0.02,
 -0.03,
 0.36,
 0.24,
 -0.0,
 -0.15,
 0.01,
 0.2,
 -0.19,
 -0.03,
 0.22,
 -0.32,
 -0.37,
 0.7]

In [None]:
# Rows as strides: only the input series (the integers 0 through 14) is built here
pd.Series(data=range(0, 15))

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
dtype: int64

In [None]:
# Load the Cars93 data set (the variant with missing values) and list its columns
cars93_url = 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv'
df = pd.read_csv(cars93_url)
print(df.columns)

Index(['Manufacturer', 'Model', 'Type', 'Min.Price', 'Price', 'Max.Price',
       'MPG.city', 'MPG.highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev.per.mile', 'Man.trans.avail',
       'Fuel.tank.capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn.circle', 'Rear.seat.room', 'Luggage.room', 'Weight', 'Origin',
       'Make'],
      dtype='object')


In [None]:
# Rename the column Type to CarType and replace every '.' in the column names with '_'
df = (
    df
    .rename(columns={'Type': 'CarType'})
    .rename(columns=lambda name: name.replace('.', '_'))
)
print(df.columns)

Index(['Manufacturer', 'Model', 'CarType', 'Min_Price', 'Price', 'Max_Price',
       'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
       'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
       'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
       'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
       'Make'],
      dtype='object')


In [None]:
# Number of missing values in every column (isna is the same check as isnull)
df.isna().sum()

Manufacturer           4
Model                  1
CarType                3
Min_Price              7
Price                  2
Max_Price              5
MPG_city               9
MPG_highway            2
AirBags                6
DriveTrain             7
Cylinders              5
EngineSize             2
Horsepower             7
RPM                    3
Rev_per_mile           6
Man_trans_avail        5
Fuel_tank_capacity     8
Passengers             2
Length                 4
Wheelbase              1
Width                  6
Turn_circle            5
Rear_seat_room         4
Luggage_room          19
Weight                 7
Origin                 5
Make                   3
dtype: int64

In [None]:
# Count the missing values of each column with the frame-level reduction
n_missings_each_col = df.isna().sum()
n_missings_each_col

Manufacturer           4
Model                  1
CarType                3
Min_Price              7
Price                  2
Max_Price              5
MPG_city               9
MPG_highway            2
AirBags                6
DriveTrain             7
Cylinders              5
EngineSize             2
Horsepower             7
RPM                    3
Rev_per_mile           6
Man_trans_avail        5
Fuel_tank_capacity     8
Passengers             2
Length                 4
Wheelbase              1
Width                  6
Turn_circle            5
Rear_seat_room         4
Luggage_room          19
Weight                 7
Origin                 5
Make                   3
dtype: int64

In [None]:
# Integer position (not the label) of the column with the most missing values
int(n_missings_each_col.to_numpy().argmax())

23

In [None]:
# Impute missing Min_Price / Max_Price values with each column's mean.
# Only df_out receives the filled values. The previous chained assignment also
# wrote them back into df, which left no NaNs for the following cell's
# mean/median imputation to act on.
price_cols = ['Min_Price', 'Max_Price']
# fillna() with a Series matches the fill value to each column by label
df_out = df[price_cols].fillna(df[price_cols].mean())
df_out.head()

Unnamed: 0,Min_Price,Max_Price
0,12.9,18.8
1,29.2,38.7
2,25.9,32.3
3,17.118605,44.6
4,17.118605,21.459091


In [None]:
# Impute Min_Price with its mean and Max_Price with its median
d = {'Min_Price': np.nanmean, 'Max_Price': np.nanmedian}
price_cols = ['Min_Price', 'Max_Price']
# Evaluate each column's own statistic once, then fill column by column
fill_values = {col: d[col](df[col]) for col in price_cols}
df[price_cols] = df[price_cols].fillna(value=fill_values)
df[price_cols]

Unnamed: 0,Min_Price,Max_Price
0,12.900000,18.800000
1,29.200000,38.700000
2,25.900000,32.300000
3,17.118605,44.600000
4,17.118605,21.459091
...,...,...
88,16.600000,22.700000
89,17.600000,22.400000
90,22.900000,23.700000
91,21.800000,23.500000


In [None]:
# 4 x 5 frame holding the integers 0 to 19 row by row, columns named a to e
values = np.arange(20).reshape(4, 5)
df = pd.DataFrame(values, columns=['a', 'b', 'c', 'd', 'e'])
df

Unnamed: 0,a,b,c,d,e
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19


In [None]:
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))