#### 21. How to convert a series of date-strings to a timeseries?

In [None]:
# One date per element, each written in a different textual layout.
ser = pd.Series([
    '01 Jan 2010',       # day, abbreviated month name, year
    '02-02-2011',        # dash-separated, all numeric
    '20120303',          # compact digits, no separators
    '2013/04/04',        # slash-separated, year first
    '2014-05-05',        # dash-separated, year first
    '2015-06-06T12:20',  # date and time joined by "T"
])

In [None]:
# Input: six dates, each written in a different textual layout
ser = pd.Series([
    '01 Jan 2010', '02-02-2011', '20120303',
    '2013/04/04', '2014-05-05', '2015-06-06T12:20',
])

# Solution 1: let dateutil work out the layout of every element on its own
from dateutil.parser import parse
ser.map(parse)

# Solution 2: pandas >= 2.0 infers ONE format from the first element and
# raises on any element that does not follow it, so the per-element
# inference has to be requested explicitly for mixed-layout input.
pd.to_datetime(ser, format='mixed')

#### 22. How to get the day of month, week number, day of year and day of week from a series of date strings?

In [None]:
# Input: six dates, each written in a different textual layout
ser = pd.Series([
    '01 Jan 2010', '02-02-2011', '20120303',
    '2013/04/04', '2014-05-05', '2015-06-06T12:20',
])

# Solution
from dateutil.parser import parse
# Parse element by element, then make sure the result has a datetime dtype
# so that the `.dt` accessor below is available.
ser_ts = pd.to_datetime(ser.map(parse))

# day of month
print("Date: ", ser_ts.dt.day.tolist())
# week number (ISO 8601); `Series.dt.weekofyear` was removed in pandas 2.0
print("Week number: ", ser_ts.dt.isocalendar().week.tolist())
# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())
# day of week; `Series.dt.weekday_name` was removed in pandas 1.0
print("Day of week: ", ser_ts.dt.day_name().tolist())

#### 23. How to convert year-month string to dates corresponding to the 4th day of the month?

In [None]:
# Input
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])

# Solution 1
from dateutil.parser import parse
# Parse the date. The input carries no day-of-month, so whatever day the
# parser assigns is ignored below; only year and month are used.
ser_ts = ser.map(parse)
# Construct date string with date as 4.
# The parentheses let the expression span several lines; a bare trailing
# `+` at the end of a line is a syntax error.
ser_datestr = (
    ser_ts.dt.year.astype('str') + '-'
    + ser_ts.dt.month.astype('str') + '-'
    + '04'
)
# Format it.
[parse(i).strftime('%Y-%m-%d') for i in ser_datestr]

# Solution 2: put the day in front of the string before parsing
ser.map(lambda x: parse('04 ' + x))

#### 24. How to filter words that contain at least 2 vowels from a series?

In [None]:
# Input
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])

# Solution
from collections import Counter
# Tally the letters of the lower-cased word, add up the tallies of the five
# vowels (a Counter reports 0 for letters it has not seen) and keep the
# words whose total reaches two.
mask = ser.map(
    lambda word: sum(Counter(word.lower())[vowel] for vowel in 'aeiou') >= 2
)
ser[mask]

#### 25. How to filter valid emails from a series?

In [None]:
# Input
emails = pd.Series([
    'buying books at amazom.com',
    'rameses@egypt.com',
    'matt@t.co',
    'narendra@modi.com',
])

import re
# local part, "@", domain, a literal dot, then a 2-4 letter suffix
pattern ='[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'

# Solution 1 (as series of strings): keep the entries where the pattern
# matches starting at the first character
mask = emails.map(lambda text: re.match(pattern, text) is not None)
emails[mask]

# Solution 2 (as series of list): every hit found anywhere in each entry
emails.str.findall(pattern, flags=re.IGNORECASE)

# Solution 3 (as list): the first hit of every entry that has one
[hits[0] for hits in (re.findall(pattern, email) for email in emails) if hits]

#### 26. How to get the mean of a series grouped by another series?

In [None]:
# Input: ten randomly drawn labels and ten evenly spaced weights from 1 to 10
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], size=10))
weights = pd.Series(np.linspace(1, 10, num=10))

# Solution: use the label series as the grouping key for the weights
weights.groupby(by=fruit).mean()

#### 27. How to compute the euclidean distance between two series?

In [None]:
# Input: 1..10 ascending and 10..1 descending
p = pd.Series(range(1, 11))
q = pd.Series(range(10, 0, -1))

# Solution: square root of the summed squared differences
((p - q) ** 2).sum() ** 0.5

# Solution (using func): the same quantity as the 2-norm of the difference
np.linalg.norm(p - q)

#### 28. How to find all the local maxima (or peaks) in a numeric series?

In [None]:
# Input
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution
# Sign of each step between neighbours: +1 rising, -1 falling, 0 flat.
step_sign = np.sign(np.diff(ser))
# A rise followed directly by a fall changes the sign from +1 to -1, which
# shows up as -2 in the difference of the signs.
dd = np.diff(step_sign)
# Entry k of `dd` compares the steps into and out of element k + 1, hence
# the shift by one to get positions in `ser`.
peak_locs = np.where(dd == -2)[0]
peak_locs = peak_locs + 1
peak_locs

#### 29. How to replace missing spaces in a string with the least frequent character?

In [None]:
# Input
my_str = 'dbc deb abed gade'

# Solution
# One element per character, built from the input variable rather than a
# second copy of the literal, so changing `my_str` changes the result.
ser = pd.Series(list(my_str))
# `value_counts` orders the counts from most to least frequent, so the last
# label is a least frequent character. Counts are never missing, so no
# `dropna` is needed.
freq = ser.value_counts()
print(freq)
least_freq = freq.index[-1]
"".join(ser.replace(' ', least_freq))

#### 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (Saturdays) after that having random numbers as values?

In [None]:
# Solution
# Ten random integers drawn from 1..9, indexed by ten consecutive
# weekly-on-Saturday dates beginning at 2000-01-01.
ser = pd.Series(
    data=np.random.randint(1, 10, 10),
    index=pd.date_range(start='2000-01-01', periods=10, freq='W-SAT'),
)
ser

#### 31. How to fill an intermittent time series so all missing dates show up with values of previous non-missing date?

In [None]:
# Input: four observations on non-consecutive days, the last one missing
ser = pd.Series(
    [1, 10, 3, np.nan],
    index=pd.to_datetime(['2000-01-01', '2000-01-03', '2000-01-06', '2000-01-08']),
)

# Solution
# Resample to one row per calendar day and carry the previous value forward.
ser.resample(rule='D').ffill()

# Alternatives
# Pull the next value backward instead.
ser.resample(rule='D').bfill()
# Next value first, then the previous value for whatever is still empty.
ser.resample(rule='D').bfill().ffill()

#### 32. How to compute the autocorrelations of a numeric series?

In [None]:
# Input: an upward trend 0..19 with normally distributed noise added
ser = pd.Series(np.arange(20) + np.random.normal(1, 10, 20))

# Solution
# Autocorrelation at lags 0 through 10, each rounded to two decimals.
autocorrelations = list(map(lambda lag: ser.autocorr(lag).round(2), range(11)))
# Lag 0 is left out of the report.
print(autocorrelations[1:])
# The slice starts at lag 1, so position k in it is lag k + 1.
print('Lag having highest correlation: ', np.argmax(np.abs(autocorrelations[1:]))+1)

#### 33. How to import only every nth row from a csv file to create a dataframe?

In [None]:
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
url = 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'

# Solution 1: Use chunks and for-loop
# Read 50 rows at a time and keep the first row of every chunk.
# `DataFrame.append` was removed in pandas 2.0, so the rows are collected in
# a list and the frame is built once at the end.
df = pd.read_csv(url, chunksize=50)
rows = []
for chunk in df:
    rows.append(chunk.iloc[0, :])
df2 = pd.DataFrame(rows)

# Solution 2: Use chunks and list comprehension
# Each first row becomes a column of the concatenated frame, hence the
# transpose back to one row per chunk.
df = pd.read_csv(url, chunksize=50)
df2 = pd.concat([chunk.iloc[0] for chunk in df], axis=1)
df2 = df2.transpose()

# Solution 3: Use csv reader
# Expects a local copy of the file in the working directory.
# Line 0 is the header, so `out[0]` holds the column names and the remaining
# entries are file lines 50, 100, ...
import csv
with open('BostonHousing.csv', 'r', newline='') as f:
    out = [row for i, row in enumerate(csv.reader(f)) if i % 50 == 0]
df2 = pd.DataFrame(out[1:], columns=out[0])
print(df2.head())

#### 34. How to change column values when importing csv to a dataframe?

In [None]:
# Solution 1: Using converter parameter
# The converter receives each raw `medv` cell as a string and replaces it
# with a label while the file is being read.
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
    converters={'medv': lambda x: 'High' if float(x) > 25 else 'Low'},
)

# Solution 2: Using csv reader
# Expects a local copy of the file in the working directory.
import csv
with open('BostonHousing.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    out = []
    for i, row in enumerate(reader):
        # Line 0 is the header and is kept unchanged; in every other line
        # the value in column 13 is replaced by its label.
        if i > 0:
            row[13] = 'High' if float(row[13]) > 25 else 'Low'
        out.append(row)
df = pd.DataFrame(out[1:], columns=out[0])
print(df.head())

#### 35. How to create a dataframe with rows as strides from a given series?

In [None]:
L = pd.Series(range(15))


def gen_strides(a, stride_len=5, window_len=5):
    """Return the sliding windows of ``a`` as the rows of a 2-D array.

    Parameters
    ----------
    a : array-like
        One-dimensional input (a Series, list or array).
    stride_len : int
        Distance between the starting positions of consecutive windows.
    window_len : int
        Number of elements in every window.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_strides, window_len)``. Only complete windows
        are produced; when the input is shorter than one window the result
        has zero rows.
    """
    # Work on plain values so that slicing is by position whatever index
    # labels the input carries.
    values = np.asarray(a)
    n_strides = ((values.size - window_len) // stride_len) + 1
    if n_strides <= 0:
        # A negative count used as a slice bound would keep "all but the
        # last few" starts and yield truncated windows.
        return np.empty((0, window_len), dtype=values.dtype)
    starts = np.arange(0, values.size, stride_len)[:n_strides]
    return np.array([values[s:(s + window_len)] for s in starts])


gen_strides(L, stride_len=2, window_len=4)

#### 36. How to import only specified columns from a csv file?

In [None]:
# Read only the two named columns; every other column in the file is skipped.
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
    usecols=['crim', 'medv'],
)
print(df.head())

#### 37. How to get the nrows, ncolumns, datatype, summary stats of each column of a dataframe? Also get the array and list equivalent.

In [None]:
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# number of rows and columns
print(df.shape)

# datatypes
print(df.dtypes)

# how many columns under each dtype
# (`DataFrame.get_dtype_counts` was removed in pandas 1.0; counting the
# values of `dtypes` gives the same table)
print(df.dtypes.value_counts())

# summary statistics
df_stats = df.describe()

# numpy array
df_arr = df.values

# list
df_list = df.values.tolist()

#### 38. How to extract the row and column number of a particular cell with given criterion?

In [None]:
# Input
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
# Get Manufacturer with highest price
df.loc[df.Price == np.max(df.Price), ['Manufacturer', 'Model', 'Type']]

# Get Row and Column number
# `np.where` on the 2-D value array returns one array of row positions and
# one of column positions for the matching cells.
row, col = np.where(df.values == np.max(df.Price))

# Get the value
df.iat[row[0], col[0]]
df.iloc[row[0], col[0]]

# Alternates
# (`DataFrame.get_value` was removed in pandas 1.0; `at` and `loc` cover it.)
# `row[0]` is a position, which doubles as the label here because the frame
# keeps the default 0..n-1 index created by `read_csv`.
df.at[row[0], 'Price']
df.loc[row[0], 'Price']

# The difference between `iat` - `iloc` vs `at` - `loc` is:
# `iat` and `iloc` accept row and column numbers.
# Whereas `at` and `loc` accept index and column names.

#### 39. How to rename specific columns in a dataframe?

In [None]:
# Input
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
# Step 1: rename a single column
df = df.rename(columns={'Type': 'CarType'})
# or assign a rebuilt list of labels. Do not write into `df.columns.values`:
# that modifies the array behind an Index, which pandas treats as immutable.
df.columns = ['CarType' if name == 'Type' else name for name in df.columns]

# Step 2: replace the dots in every column name with underscores
df.columns = df.columns.map(lambda x: x.replace('.', '_'))
print(df.columns)

#### 40. How to check if a dataframe has any missing values?

In [None]:
# Input
# The URL is kept on a single line: a quoted string cannot continue across a
# line break.
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')

# Solution
# `isnull` marks every missing cell; `any` over the flattened boolean array
# is True as soon as a single cell is missing.
df.isnull().values.any()