In [188]:
import pandas as pd
import numpy as np
import datetime as dt

# Series
Series is a one-dimensional labeled array capable of holding any data type (integers, strings, floating point numbers, Python objects, etc.). The axis labels are collectively referred to as the index. 

### create a series

In [24]:
ds = pd.Series(["a", 2, "foo", "bar", 9])
ds

0      a
1      2
2    foo
3    bar
4      9
dtype: object

### convert Panda series to Python list

In [25]:
ds = pd.Series(["a", 2, "foo", "bar", 9])
print (type(ds))
list = ds.to_list()
print (list)

<class 'pandas.core.series.Series'>
['a', 2, 'foo', 'bar', 9]


### operate over multiple series

In [26]:
ds1 = pd.Series([2, 4, 6, 8, 10])
ds2 = pd.Series([1, 3, 5, 7, 9])

multiplied = ds1 * ds2
multiplied

0     2
1    12
2    30
3    56
4    90
dtype: int64

###  compare the elements of two series

In [27]:
ds1 = pd.Series([2, 4, 3, 3, 10])
ds2 = pd.Series([1, 3, 5, 7, 9])
print("Series One Larger:")
print(ds1 < ds2)

Series One Larger:
0    False
1    False
2     True
3     True
4    False
dtype: bool


### convert a dictionary to a series

In [32]:
dict = {'a': 100, 'b': 200, 'c': 300, 'd': 400, 'e': 800}
ds = pd.Series(dict)
print(type(ds))
ds

<class 'pandas.core.series.Series'>


a    100
b    200
c    300
d    400
e    800
dtype: int64

### convert a NumPy array to a Pandas series

In [35]:
npa = np.array([2, 4, 3, 3, 10])
print(type(npa))
ds = pd.Series(npa)
print(type(ds))
ds

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


0     2
1     4
2     3
3     3
4    10
dtype: int32

### change the data type of a Series

In [41]:
ds = pd.Series([100, 200, "foo", "bar", 300.12])
ds_num = pd.to_numeric(ds, errors='coerce')  #coerce errors returns NaN as opposed to errors
ds_num

0    100.00
1    200.00
2       NaN
3       NaN
4    300.12
dtype: float64

### convert the first column of a DataFrame as a Series

In [45]:
df = pd.DataFrame(
    {"A" : [1, 3, 8],
     "B" : [2, 5, 7],
     "C" : [3, 5, 9]})
ds = pd.Series(df["A"])
print(type(ds))
ds

<class 'pandas.core.series.Series'>


0    1
1    3
2    8
Name: A, dtype: int64

### convert a series to a nupy array

In [51]:
ds = pd.Series([100, 200, "foo", "bar", 300.12])
npa = np.array(ds)
print(type(npa))
print(npa)

<class 'numpy.ndarray'>
[100 200 'foo' 'bar' 300.12]


### convert series of lists to one series
pd.Series.apply will invoke function on values of series https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.apply.html

In [62]:
ds = pd.Series([["Red", "Green", "Blue"], ["Red", "Black"], "White"])
ds = ds.apply(pd.Series).stack().reset_index(drop=True)  #reset index value otherwise get 'gaps' or mult index values
print(ds)

0      Red
1    Green
2     Blue
3      Red
4    Black
5    White
dtype: object


### sort a series

In [64]:
ds = pd.Series([100, 200, 1, 5.6, 300.12])
ds_sorted = ds.sort_values()
ds_sorted

2      1.00
3      5.60
0    100.00
1    200.00
4    300.12
dtype: float64

### add data to existing series

In [73]:
ds = pd.Series([100, 200, 58, 56, 88])
ds2 = pd.Series(["foo", "bar"])
ds = ds.append(ds2).reset_index(drop=True)
ds

0    100
1    200
2     58
3     56
4     88
5    foo
6    bar
dtype: object

### create a subset of a given series based on value and condition

In [75]:
ds = pd.Series([100, 200, 58, 56, 88])
ds_less_than_100 = ds[ds < 100]
ds_less_than_100

2    58
3    56
4    88
dtype: int64

### change the order of index of a series

In [76]:
ds = pd.Series(data=["A", "C", "D", "B"], index=[4, 2, 3, 1])
print (ds)
ds_reindex = ds.reindex([1, 2, 3, 4])
ds_reindex

4    A
2    C
3    D
1    B
dtype: object


1    B
2    C
3    D
4    A
dtype: object

### create the mean and standard deviation of a Series

In [84]:
ds = pd.Series([1, 5, 7, 9, 12])
mean = ds.mean()
std_dev = ds.std()
print(f"Mean: {mean}  Std Dev: {std_dev}")

Mean: 6.8  Std Dev: 4.147288270665545


### items of a series present in another series

In [96]:
ds1 = pd.Series([1, 5, 7, 9, 12])
ds2 = pd.Series([1, 8, 7, 15, 12])

#value in other series
val_in_other_series = ds1[ds1.isin(ds2)]  
print(val_in_other_series)

0     1
2     7
4    12
dtype: int64


### items of a series NOT present in another series

In [97]:
ds1 = pd.Series([1, 5, 7, 9, 12])
ds2 = pd.Series([1, 8, 7, 15, 12])

#value not in other series
val_not_in_other_series = ds1[~ds1.isin(ds2)]  
print(val_not_in_other_series)

1    5
3    9
dtype: int64


### compute the min, 25th perc, median, 75th perc, max

In [102]:
numbers = pd.Series(np.random.normal(10, 4, 100))
result = np.percentile(numbers, q=[0, 25, 50, 75, 100])
result

array([ 1.16329638,  9.18209927, 10.91630888, 14.94066182, 24.30158889])

### frequency counts of unique values in series

In [113]:
numbers = pd.Series(np.random.randint(1, 10, 100))
results = numbers.value_counts()
results

8    14
2    14
7    13
6    13
5    11
4    11
1    10
9     8
3     6
dtype: int64

### relabel non-top values as 'other'

In [138]:
np.random.RandomState(100)
num_series = pd.Series(np.random.randint(1, 10, [100]))
result = num_series[~num_series.isin(num_series.value_counts().index[:1])] = 'Other'
num_series

0     Other
1     Other
2     Other
3     Other
4     Other
      ...  
95        5
96    Other
97    Other
98    Other
99    Other
Length: 100, dtype: object

###  find the positions of numbers that are multiples of 5

In [151]:
num_series = pd.Series(np.random.randint(1, 11, 20))
results = np.argwhere(num_series % 5 == 0)
results  #can validate by printing num_series

array([[ 0],
       [ 1],
       [ 3],
       [14],
       [15],
       [19]], dtype=int64)

### extract items at given positions of a series

In [156]:
ds = pd.Series(["apple", "banana", "orange", "pineapple", "pear"])
result_1 = ds.take([2])
result_2 = ds.take([0, 1, 2])
print(result_1)
print(result_2)

2    orange
dtype: object
0     apple
1    banana
2    orange
dtype: object


### get the positions of items of a series in another series

In [159]:
ds1 = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
ds2 = pd.Series([1, 3, 5, 7, 10])

result = [pd.Index(ds1).get_loc(i) for i in ds2]
print("Positions of items of series2 in series1:")
result

Positions of items of series2 in series1:


[0, 2, 4, 6, 9]

### convert the first and last character of each word to upper case in series

In [162]:
ds = pd.Series(["bill s", "ted r", "sarah h", "adam r"])
result = ds.map(lambda x: x[0].upper() + x[1:-1] + x[-1].upper()) 
result

0     Bill S
1      Ted R
2    Sarah H
3     Adam R
dtype: object

### calculate the number of characters in each word in a series

In [207]:
ds = pd.Series(["bill", "ted", "sarah", "adam"])
result = ds.map(lambda x: len(x))
type(result)
result

0    4
1    3
2    5
3    4
dtype: int64

### compute difference between consecutive numbers of a series

In [168]:
ds = pd.Series([1, 5, 7, 9, 12])
result = ds.diff()  #.diff provies first discrete difference of element
result[1:]

1    4.0
2    2.0
3    2.0
4    3.0
dtype: float64

### convert date strings to a timeseries

In [190]:
ds = pd.Series(['01 Jan 2015', '10-02-2016', '20180307', '2014/05/06', '2016-04-12', '2019-04-06T11:20'])
date_ds = pd.to_datetime(ds)
date_ds

0   2015-01-01 00:00:00
1   2016-10-02 00:00:00
2   2018-03-07 00:00:00
3   2014-05-06 00:00:00
4   2016-04-12 00:00:00
5   2019-04-06 11:20:00
dtype: datetime64[ns]

### get the day of month, year, week from date strings

In [203]:
from dateutil.parser import parse

ds = pd.Series(['01 Jan 2015', '10-02-2016', '20180307', '2014/05/06', '2016-04-12', '2019-04-06T11:20'])
ds = ds.map(lambda x: parse(x))
day_of_month = ds.dt.day.to_list()
day_of_year = ds.dt.dayofyear.to_list()
week_number = ds.dt.weekofyear.to_list()
day_of_week = ds.dt.weekday_name.to_list()
print(f"Day of month: {day_of_month}")
print(f"Day of year: {day_of_year}")
print(f"Week number: {week_number}")
print(f"Day of week: {day_of_week}")

Day of month: [1, 2, 7, 6, 12, 6]
Day of year: [1, 276, 66, 126, 103, 96]
Week number: [1, 39, 10, 19, 15, 14]
Day of week: ['Thursday', 'Sunday', 'Wednesday', 'Tuesday', 'Tuesday', 'Saturday']


### convert year-month string to year-month-day

In [211]:
from dateutil.parser import parse

ds = pd.Series(["June 2011", "July 2015", "May 2020", "January 2010"])
ds = ds.map(lambda x: parse("1" + x))
ds

0   2011-06-01
1   2015-07-01
2   2020-05-01
3   2010-01-01
dtype: datetime64[ns]

### filter words from a given series that contain condition

In [219]:
ds = pd.Series(["basketball", "tennis", "football", "baseball", "golf", "lacross", "handball"])

#words that dont contain 'ball'
result = ds[~ds.str.contains('ball')]  # ~ negates the condition
result 

1     tennis
4       golf
5    lacross
dtype: object