In [3]:
import pandas as pd
import numpy as np

## Starting with series

In [2]:
## Series are one dimensional -- index isnt included as the values

songs2 = pd.Series([145,142,38,13],
                   name='counts',
                   index = ['Paul', 'John', 'George', 'Ringo'])
                        
songs2.index

Index(['Paul', 'John', 'George', 'Ringo'], dtype='object')

### NaN value

In [3]:
nan_series = pd.Series([2,4, np.nan],
                       index = ['Texas', 'Oklahoma', 'California'])

nan_series ## dtype is float and ints are converted to floats because nan is not supported by int64

Texas         2.0
Oklahoma      4.0
California    NaN
dtype: float64

In [4]:
nan_series.count() ## doesnt include the nan value

2

In [5]:
nan_series.size ## includes the nan value

3

### Similar to numpy

In [6]:
numpy_ser = np.array([145,142,38,13])

numpy_ser[1]

142

In [7]:
songs2[1]

142

In [8]:
numpy_ser.mean()

84.5

In [9]:
songs2.mean()

84.5

### Filtering a series

In [10]:
songs2[songs2 > songs2.median()]

Paul    145
John    142
Name: counts, dtype: int64

In [11]:
boolean_array = songs2 > songs2.median()
boolean_array

Paul       True
John       True
George    False
Ringo     False
Name: counts, dtype: bool

In [12]:
songs2[boolean_array] ## returns the entrys that have true and drops the rest

Paul    145
John    142
Name: counts, dtype: int64

In [13]:
numpy_ser[numpy_ser > np.median(numpy_ser)]

array([145, 142])

### Categorical data

In [14]:
# use less memory
# imporve performance
# can have ordering
# can perform operations on categories

s = pd.Series(['m', 'l', 'xs', 's', 'xl'], dtype ='category')
s

0     m
1     l
2    xs
3     s
4    xl
dtype: category
Categories (5, object): ['l', 'm', 's', 'xl', 'xs']

In [15]:
a = pd.Series(['m', 'l', 'xs', 's', 'xl']) # begins as dtype object as default

## adds ordering to categorical data s is smaller than m and l
size_type = pd.api.types.CategoricalDtype(
    categories = ['s', 'm', 'l'], ordered = True)
s3 = a.astype(size_type)

s3 > 's' ## returns true for data thats larger than s -- which is m and l

0     True
1     True
2    False
3    False
4    False
dtype: bool

In [16]:
s.cat.reorder_categories(['xs', 's', 'm', 'l', 'xl'], ordered=True) ## must specify all the values in the series when reordering or
                                                                    ## pandas will throw a ValueError

0     m
1     l
2    xs
3     s
4    xl
dtype: category
Categories (5, object): ['xs' < 's' < 'm' < 'l' < 'xl']

In [17]:
temp_series = pd.Series([100,76,98,67,56,101,77],
                       index = ['day1', 'day2', 'day3', 'day4', 'day5', 'day6', 'day7'])

temp_series[temp_series.mean() < temp_series] ## filtering out days that have a temperature below the mean

day1    100
day3     98
day6    101
dtype: int64

In [18]:
colors = pd.Series(['Blue', 'Yellow', 'Green'], dtype='category')
colors

0      Blue
1    Yellow
2     Green
dtype: category
Categories (3, object): ['Blue', 'Green', 'Yellow']

## Deep dive into series

In [4]:
path = r'C:\data_sets\vehicles.csv'
df = pd.read_csv(path)

  df = pd.read_csv(path)


In [5]:
city_mpg = df.city08
highway_mpg = df.highway08

In [21]:
city_mpg
l1 = [1,2,3]
string1 = 'cole'
len(dir(pd))

141

In [23]:
seconds_series = pd.Series(pd.date_range("2000-01-01", periods=3, freq="s"))

len(dir(seconds_series.dt))

83

## Operators and Dunder methods

In [29]:
(city_mpg+highway_mpg)/2

series1 = pd.Series([1,2,3,4], index = [1,2,2,5]) ## lines up the indexes to add
series2 = pd.Series([6,7,8,9], index = [1,2,4,4]) ## if one doesnt have an index it fills the empty spot with NaN unless fill_value is used

series1.add(series2, fill_value = 0)

1     7.0
2     9.0
2    10.0
4     8.0
4     9.0
5     4.0
dtype: float64

## Aggregate Methods

In [30]:
# aggregrate methods collapse the values of a series down to a scalar
# ex: .mean, .sum, .count

In [32]:
city_mpg.mean()

18.91014982438981

In [35]:
city_mpg.is_unique

False

In [38]:
city_mpg.is_monotonic_increasing

False

In [7]:
city_mpg.quantile(.5) ## return scalar or series 

17.0

In [44]:
city_mpg.quantile([.5, .8, .9])

0.5    17.0
0.8    22.0
0.9    25.0
Name: city08, dtype: float64

In [46]:
(city_mpg # returns the sum of all the values that are greater than ( .gt ) 20.
 .gt(20)
 .sum())

12079

In [55]:
city_mpg.count() # count returns count of non missing values

44986

In [58]:
city_mpg.size # size returns count of all values even missing values

44986

In [66]:
(city_mpg
 .unique()
 .size)

130

In [68]:
city_mpg.mean()

18.91014982438981

In [70]:
city_mpg.max()

150

In [87]:
city_mpg.agg(['mean', 'count', 'size', 'max', 'nunique'])

mean          18.91015
count      44986.00000
size       44986.00000
max          150.00000
nunique      130.00000
Name: city08, dtype: float64

## Conversion methods

In [88]:
# converting to different data types for better performance, more manipulation, or less memory

In [95]:
city_mpg.dtype # can be converted from in64 to int16 but not int8 bc int8 range is -127 to 127 -- some cars have mpg of 150

dtype('int64')

In [96]:
city_mpg.nbytes

359888

In [99]:
city_mpg.astype('Int16').nbytes # 359888 to 134958 bytes

134958

In [102]:
make = df.make # create a series from the make column
make.nbytes # since the make column includes strings -- must use memory_usage(deep=True) to include strings

359888

In [105]:
make.memory_usage(deep=True)

2849754

In [107]:
(make
 .astype('category') # making the make series have categorical data saves a lot more memory
 .memory_usage(deep=True)) 

103884

## Manipulation methods

In [110]:
make.value_counts()

Chevrolet           4287
Ford                3644
GMC                 2686
Dodge               2655
Toyota              2260
                    ... 
London Taxi            1
Excalibur Autos        1
ASC Incorporated       1
Mahindra               1
Qvale                  1
Name: make, Length: 141, dtype: int64

In [113]:
top5 = make.value_counts().index[:5] # only using the top makes in the series
make.where(make.isin(top5), other='Other') # leaves the values that are in the top 5 and sets the rest to the other parameter

0        Other
1        Other
2        Dodge
3        Dodge
4        Other
         ...  
44981    Other
44982    Other
44983    Other
44984    Other
44985    Other
Name: make, Length: 44986, dtype: object