In [1]:
# Setup
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

# Introducing Pandas Objects
1. Series Objects
2. Dataframe Objects
3. Index Objects

In [2]:
# Series-1: build a Series from a plain Python list and inspect it

ser = pd.Series(data=[99, 98, 97, 96])

# A single print call with a newline separator emits the same text as
# printing each item with its own call.
print(
    ser,            # whole Series: index, values and dtype
    ser.values,     # underlying NumPy array
    ser.index,      # the automatically created index
    ser[2],         # single entry stored under index label 2
    ser[0:2],       # slice holding the first two entries
    sep='\n',
)

0    99
1    98
2    97
3    96
dtype: int64
[99 98 97 96]
RangeIndex(start=0, stop=4, step=1)
97
0    99
1    98
dtype: int64


In [3]:
# Series-2: attach explicit string labels instead of the automatic integer index

ser = pd.Series(data=[99, 98, 97, 96], index=['a', 'b', 'c', 'd'])

# Whole Series first, then the single entry stored under label 'b'
print(ser, ser['b'], sep='\n')


a    99
b    98
c    97
d    96
dtype: int64
98


In [4]:
# Series-3: a dictionary maps directly onto a Series (keys -> index, values -> data)

salesDict = dict(Mon=54, Tues=32, Wed=42, Thurs=13, Fri=84)

salesSeries = pd.Series(salesDict)

# Whole Series first, then a dictionary-style lookup by key
print(salesSeries, salesSeries['Tues'], sep='\n')

Mon      54
Tues     32
Wed      42
Thurs    13
Fri      84
dtype: int64
32


In [5]:
# Dataframe-1: combine two Series that share an index into one DataFrame

# Second Series derived from the first: every count times 2.50, minus 20
profitSeries = salesSeries.mul(2.50).sub(20)

weekProfit = pd.DataFrame({
    'Sale Count': salesSeries,
    'Profit': profitSeries,
})

print(
    weekProfit,                 # whole table
    weekProfit.index,           # row labels
    weekProfit.columns,         # column labels
    weekProfit['Sale Count'],   # one column, returned as a Series
    sep='\n',
)

       Sale Count  Profit
Mon            54   115.0
Tues           32    60.0
Wed            42    85.0
Thurs          13    12.5
Fri            84   190.0
Index(['Mon', 'Tues', 'Wed', 'Thurs', 'Fri'], dtype='object')
Index(['Sale Count', 'Profit'], dtype='object')
Mon      54
Tues     32
Wed      42
Thurs    13
Fri      84
Name: Sale Count, dtype: int64


In [6]:
# Index-1

ind = pd.Index([99, 98, 97, 96, 95])        # Create index
print(ind)                                  # Display
print(ind[2])                               # Access like an array

# An Index is immutable: item assignment raises TypeError.
# Only that exception is caught, so an unrelated failure (a typo, an
# interrupt, ...) is not silently reported as "not allowed".
try:
    ind[2] = 999
except TypeError:
    print("This is not allowed")

Index([99, 98, 97, 96, 95], dtype='int64')
97
This is not allowed


# Data Indexing and Selection
1. Series Data Selection
2. Dataframe Data Selection

In [7]:
# Series Data Selection-1: the different ways to pull entries out of a labelled Series

ser = pd.Series(data=[99, 98, 97, 96], index=['a', 'b', 'c', 'd'])

# Dictionary-like access, membership tests and label-based selection
print(
    ser['a'],           # single label
    'c' in ser,         # 'in' tests the index labels
    'e' in ser,         # label that is not present
    ser['b':'d'],       # label slice; the end label is included
    ser[['a', 'c']],    # fancy indexing with a list of labels
    sep='\n',
)

# Explicit indexers
print(
    ser.loc['a':'c'],   # label-based; end label included
    ser.iloc[1:3],      # position-based; end position excluded
    sep='\n',
)

99
True
False
b    98
c    97
d    96
dtype: int64
a    99
c    97
dtype: int64
a    99
b    98
c    97
dtype: int64
b    98
c    97
dtype: int64


In [8]:
# Dataframe Data Selection-1

# Rebuild the two-column frame from the Series defined in earlier cells
weekProfit = pd.DataFrame({'Sale Count': salesSeries, 'Profit': profitSeries})

print(weekProfit['Sale Count'])     # dictionary-style column access

# Assigning to a new key adds a derived column to the frame
weekProfit['Profit per sale'] = weekProfit['Profit'].div(weekProfit['Sale Count'])

print(
    weekProfit,             # frame including the new column
    weekProfit.values,      # raw two-dimensional NumPy array
    weekProfit.T,           # transpose: rows and columns swapped
    sep='\n',
)

Mon      54
Tues     32
Wed      42
Thurs    13
Fri      84
Name: Sale Count, dtype: int64
       Sale Count  Profit  Profit per sale
Mon            54   115.0         2.129630
Tues           32    60.0         1.875000
Wed            42    85.0         2.023810
Thurs          13    12.5         0.961538
Fri            84   190.0         2.261905
[[ 54.         115.           2.12962963]
 [ 32.          60.           1.875     ]
 [ 42.          85.           2.02380952]
 [ 13.          12.5          0.96153846]
 [ 84.         190.           2.26190476]]
                       Mon    Tues       Wed      Thurs         Fri
Sale Count        54.00000  32.000  42.00000  13.000000   84.000000
Profit           115.00000  60.000  85.00000  12.500000  190.000000
Profit per sale    2.12963   1.875   2.02381   0.961538    2.261905


# Operating on Data in Pandas
1. Ufuncs on Series
2. Ufuncs on Dataframe

In [9]:
# Ufuncs on Series-1

# Applying a NumPy ufunc to a Series returns a Series with the index preserved
print(weekProfit, np.exp(0.02 * weekProfit['Sale Count']), sep='\n')


       Sale Count  Profit  Profit per sale
Mon            54   115.0         2.129630
Tues           32    60.0         1.875000
Wed            42    85.0         2.023810
Thurs          13    12.5         0.961538
Fri            84   190.0         2.261905
Mon      2.944680
Tues     1.896481
Wed      2.316367
Thurs    1.296930
Fri      5.365556
Name: Sale Count, dtype: float64


In [10]:
# Ufuncs on Series-2: binary operations align both operands on their index labels

ser1 = pd.Series(data=[99, 98, 97, 96], index=['a', 'b', 'c', 'd'])
ser2 = pd.Series(data=[1, 2, 3, 4], index=['e', 'b', 'c', 'd'])

print(
    ser1.div(ser2),                 # labels missing from either side give NaN
    ser1.sub(ser2, fill_value=0),   # a missing operand is treated as 0 instead
    sep='\n',
)

a          NaN
b    49.000000
c    32.333333
d    24.000000
e          NaN
dtype: float64
a    99.0
b    96.0
c    94.0
d    92.0
e    -1.0
dtype: float64


In [35]:
# Ufuncs on Dataframe: arithmetic between DataFrames aligns on rows and on columns

salesSeries2 = salesSeries * 1.5
profitSeries2 = salesSeries2 * 2.50 - 20

# Second frame, built without the 'Mon' row
weekProfit2 = pd.DataFrame({
    'Sale Count': salesSeries2,
    'Profit': profitSeries2,
}).drop(['Mon'])

# Rows or columns that exist in only one of the operands come out as NaN
print(weekProfit2 - weekProfit)

       Profit  Profit per sale  Sale Count
Fri    105.00              NaN        42.0
Mon       NaN              NaN         NaN
Thurs   16.25              NaN         6.5
Tues    40.00              NaN        16.0
Wed     52.50              NaN        21.0


# Handling Missing Data
1. NaN and None

In [12]:
# NaN and None

weekProfit.loc['Mon', 'Profit'] = np.nan    # overwrite a single cell with NaN
print(weekProfit)

weekProfit.loc['Mon', 'Profit'] += 10       # arithmetic on NaN still gives NaN
print(
    weekProfit,
    weekProfit.isna(),          # boolean mask marking the missing cells
    weekProfit.dropna(),        # copy without the rows that contain NaN
    weekProfit.fillna(0),       # copy with NaN replaced by 0
    sep='\n',
)

       Sale Count  Profit  Profit per sale
Mon            54     NaN         2.129630
Tues           32    60.0         1.875000
Wed            42    85.0         2.023810
Thurs          13    12.5         0.961538
Fri            84   190.0         2.261905
       Sale Count  Profit  Profit per sale
Mon            54     NaN         2.129630
Tues           32    60.0         1.875000
Wed            42    85.0         2.023810
Thurs          13    12.5         0.961538
Fri            84   190.0         2.261905
       Sale Count  Profit  Profit per sale
Mon         False    True            False
Tues        False   False            False
Wed         False   False            False
Thurs       False   False            False
Fri         False   False            False
       Sale Count  Profit  Profit per sale
Tues           32    60.0         1.875000
Wed            42    85.0         2.023810
Thurs          13    12.5         0.961538
Fri            84   190.0         2.261905
       Sale

# Hierarchical Indexing
1. Series Multi-Index
2. Dataframe Multi-Index

In [13]:
# Series Multi-Index

# Index tuples: each of the two weeks paired with the five weekdays, week-major
weekTuples = [(week, day)
              for week in ('Week 1', 'Week 2')
              for day in ('Mon', 'Tues', 'Wed', 'Thurs', 'Fri')]
index = pd.MultiIndex.from_tuples(weekTuples)
saleList = [54, 32, 42, 13, 84, 22, 78, 64, 89, 26]

# Series carrying the two-level index
sales = pd.Series(data=saleList, index=index)

# Partial indexing: every outer label, inner label 'Thurs'
print(sales[:, 'Thurs'])

Week 1    13
Week 2    89
dtype: int64


In [14]:
# Dataframe Multi-Index

# Turn the stacked Series into a DataFrame: the inner index level becomes the columns
salesDF = sales.unstack()
print(salesDF)

# Create data for DF: 20 values arranged as 10 rows x 2 columns
saleList2 = np.array([12, 78, 46, 15, 78, 94, 78, 49, 78, 84, 56, 78, 94, 45, 78, 61, 64, 94, 78, 31])
saleList2 = saleList2.reshape(10, 2)

# The two-level row index is passed directly as a pair of parallel label lists,
# so no separate MultiIndex object is built here. Note these labels are
# 'Week1'/'Week2' (no space), unlike the 'Week 1'/'Week 2' labels of `sales`.
saleDF = pd.DataFrame(saleList2,
                      index=[['Week1', 'Week1', 'Week1', 'Week1', 'Week1', 'Week2', 'Week2', 'Week2', 'Week2', 'Week2'],
                             ['Mon', 'Tues', 'Wed', 'Thurs', 'Fri', 'Mon', 'Tues', 'Wed', 'Thurs', 'Fri']],
                      columns=['Store 1', 'Store 2'])
print(saleDF)

# Access specific data: (outer, inner) row key plus a column label
print(saleDF.loc[('Week2', 'Tues'), 'Store 2'])

        Fri  Mon  Thurs  Tues  Wed
Week 1   84   54     13    32   42
Week 2   26   22     89    78   64
             Store 1  Store 2
Week1 Mon         12       78
      Tues        46       15
      Wed         78       94
      Thurs       78       49
      Fri         78       84
Week2 Mon         56       78
      Tues        94       45
      Wed         78       61
      Thurs       64       94
      Fri         78       31
45


# Concat and Append
1. Concat 
2. Append

In [15]:
# Concat

# Series Concat: two Series whose integer labels do not overlap
series1 = pd.Series(data=[99, 98, 97, 1], index=[1, 2, 3, 4])
series2 = pd.Series(data=[96, 95, 94, 2], index=[5, 6, 7, 8])
print(pd.concat([series1, series2]))

# Dataframe Concat: the same data wrapped as single-column DataFrames
df1, df2 = pd.DataFrame(series1), pd.DataFrame(series2)

# The '\n\n' separator reproduces the blank line between the printed frames
print(df1, df2, pd.concat([df1, df2]), sep='\n\n')

1    99
2    98
3    97
4     1
5    96
6    95
7    94
8     2
dtype: int64
    0
1  99
2  98
3  97
4   1

    0
5  96
6  95
7  94
8   2

    0
1  99
2  98
3  97
4   1
5  96
6  95
7  94
8   2


In [16]:
# Append

# Single-column DataFrames built from the two Series of the Concat example
df1, df2 = pd.DataFrame(series1), pd.DataFrame(series2)

# df1.append(df2) is intentionally not executed: DataFrame.append was deprecated
# and then removed from pandas; pd.concat([df1, df2]) is the replacement. See
# https://pandas.pydata.org/pandas-docs/version/1.5/reference/api/pandas.DataFrame.append.html
# The last item printed is df1 again, unchanged.
print(df1, df2, df1, sep='\n\n')

    0
1  99
2  98
3  97
4   1

    0
5  96
6  95
7  94
8   2

    0
1  99
2  98
3  97
4   1


# Merge
1. Merge

In [17]:
# Merge-1

# Left table: profit per store; Louisville appears twice (two stores)
df1 = pd.DataFrame({
    'Store Location': ['Savannah', 'Louisville', 'Louisville', 'Atlanta'],
    'Profit': [123456, 51753, 654852, 317962],
})
# Right table: one opening year per location
df2 = pd.DataFrame({
    'Store Location': ['Atlanta', 'Savannah', 'Louisville'],
    'Open Year': [2004, 2008, 2012],
})
# No key is given, so the join uses the column name both frames share
df3 = df1.merge(df2)

print(df1, '\n')
print(df2, '\n')
print(df3)

  Store Location  Profit
0       Savannah  123456
1     Louisville   51753
2     Louisville  654852
3        Atlanta  317962 

  Store Location  Open Year
0        Atlanta       2004
1       Savannah       2008
2     Louisville       2012 

  Store Location  Profit  Open Year
0       Savannah  123456       2008
1     Louisville   51753       2012
2     Louisville  654852       2012
3        Atlanta  317962       2004


In [18]:
# Merge-2

# One supervisor per store location
df4 = pd.DataFrame({
    'Store Location': ['Savannah', 'Louisville',  'Atlanta'],
    'Supervisor': ['Kim', 'Lowry', 'Carrier'],
})

print(df3, '\n')
print(df4, '\n')
# Joined on the shared 'Store Location' column: each supervisor is repeated
# for every df3 row with the same location
print(df3.merge(df4), '\n')

  Store Location  Profit  Open Year
0       Savannah  123456       2008
1     Louisville   51753       2012
2     Louisville  654852       2012
3        Atlanta  317962       2004 

  Store Location Supervisor
0       Savannah        Kim
1     Louisville      Lowry
2        Atlanta    Carrier 

  Store Location  Profit  Open Year Supervisor
0       Savannah  123456       2008        Kim
1     Louisville   51753       2012      Lowry
2     Louisville  654852       2012      Lowry
3        Atlanta  317962       2004    Carrier 



In [19]:
# Merge-3

# Same opening-year data, but the key column is called 'Store' here
df5 = pd.DataFrame({
    'Store': ['Atlanta', 'Savannah', 'Louisville'],
    'Open Year': [2004, 2008, 2012],
})

# The key columns are named differently in the two frames, so each side's key
# is given explicitly; both key columns appear in the result
df6 = df1.merge(df5, left_on='Store Location', right_on='Store')
print(df6)

  Store Location  Profit       Store  Open Year
0       Savannah  123456    Savannah       2008
1     Louisville   51753  Louisville       2012
2     Louisville  654852  Louisville       2012
3        Atlanta  317962     Atlanta       2004


# Aggregation and Grouping
1. Aggregations
2. GroupBy

In [20]:
# Aggregations-1

# Series: six pseudo-random draws from a generator seeded with 24, each multiplied by 10
randSeed = np.random.RandomState(24)
series1 = pd.Series(randSeed.rand(6)).mul(10)

print(
    series1,
    series1.sum(),      # aggregation: total of all entries
    series1.mean(),     # aggregation: arithmetic mean
    sep='\n',
)

0    9.600173
1    6.995120
2    9.998673
3    2.200673
4    3.610564
5    7.398410
dtype: float64
39.80361289865721
6.633935483109535


In [21]:
# Aggregations-2

# Dataframes: two columns of six pseudo-random values, each multiplied by 10
df = pd.DataFrame({
    'C1': randSeed.rand(6) * 10,
    'C2': randSeed.rand(6) * 10,
})

print(df, '\n')
print(df.mean(), '\n')              # default axis: one mean per column
print(df.mean(axis='columns'))      # collapses the columns: one mean per row

         C1        C2
0  9.964557  7.096516
1  3.163470  9.001424
2  1.365446  5.341154
3  3.839800  2.472938
4  3.205193  6.718066
5  3.664148  5.617291 

C1    4.200436
C2    6.041231
dtype: float64 

0    8.530536
1    6.082447
2    3.353300
3    3.156369
4    4.961629
5    4.640719
dtype: float64


In [22]:
# GroupBy-1

# Six rows; each key 'D', 'E', 'F' occurs twice
df = pd.DataFrame(
    {'keys': ['D', 'E', 'F', 'D', 'E', 'F'], 'values': range(99, 93, -1)},
    columns=['keys', 'values'],
)

print(df, '\n')
# Split the rows by key, then average 'values' within every group
print(df.groupby(by='keys').mean())

  keys  values
0    D      99
1    E      98
2    F      97
3    D      96
4    E      95
5    F      94 

      values
keys        
D       97.5
E       96.5
F       95.5


In [23]:
# GroupBy-2

# Same keys as before, now with two value columns; 'values2' holds six
# pseudo-random integers drawn from 0..9
df = pd.DataFrame(
    {
        'keys': ['D', 'E', 'F', 'D', 'E', 'F'],
        'values1': range(99, 93, -1),
        'values2': randSeed.randint(0, 10, 6),
    },
    columns=['keys', 'values1', 'values2'],
)

# Several aggregations at once: the result has one column per (value column, statistic)
print(df.groupby(by='keys').agg(['min', 'median', 'max']))

     values1            values2           
         min median max     min median max
keys                                      
D         96   97.5  99       1    4.0   7
E         95   96.5  98       6    6.0   6
F         94   95.5  97       1    1.0   1


# Pivot Tables
1. Pivot Table

In [24]:
# Pivot Tables-1

# Acquire the 'penguins' example dataset through seaborn and keep it in
# `penguins`; the following pivot-table cells read this variable.
# NOTE(review): load_dataset presumably downloads the data on first use --
# confirm the notebook is run with network access or a populated seaborn cache.
penguins = sns.load_dataset('penguins')
print(penguins)     # prints the frame; pandas abbreviates the middle rows in the text output


    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0    Adelie  Torgersen            39.1           18.7              181.0   
1    Adelie  Torgersen            39.5           17.4              186.0   
2    Adelie  Torgersen            40.3           18.0              195.0   
3    Adelie  Torgersen             NaN            NaN                NaN   
4    Adelie  Torgersen            36.7           19.3              193.0   
..      ...        ...             ...            ...                ...   
339  Gentoo     Biscoe             NaN            NaN                NaN   
340  Gentoo     Biscoe            46.8           14.3              215.0   
341  Gentoo     Biscoe            50.4           15.7              222.0   
342  Gentoo     Biscoe            45.2           14.8              212.0   
343  Gentoo     Biscoe            49.9           16.1              213.0   

     body_mass_g     sex  
0         3750.0    Male  
1         3800.0  Female  
2     

In [25]:
# Pivot Tables-2

# Simple pivot table: average body mass in grams, one row per sex and one
# column per penguin species.
print(penguins.pivot_table(values='body_mass_g', index='sex', columns='species'))

species       Adelie    Chinstrap       Gentoo
sex                                           
Female   3368.835616  3527.205882  4679.741379
Male     4043.493151  3938.970588  5484.836066


In [26]:
# Pivot Tables-3

# Multi-level pivot table: average body mass in grams, rows keyed by
# (sex, island) and one column per penguin species.
print(penguins.pivot_table(values='body_mass_g', index=['sex', 'island'], columns='species'))

species                Adelie    Chinstrap       Gentoo
sex    island                                          
Female Biscoe     3369.318182          NaN  4679.741379
       Dream      3344.444444  3527.205882          NaN
       Torgersen  3395.833333          NaN          NaN
Male   Biscoe     4050.000000          NaN  5484.836066
       Dream      4045.535714  3938.970588          NaN
       Torgersen  4034.782609          NaN          NaN


# Vectorized String Operations
1. Vectorized String Operations

In [27]:
# Vectorized String Operations-1

# Sample strings in four different capitalisations, used by the .str examples
strData = ['eXaMpLe1', 'EXAMPLE2', 'example3', 'Example4']
strSer = pd.Series(data=strData)
print(strSer)

0    eXaMpLe1
1    EXAMPLE2
2    example3
3    Example4
dtype: object


In [28]:
# Vectorized String Operations-2

# Examples of pandas string methods; each one is applied to every element
print(
    strSer.str.lower(),         # all lower case
    strSer.str.upper(),         # all upper case
    strSer.str.capitalize(),    # first character upper case, the rest lower case
    strSer.str.isupper(),       # boolean Series, one test result per element
    sep='\n',
)

0    example1
1    example2
2    example3
3    example4
dtype: object
0    EXAMPLE1
1    EXAMPLE2
2    EXAMPLE3
3    EXAMPLE4
dtype: object
0    Example1
1    Example2
2    Example3
3    Example4
dtype: object
0    False
1     True
2    False
3    False
dtype: bool


# Working with Time Series
1. Date and Time
2. Indexing by Pandas Time
3. Frequencies and Offsets

In [29]:
# Date and Time-1: the same calendar day in three representations

# Python standard library: datetime object
date1 = datetime(2024, 6, 5)
print(date1)

# NumPy: zero-dimensional array with a datetime64 dtype
date2 = np.array('2024-06-05', dtype=np.datetime64)
print(date2)

# pandas: the date is parsed from free-form text
date3 = pd.to_datetime('5th of June, 2024')
print(date3)

2024-06-05 00:00:00
2024-06-05
2024-06-05 00:00:00


In [30]:
# Indexing by Pandas Time-1

# Series whose index is made of four dates
dates = pd.DatetimeIndex(['2024-07-23', '2024-08-23', '2024-09-23', '2024-10-23'])
data = pd.Series(data=[0, 1, 2, 3], index=dates)

# Whole Series first, then the entry selected with a date string
print(data, data['2024-08-23'], sep='\n')

2024-07-23    0
2024-08-23    1
2024-09-23    2
2024-10-23    3
dtype: int64
1


In [31]:
# Frequencies and Offsets-1

# Create time range: 13 timedeltas starting at 0 and spaced two hours apart
timeRange = pd.timedelta_range(start=0, periods=13, freq='2h')
print(timeRange)

TimedeltaIndex(['0 days 00:00:00', '0 days 02:00:00', '0 days 04:00:00',
                '0 days 06:00:00', '0 days 08:00:00', '0 days 10:00:00',
                '0 days 12:00:00', '0 days 14:00:00', '0 days 16:00:00',
                '0 days 18:00:00', '0 days 20:00:00', '0 days 22:00:00',
                '1 days 00:00:00'],
               dtype='timedelta64[ns]', freq='2h')


# Eval and Query
1. Dataframe eval
2. Dataframe query

In [36]:
# Dataframe eval

# Create Dataframe: 200 rows x 3 columns of pseudo-random values, each multiplied by 10
df = pd.DataFrame(data=randSeed.rand(200, 3) * 10, columns=['A', 'B', 'C'])
print(df.head())

# eval for column operation: the arithmetic is written as a string that
# refers to the columns by name
result1 = df.eval(expr='(A + B) / (C - 1)')
print(result1.head())

          A         B         C
0  6.047945  4.606039  5.661392
1  5.688322  5.943732  5.104737
2  0.536752  7.556226  7.302303
3  4.533575  6.819248  3.232493
4  5.003490  1.989412  2.275592
0    2.285580
1    2.833813
2    1.284130
3    5.085266
4    5.482084
dtype: float64


In [33]:
# Dataframe query

# Keep only the rows that satisfy the condition written as a string,
# then display the first five of them
result2 = df.query(expr='A < 5 and C > 5')
print(result2.head(5))

           A         B         C
1   3.448427  9.462354  9.202870
3   1.882603  3.616646  9.375828
5   3.359947  2.372184  5.246915
11  4.910423  4.364557  7.728514
13  4.314714  4.631845  7.097495
