# Pandas-DataFrame And Series

Pandas is a powerful data manipulation library for Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame. A Series is a one-dimensional, array-like object, while a DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns).

In [169]:
import pandas as pd

In [170]:
# Series
# A pandas Series is a one-dimensional, array-like object that can hold any data type.
# It is comparable to a single column of a spreadsheet or a table.

data = [1, 2, 3, 4, 5]

# With no explicit index, the values are labelled 0, 1, 2, ...
series = pd.Series(data=data)
print(f"Series \n {series!s}")
print(type(series))

Series 
 0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [171]:
# Create a Series from a dictionary: the keys become the index labels
data = dict(a=1, b=2, c=3)
series_dict = pd.Series(data=data)
print(f"Series from dictionary \n {series_dict!s}")
# Output: a    1
#         b    2
#         c    3
print(type(series_dict))

Series from dictionary 
 a    1
b    2
c    3
dtype: int64
<class 'pandas.core.series.Series'>


In [172]:
# Create a Series with custom index labels instead of the default 0, 1, 2, ...
data = [10, 20, 30]
index = list("abc")

pd.Series(data=data, index=index)

a    10
b    20
c    30
dtype: int64

In [173]:
# DataFrame
# A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data
# structure with labeled axes (rows and columns). It is similar to a spreadsheet, an SQL table,
# or a dictionary of Series objects.


# Create a DataFrame from a dictionary: each key becomes a column name,
# and each list supplies that column's values.

data = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["New York", "Los Angeles", "Chicago"],
)

df = pd.DataFrame(data=data)
print(df)
print(type(df))

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
<class 'pandas.core.frame.DataFrame'>


In [174]:
import numpy as np
# Convert the DataFrame to a NumPy array. The result shown below has dtype=object
# because the columns mix strings and integers.
np.array(df)

array([['Alice', 25, 'New York'],
       ['Bob', 30, 'Los Angeles'],
       ['Charlie', 35, 'Chicago']], dtype=object)

In [175]:
# Create a DataFrame from a list of dictionaries: each dictionary is one row,
# and its keys are the column names.
data = [
    dict(Name="Alice", Age=25, City="New York"),
    dict(Name="Bob", Age=30, City="Los Angeles"),
    dict(Name="Charlie", Age=35, City="Chicago"),
    dict(Name="David", Age=40, City="Houston"),
]
df = pd.DataFrame(data=data)
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
3    David   40      Houston


In [176]:
# Printing the array shows its plain-text form (no "array(...)" wrapper, no commas)
print(np.array(df))

[['Alice' 25 'New York']
 ['Bob' 30 'Los Angeles']
 ['Charlie' 35 'Chicago']
 ['David' 40 'Houston']]


In [177]:
# Load the sales data from a CSV file (path is relative to the working directory)
df = pd.read_csv('sales_data.csv')

df.head()  # Display the first 5 rows (n defaults to 5)

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
0,10001,2024-01-01,Electronics,iPhone 14 Pro,2,999.99,1999.98,North America,Credit Card
1,10002,2024-01-02,Home Appliances,Dyson V11 Vacuum,1,499.99,499.99,Europe,PayPal
2,10003,2024-01-03,Clothing,Levi's 501 Jeans,3,69.99,209.97,Asia,Debit Card
3,10004,2024-01-04,Books,The Da Vinci Code,4,15.99,63.96,North America,Credit Card
4,10005,2024-01-05,Beauty Products,Neutrogena Skincare Set,1,89.99,89.99,Europe,PayPal


In [178]:
df.tail()  # Display the last 5 rows (n defaults to 5)

Unnamed: 0,Transaction ID,Date,Product Category,Product Name,Units Sold,Unit Price,Total Revenue,Region,Payment Method
235,10236,2024-08-23,Home Appliances,Nespresso Vertuo Next Coffee and Espresso Maker,1,159.99,159.99,Europe,PayPal
236,10237,2024-08-24,Clothing,Nike Air Force 1 Sneakers,3,90.0,270.0,Asia,Debit Card
237,10238,2024-08-25,Books,The Handmaid's Tale by Margaret Atwood,3,10.99,32.97,North America,Credit Card
238,10239,2024-08-26,Beauty Products,Sunday Riley Luna Sleeping Night Oil,1,55.0,55.0,Europe,PayPal
239,10240,2024-08-27,Sports,Yeti Rambler 20 oz Tumbler,2,29.99,59.98,Asia,Credit Card


In [179]:

# Re-create the small example DataFrame (df was last bound to the sales data).
data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Los Angeles", "Chicago"],
}

df = pd.DataFrame(data)

# Accessing a single column returns a Series. The result is bound to a name because
# a bare expression in the middle of a cell is evaluated and then discarded; only
# the last expression of a cell is displayed.
name_column = df['Name']
# print(type(name_column))  # Output: <class 'pandas.core.series.Series'>
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [180]:
df.loc[0]  # Accessing a single row by its index label
# df.loc[0:1]  # Accessing multiple rows by label (a label slice includes the end label)

# Note: when no custom index is provided, the row labels are 0, 1, 2, ...,
# so df.loc[0] returns the first row.
# If the DataFrame has custom index labels, use those labels to access rows.
# Example:
# data = {
#     "Name": ["Alice", "Bob", "Charlie"],
#     "Age": [25, 30, 35],
#     "City": ["New York", "Los Angeles", "Chicago"],
# }
# ids = ['id_1', 'id_2', 'id_3']
# df = pd.DataFrame(data, index=ids)

# print(df.loc['id_2', 'Name'])  # Output: Bob


Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [181]:
df.loc[2] # Accessing the row whose index label is 2

Name    Charlie
Age          35
City    Chicago
Name: 2, dtype: object

In [182]:
df.loc[:,'Name'] # Accessing a single column for all rows (":" selects every row)

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

In [183]:
df.loc[[0,2], ['Name', 'City']] # Accessing multiple rows and multiple columns by passing lists of labels

Unnamed: 0,Name,City
0,Alice,New York
2,Charlie,Chicago


In [184]:
df.loc[:, ['Name', 'Age']]  # Accessing the 'Name' and 'Age' columns for all rows

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30
2,Charlie,35


In [185]:
df.iloc[0]  # Accessing a single row by integer position (0 is the first row)

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [186]:
# Accessing a single value by row and column position.
# Prefer df.iloc[0, 0] over the chained form df.iloc[0][0]: chaining first builds the row
# as a Series and then performs a second lookup on it.
# NOTE(review): integer-position lookup on a label-indexed Series is reported as deprecated
# in recent pandas releases — confirm against the installed version.
df.iloc[0,0]

'Alice'

In [187]:
df.iloc[0, :] # Accessing all columns of the first row (same result as df.iloc[0])

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [188]:
df.iloc[0:2, :2] # Accessing a subset of the DataFrame: first two rows and first two columns (the end position of a slice is excluded)

Unnamed: 0,Name,Age
0,Alice,25
1,Bob,30


In [189]:
df.iloc[:, :] # Accessing all rows and all columns; this is the same as df.iloc[::, ::]

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [190]:
df.iloc[[0,1],[0,1,2]] # Accessing multiple rows and multiple columns by lists of integer positions

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles


In [191]:
df  # Display the current DataFrame

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [192]:
# Accessing a single element by row label and column label using .at
df.at[0, 'Age']

np.int64(25)

In [193]:
df.at[2, 'Name']  # Value in the row labelled 2, column 'Name'

'Charlie'

In [194]:
# Accessing a single element by integer position using .iat
df.iat[2,2] # Element at the 3rd row and 3rd column (positions are 0-based, so both are 2)

'Chicago'

In [195]:
# Summary of .loc, .iloc, .at, and .iat
# .loc: Label-based indexing for rows and columns. Use row and column labels.
# .iloc: Position-based indexing for rows and columns. Use integer positions.
# .at: Fast access to a single value for a row/column label pair.
# .iat: Fast access to a single value for a row/column pair by integer position.


In [196]:
# Data manipulation with DataFrames
# Display the DataFrame as it is before any changes are made.
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [197]:
# Add a new column by assigning a list with one value per row
df['Salary'] = [50000, 60000, 70000]

In [198]:
df  # Display the DataFrame, now including the 'Salary' column

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,New York,50000
1,Bob,30,Los Angeles,60000
2,Charlie,35,Chicago,70000


In [201]:
# Remove a column
df = df.drop(columns="Salary", errors="ignore")
# columns="Salary" is equivalent to drop("Salary", axis=1). The default axis is 0, which means
# drop a row, so a column has to be requested explicitly.
# drop() returns a new DataFrame without the column and leaves the original unchanged, so the
# result is assigned back to df to keep the change. (Passing inplace=True would modify df
# directly instead, but reassignment is the generally recommended style.)
# errors="ignore" makes the cell safe to re-run: once 'Salary' is gone, a second run is a no-op
# instead of raising a KeyError.

In [202]:
df  # Display the DataFrame after removing the 'Salary' column

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [203]:
# Increase every value in the 'Age' column by 1.
# Note: the column is overwritten, so re-running this cell increments the ages again.
df['Age'] = df['Age'].add(1)
df

Unnamed: 0,Name,Age,City
0,Alice,26,New York
1,Bob,31,Los Angeles
2,Charlie,36,Chicago


In [211]:
df.drop(0) # Drop the first row (index label 0). This returns a new DataFrame; df itself stays unchanged unless you reassign the result or pass inplace=True.

Unnamed: 0,Name,Age,City
1,Bob,31,Los Angeles
2,Charlie,36,Chicago


In [213]:
# Reload the sales data (df was overwritten by the small example DataFrame in earlier cells)
df = pd.read_csv('sales_data.csv')

# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
print(df.describe())

       Transaction ID  Units Sold   Unit Price  Total Revenue
count       240.00000  240.000000   240.000000     240.000000
mean      10120.50000    2.158333   236.395583     335.699375
std          69.42622    1.322454   429.446695     485.804469
min       10001.00000    1.000000     6.500000       6.500000
25%       10060.75000    1.000000    29.500000      62.965000
50%       10120.50000    2.000000    89.990000     179.970000
75%       10180.25000    3.000000   249.990000     399.225000
max       10240.00000   10.000000  3899.990000    3899.990000
