# Pandas
Pandas is an open-source data manipulation and analysis library for Python. It provides data structures and functions needed to manipulate structured data seamlessly. Pandas is particularly well-suited for working with tabular data, such as data stored in spreadsheets or databases.

In [45]:
import pandas as pd
import numpy as np

## 1. DataFrame

In [43]:
# Build a DataFrame from a dict that maps column names to equal-length lists
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago'],
}
df = pd.DataFrame(data)

# Show the whole frame; every section is followed by one blank separator line
print(df, end="\n\n")

# Selecting a single column label returns that column as a Series
name_column = df['Name']
print(name_column, end="\n\n")

# Boolean mask: keep only the rows whose Age is greater than 30
older_than_30 = df['Age'] > 30
print(df[older_than_30], end="\n\n")

# Assigning to a new column label adds the column in place
df['Salary'] = [70000, 80000, 90000]
print(df, end="\n\n")

# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns
summary = df.describe()
print(summary)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

      Name  Age     City
2  Charlie   35  Chicago

      Name  Age         City  Salary
0    Alice   25     New York   70000
1      Bob   30  Los Angeles   80000
2  Charlie   35      Chicago   90000

        Age   Salary
count   3.0      3.0
mean   30.0  80000.0
std     5.0  10000.0
min    25.0  70000.0
25%    27.5  75000.0
50%    30.0  80000.0
75%    32.5  85000.0
max    35.0  90000.0


## 2. Series
A Series is a one-dimensional labeled array capable of holding any data type (integers, floats, strings, etc.). It’s like a single column in a spreadsheet.\
**Labels:**
A pandas Series has an index, which acts as a label for each element.
NumPy arrays are just indexed by their position.\
**Data Type Flexibility:**
A pandas Series can hold values of different types within the same array (such a Series is stored with dtype `object`).
NumPy arrays are typically of a single data type.\
**Functionality:**
Series come with a lot of built-in methods for data analysis and handling, like handling missing data.
NumPy arrays are more focused on efficient numerical computations.

In [132]:
# Shared sample inputs for the Series examples
labels = ['A', 'B', 'C', 'D']                 # index labels
list1 = [1, 2, 3, 4]                          # built-in Python list
list2 = np.array([40, 50, 60])                # NumPy array, better suited to numerical operations
dictionary = dict(A=10, B=20, C=30)           # label -> value mapping

In [66]:
# No index is given, so pandas assigns the default integer labels 0, 1, 2, ...
pd.Series(list1)

0    1
1    2
2    3
3    4
dtype: int64

In [68]:
# Passing index= replaces the default integer labels with custom ones
pd.Series(list1, index=labels)

A    1
B    2
C    3
D    4
dtype: int64

In [72]:
# Strings work as data too; here the labels list is used as the values
pd.Series(labels)

0    A
1    B
2    C
3    D
dtype: object

In [86]:
# A Series can hold arbitrary Python objects, here three built-in functions
series = pd.Series([sum, print, len])
print(series)

# Label 0 holds the built-in `sum`; calling it returns 6, which the notebook
# displays as the cell result (nothing is printed by this line itself)
first_function = series[0]
first_function([1, 2, 3])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object


6

In [114]:
# A NumPy array is accepted as data; the resulting dtype follows the array's own dtype
pd.Series(data=list2)

0    40
1    50
2    60
dtype: int32

In [275]:
# Dict keys become the index labels and dict values become the data.
# NOTE: pd.DataFrame(dictionary) does not work directly here because every
# value is a scalar rather than an array; pd.DataFrame(dictionary.items())
# builds a two-column frame of (key, value) rows instead.
pd.Series(data=dictionary)

A    10
B    20
C    30
dtype: int64

In [122]:
# Every dict value is a whole list, so the Series stores one list per label
dataset = dict(
    Cars=["BMW", "Bugatti", "Lamborgini"],
    Color=["Red", "Blue", "Green"],
)
pd.Series(dataset)

Cars     [BMW, Bugatti, Lamborgini]
Color            [Red, Blue, Green]
dtype: object

In [124]:
# With a DataFrame, the dict keys become column names and each list becomes a column
pd.DataFrame(data=dataset)

Unnamed: 0,Cars,Color
0,BMW,Red
1,Bugatti,Blue
2,Lamborgini,Green


In [152]:
fruits = ["Apple", "Orange", "Banana", "Grapes"]
colors = ["Red", "Orange", "Yellow", "Green"]

# Series: the fruits are the values and the colors act as index labels
fruit_by_color = pd.Series(data=fruits, index=colors)
print(fruit_by_color, "\n")

# DataFrame: both lists become ordinary columns under a default integer index
fruit_table = pd.DataFrame({'Fruits': fruits, 'Colors': colors})
print(fruit_table)

Red        Apple
Orange    Orange
Yellow    Banana
Green     Grapes
dtype: object 

   Fruits  Colors
0   Apple     Red
1  Orange  Orange
2  Banana  Yellow
3  Grapes   Green


In [154]:
# The first two positional arguments of pd.Series are the data and the index
pd.Series(data=[1, 2, 3, 4], index=['a', 'b', 'c', 'd'])

a    1
b    2
c    3
d    4
dtype: int64

## 3. Random

In [157]:
import random

In [171]:
# Seeding initializes the generator's state, so re-seeding with the same value
# replays exactly the same sequence of pseudo-random numbers
random.seed(40)                             # 40 is an arbitrary choice
first_draw = random.randint(50, 100)
print("First:", first_draw)

random.seed(40)                             # reset to the same starting state
second_draw = random.randint(50, 100)
print("Second:", second_draw)               # same value as the first draw


First: 79
Second: 79


In [175]:
# Re-seeding with the same value before each call makes random.random()
# return the same float both times
for label in ("First:", "Second:"):
    random.seed(10)
    print(label, random.random())

First: 0.5714025946899135
Second: 0.5714025946899135


In [177]:
# randn draws samples from the standard normal distribution, so values can be
# negative as well as positive.
# Call np.random.seed(<int>) first if the numbers need to be reproducible.
row_labels = 'A B C D E'.split()
column_labels = "P Q R S".split()
pd.DataFrame(np.random.randn(5, 4), index=row_labels, columns=column_labels)

Unnamed: 0,P,Q,R,S
A,-1.273791,0.620852,0.092272,-0.545843
B,-1.461032,-0.736882,1.318198,-1.598129
C,0.661423,-0.603724,-0.662586,-0.163146
D,0.364428,0.019136,-1.158215,0.790234
E,0.32662,-0.458257,1.269882,-1.499336


In [195]:
# rand draws samples uniformly from the half-open interval [0, 1)
df = pd.DataFrame(
    np.random.rand(5, 4),
    index=list("ABCDE"),
    columns=list("PQRS"),
)
print(df)

print(df['P'])           # a single label selects one column as a Series
print(df[['R', 'S']])    # a list of labels selects a sub-DataFrame

          P         Q         R         S
A  0.667536  0.163226  0.119159  0.367234
B  0.750007  0.618174  0.138822  0.919497
C  0.922283  0.303475  0.180022  0.513056
D  0.131925  0.494034  0.585740  0.191216
E  0.467193  0.248384  0.850440  0.046406
A    0.667536
B    0.750007
C    0.922283
D    0.131925
E    0.467193
Name: P, dtype: float64
          R         S
A  0.119159  0.367234
B  0.138822  0.919497
C  0.180022  0.513056
D  0.585740  0.191216
E  0.850440  0.046406


In [207]:
# A single column label yields a Series; a list of labels yields a DataFrame
for selection in (df['P'], df[['P', 'Q']]):
    print(type(selection))

<class 'pandas.core.series.Series'>
<class 'pandas.core.frame.DataFrame'>


In [211]:
# Assigning to a new column label adds that column to df in place;
# one value is supplied per existing row
df["T"] = list(range(1, 6))
df

Unnamed: 0,P,Q,R,S,T
A,0.667536,0.163226,0.119159,0.367234,1
B,0.750007,0.618174,0.138822,0.919497,2
C,0.922283,0.303475,0.180022,0.513056,3
D,0.131925,0.494034,0.58574,0.191216,4
E,0.467193,0.248384,0.85044,0.046406,5


In [219]:
# drop returns a new DataFrame and leaves df itself unchanged
print(df.drop(columns="T"))    # remove column "T"
print(df.drop(index="D"))      # remove row "D"

          P         Q         R         S
A  0.667536  0.163226  0.119159  0.367234
B  0.750007  0.618174  0.138822  0.919497
C  0.922283  0.303475  0.180022  0.513056
D  0.131925  0.494034  0.585740  0.191216
E  0.467193  0.248384  0.850440  0.046406
          P         Q         R         S  T
A  0.667536  0.163226  0.119159  0.367234  1
B  0.750007  0.618174  0.138822  0.919497  2
C  0.922283  0.303475  0.180022  0.513056  3
E  0.467193  0.248384  0.850440  0.046406  5


In [221]:
# Locate a row by its index label; the row is returned as a Series keyed by column name
df.loc['A']

P    0.667536
Q    0.163226
R    0.119159
S    0.367234
T    1.000000
Name: A, dtype: float64

In [225]:
# Locate a row by its integer position (0-based): position 2 is the third row, labelled 'C'
df.iloc[2]

P    0.922283
Q    0.303475
R    0.180022
S    0.513056
T    3.000000
Name: C, dtype: float64

In [227]:
# Locate a single cell by its row label and column label
row_label, column_label = 'B', 'R'
df.loc[row_label, column_label]

0.1388216035841202

In [229]:
# Select a sub-frame: the listed row labels crossed with the listed column labels
wanted_rows = ['A', 'B']
wanted_columns = ['R', 'S']
df.loc[wanted_rows, wanted_columns]

Unnamed: 0,R,S
A,0.119159,0.367234
B,0.138822,0.919497


In [237]:
# zip pairs the two lists element by element, giving (a1[i], b1[i]) tuples
a1 = ['A'] * 3 + ['B'] * 3
b1 = [1, 2, 3] * 2

paired = zip(a1, b1)
p = list(paired)
p

[('A', 1), ('A', 2), ('A', 3), ('B', 1), ('B', 2), ('B', 3)]

In [239]:
# Use the (letter, number) tuples in p as the row labels of a 6x2 frame
samples = np.random.randn(6, 2)
df = pd.DataFrame(samples, index=p, columns=['X', 'Y'])
df

Unnamed: 0,X,Y
"(A, 1)",0.166971,1.710417
"(A, 2)",-0.506196,0.371051
"(A, 3)",1.143017,0.548831
"(B, 1)",0.341021,0.576256
"(B, 2)",-0.944996,0.019611
"(B, 3)",-1.892134,-0.783534


#### ?.  Write a Python program to convert a list to a Series.

In [245]:
import pandas as pd

In [247]:
# Passing a plain list as data produces a Series with the default integer index
l = [2, 3, 4, 5]
pd.Series(data=l)

0    2
1    3
2    4
3    5
dtype: int64

#### ?.  Write a Python program to generate a series of dates from 1 August 2024 to 24 August 2024.

In [251]:
import datetime

In [261]:
# Parse both endpoints from day-month-year strings into datetime objects
date_format = "%d-%m-%Y"
start, end = (
    datetime.datetime.strptime(text, date_format)
    for text in ("01-08-2024", "24-08-2024")
)

# Both endpoints are included in the range; len(dates) gives the number of days
dates = pd.date_range(start=start, end=end)
print(dates)

DatetimeIndex(['2024-08-01', '2024-08-02', '2024-08-03', '2024-08-04',
               '2024-08-05', '2024-08-06', '2024-08-07', '2024-08-08',
               '2024-08-09', '2024-08-10', '2024-08-11', '2024-08-12',
               '2024-08-13', '2024-08-14', '2024-08-15', '2024-08-16',
               '2024-08-17', '2024-08-18', '2024-08-19', '2024-08-20',
               '2024-08-21', '2024-08-22', '2024-08-23', '2024-08-24'],
              dtype='datetime64[ns]', freq='D')


#### ?.  Convert a dictionary into a DataFrame and display it.

In [267]:
# Each (key, value) pair becomes one row; the two columns get default labels 0 and 1
d = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
key_value_pairs = d.items()
pd.DataFrame(key_value_pairs)

Unnamed: 0,0,1
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E


#### ?. Create a 2D list, convert it into a DataFrame, and display it.

In [286]:
# Each inner list becomes one row; columns get default integer labels
l = [[1, 2], [3, 4], [5, 6]]
df = pd.DataFrame(data=l)
df

Unnamed: 0,0,1
0,1,2
1,3,4
2,5,6
