## Pandas

### Pandas is a powerful data manipulation library in Python, widely used for data analysis and data cleaning. It provides two primary data structures: Series and DataFrame.

### A Series is a one-dimensional, array-like object in which every value carries an index label.

### A DataFrame is a two-dimensional, size-mutable, and potentially heterogeneous tabular data structure with labeled axes (rows and columns)

In [1]:
import numpy as np
import pandas as pd

In [13]:
# Series: build a one-dimensional labelled array from a plain Python list.
# With no explicit index, pandas assigns the default integer labels 0..n-1.
data = [1, 2, 3, 4, 5]
series = pd.Series(data)

# sep="\n" puts the Series on its own line. The earlier form,
# print("Series: \n", series), let print insert its default space separator
# right after the newline, which pushed the first row one column to the right.
print("Series:", series, sep="\n")

print(type(series))

Series: 
 0    1
1    2
2    3
3    4
4    5
dtype: int64
<class 'pandas.core.series.Series'>


In [15]:
# Create a Series from a dictionary: the keys become the index labels and
# the values become the data.

data = {'a': 1, 'b': 2, 'c': 3}

series_dict = pd.Series(data)

# sep="\n" keeps the first row aligned with the rest; passing "Series: \n" as
# one argument made print add a stray space in front of the first row.
print("Series:", series_dict, sep="\n")

Series: 
 a    1
b    2
c    3
dtype: int64


In [28]:
# Series with explicit labels: each value is paired with a custom index label
# instead of the default 0..n-1 integer labels.
index = ['a', 'b', 'c']
data = [10, 20, 30]

series = pd.Series(data=data, index=index)
print(series)

a    10
b    20
c    30
dtype: int64


In [29]:
# Converting a Series into a NumPy array
#
# Series.to_numpy() is the conversion method pandas provides for this; it
# yields the same values as np.array(series) and needs no numpy import in
# this cell (numpy is imported once, in the import cell at the top).
# Note: to_numpy() may return a view of the Series' data rather than a copy;
# pass copy=True if the array is going to be modified.
arr1 = series.to_numpy()
print(arr1)

[10 20 30]


In [4]:
# DataFrame
#
# Build a DataFrame from a dictionary of lists: every key becomes a column
# name and every list supplies that column's values, one entry per row.
data = dict(
    Name=["Aditya", "Yogesh", "Ariyan"],
    Age=[20, 30, 40],
    Country=["Sweden", "Australia", "Canada"],
)

df = pd.DataFrame(data=data)

print(df)

print(type(df))

     Name  Age    Country
0  Aditya   20     Sweden
1  Yogesh   30  Australia
2  Ariyan   40     Canada
<class 'pandas.core.frame.DataFrame'>


In [9]:
# Converting a DataFrame into a NumPy array
#
# DataFrame.to_numpy() is the conversion method pandas provides for this; it
# yields the same values as np.array(df) and needs no numpy import in this
# cell (numpy is imported once, in the import cell at the top).
# The columns mix strings and integers, so the array has dtype object.
arr = df.to_numpy()

print(arr)

print(arr.ndim)   # number of dimensions
print(arr.size)   # total number of elements (rows * columns)
print(arr.shape)  # (rows, columns)

[['Aditya' 20 'Sweden']
 ['Yogesh' 30 'Australia']
 ['Ariyan' 40 'Canada']]
2
9
(3, 3)


In [5]:
# Create a DataFrame from a list of dictionaries: each dictionary is one row,
# and its keys name the columns.

data = [
    dict(Name='Aditya', Age=20, Country='Sweden'),
    dict(Name='Yogesh', Age=30, Country='Australia'),
    dict(Name='Aditya', Age=20, Country='Canada'),
]

df = pd.DataFrame(data=data)

print(df)

print(type(df))

     Name  Age    Country
0  Aditya   20     Sweden
1  Yogesh   30  Australia
2  Aditya   20     Canada
<class 'pandas.core.frame.DataFrame'>


In [3]:
# Reading data from CSV
#
# Illustrative only: every call below is commented out, so running this cell
# does nothing. Uncomment the lines to load "data.csv" into df and preview
# its first and last five rows.

# df = pd.read_csv("data.csv")

# print(df)
# df.head(5)
# df.tail(5)

In [66]:
# Accessing data from a DataFrame
#
# Rebuild the three-row example frame so the lookups that follow start from
# a known state.
data = dict(
    Name=["Aditya", "Yogesh", "Ariyan"],
    Age=[20, 30, 40],
    Country=["Sweden", "Australia", "Canada"],
)
df = pd.DataFrame(data=data)

df

Unnamed: 0,Name,Age,Country
0,Aditya,20,Sweden
1,Yogesh,30,Australia
2,Ariyan,40,Canada


In [28]:
# Selecting a single column with df[<column name>] returns a Series.
name_column = df["Name"]
print(name_column)

print(type(name_column))

0    Aditya
1    Yogesh
2    Ariyan
Name: Name, dtype: object
<class 'pandas.core.series.Series'>


In [3]:
# Example frame with custom row labels ("s1".."s3") in place of the default
# integer index; used for the label-based lookups that follow.
data = dict(
    Name=["Alen", "Brian", "Caren"],
    Marks=[85, 90, 78],
)

df = pd.DataFrame(data=data, index=["s1", "s2", "s3"])

df

Unnamed: 0,Name,Marks
s1,Alen,85
s2,Brian,90
s3,Caren,78


# Accessing Individual Rows using loc

## Label-based indexing

## loc selects data by row and column labels (index names and column names).

In [None]:
# Jupyter renders only a cell's final expression, so the intermediate lookups
# are passed to display() to make every result visible, not just the last.

# A whole row, selected by its index label
display(df.loc["s1"])

# A single value: row label first, then column label
display(df.loc["s1", "Marks"])

display(df.loc["s3", "Name"])

# Slicing rows by label: unlike positional slicing, BOTH endpoints are
# included, so "s1":"s2" returns rows s1 and s2
df.loc["s1":"s2"]

Unnamed: 0,Name,Marks
s1,Alen,85
s2,Brian,90


# Accessing Elements using iloc

## Integer position-based indexing

## iloc selects data based on numerical index positions (0-based indexing).

In [12]:
# Redisplay df (row labels s1..s3) before the positional lookups that follow.
df

Unnamed: 0,Name,Marks
s1,Alen,85
s2,Brian,90
s3,Caren,78


In [5]:
# Jupyter renders only a cell's final expression, so the intermediate lookups
# are passed to display() to make every result visible, not just the last.

# A whole row by integer position (0 is the first row, 2 the third)
display(df.iloc[0])

display(df.iloc[2])

# A single value: row position 0, column position 1.
# df.iloc[0][1] reaches the same cell through two chained lookups; the single
# [row, column] call is the preferred form.
display(df.iloc[0, 1])

# Slicing rows by position: the stop position is excluded, so 0:2 returns
# the rows at positions 0 and 1
df.iloc[0 : 2]

Unnamed: 0,Name,Marks
s1,Alen,85
s2,Brian,90


In [15]:
# Accessing a specified element
#
# Rebuild the Name/Age/Country frame (default integer index) for the scalar
# lookups that follow.
data = dict(
    Name=["Aditya", "Yogesh", "Ariyan"],
    Age=[20, 30, 40],
    Country=["Sweden", "Australia", "Canada"],
)
df = pd.DataFrame(data=data)

df

Unnamed: 0,Name,Age,Country
0,Aditya,20,Sweden
1,Yogesh,30,Australia
2,Ariyan,40,Canada


In [21]:
# .at looks up a single value by labels: row label 1, column label "Age".
df.at[1, "Age"]

np.int64(30)

In [22]:
# Same label-based scalar lookup for row label 2, column label "Age".
df.at[2, "Age"]

np.int64(40)

In [16]:
# Accessing a specified element using iat
#
# .iat looks up a single value by integer positions: [row position, column position].

# Row position 2, column position 2. display() is required here because
# Jupyter renders only the last expression of a cell.
display(df.iat[2, 2])

# Row position 1, column position 1
df.iat[1, 1]

np.int64(30)

In [17]:
# Data manipulation with a DataFrame
#
# Show the current state of df before the column and row edits that follow.

df

Unnamed: 0,Name,Age,Country
0,Aditya,20,Sweden
1,Yogesh,30,Australia
2,Ariyan,40,Canada


In [67]:
# Adding a new column: assigning a list to a column name that does not exist
# yet creates that column (one value per row, in row order).

salaries = [50000, 60000, 70000]
df["Salary"] = salaries

df

Unnamed: 0,Name,Age,Country,Salary
0,Aditya,20,Sweden,50000
1,Yogesh,30,Australia,60000
2,Ariyan,40,Canada,70000


In [None]:
# Removing a column without modifying df
#
# DataFrame.drop returns a NEW DataFrame and leaves df untouched, so the
# Salary column is still present in df after this cell runs.
#
# The axis matters: drop defaults to axis=0 (row labels), so
# df.drop('Salary') and df.drop('Salary', axis=0) look for a ROW labelled
# 'Salary' and raise KeyError. axis=1 targets the column labels instead.

df.drop("Salary", axis=1)

Unnamed: 0,Name,Age,Country
0,Aditya,20,Sweden
1,Yogesh,30,Australia
2,Ariyan,40,Canada


In [None]:
# Deleting a column permanently from a DataFrame
#
# Rebinding df to the result of drop() makes the removal stick. This replaces
# drop(..., inplace=True), which has the same end result but raises KeyError
# if the cell is run a second time; errors="ignore" makes the cell safe to
# re-run once the column is already gone.

df = df.drop(columns="Salary", errors="ignore")

In [18]:
# Display df to confirm the Salary column is no longer present.
df

Unnamed: 0,Name,Age,Country
0,Aditya,20,Sweden
1,Yogesh,30,Australia
2,Ariyan,40,Canada


In [19]:
# Increase every value in the Age column by 1 (vectorised, no loop needed).
# Note that re-running this cell increments the ages again.

df["Age"] = df["Age"].add(1)
df

Unnamed: 0,Name,Age,Country
0,Aditya,21,Sweden
1,Yogesh,31,Australia
2,Ariyan,41,Canada


In [72]:
# Adding a new row: assigning to a row label that does not exist yet (3)
# appends a row; the values are given in column order (Name, Age, Country).

new_row = ["Hitesh", 51, "India"]
df.loc[3] = new_row
df

Unnamed: 0,Name,Age,Country
0,Aditya,21,Sweden
1,Yogesh,31,Australia
2,Ariyan,41,Canada
3,Hitesh,51,India


In [None]:
# Removing a row
#
# df.drop(3) on its own returns a new DataFrame without the row labelled 3
# and leaves df unchanged. Rebinding df to that result makes the removal
# stick. This replaces drop(..., inplace=True), which raises KeyError if the
# cell is run a second time; errors="ignore" makes the cell safe to re-run
# once the row is already gone.

df = df.drop(index=3, errors="ignore")

In [76]:
# Display df to confirm the row labelled 3 is no longer present.
df

Unnamed: 0,Name,Age,Country
0,Aditya,21,Sweden
1,Yogesh,31,Australia
2,Ariyan,41,Canada


In [2]:
# Load the sample dataset; the relative path is resolved against the
# notebook's working directory.
df2 = pd.read_csv("data.csv")

# Preview both ends of the file. Jupyter renders only a cell's final
# expression, so head() goes through display() to be shown as well.
display(df2.head(5))
df2.tail(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [None]:
# Display the data type of each column.
#
# sep="\n" puts the dtypes listing on its own line; passing "Data Types: \n"
# as one argument made print add a space in front of the first row,
# misaligning it with the rows below.

print("Data Types:", df2.dtypes, sep="\n")

Data Types: 
 Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object


In [26]:
# Describe the DataFrame: count, mean, std, min, quartiles and max for each
# numeric column.
#
# sep="\n" puts the table on its own line; passing "Statistical Summary: \n"
# as one argument made print add a space in front of the header row, shifting
# the column names out of line with their values.

print("Statistical Summary:", df2.describe(), sep="\n")

Statistical Summary: 
            Value       Sales
count  47.000000   46.000000
mean   51.744681  557.130435
std    29.050532  274.598584
min     2.000000  108.000000
25%    27.500000  339.000000
50%    54.000000  591.500000
75%    70.000000  767.500000
max    99.000000  992.000000


In [None]:
# Summary of each column: its name, non-null count and dtype, plus the
# index range and the frame's memory usage.

df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      50 non-null     object 
 1   Category  50 non-null     object 
 2   Value     47 non-null     float64
 3   Product   50 non-null     object 
 4   Sales     46 non-null     float64
 5   Region    50 non-null     object 
dtypes: float64(2), object(4)
memory usage: 2.5+ KB
