## Data Manipulation and Analysis with Pandas

Data manipulation and analysis are crucial skills in data science. The Pandas library in Python provides powerful tools for handling and analyzing data efficiently. 

With Pandas, you can easily clean, transform, and analyze your data, making it an indispensable tool for any data scientist.


In [1]:
import pandas as pd

In [3]:
df = pd.read_csv('data/data.csv')
df.head()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [4]:
df.tail()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
45,2023-02-15,B,99.0,Product2,599.0,West
46,2023-02-16,B,6.0,Product1,938.0,South
47,2023-02-17,B,69.0,Product3,143.0,West
48,2023-02-18,C,65.0,Product3,182.0,North
49,2023-02-19,C,11.0,Product3,708.0,North


In [5]:
df.describe()

Unnamed: 0,Value,Sales
count,47.0,46.0
mean,51.744681,557.130435
std,29.050532,274.598584
min,2.0,108.0
25%,27.5,339.0
50%,54.0,591.5
75%,70.0,767.5
max,99.0,992.0


In [6]:
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

### Handling Missing Values

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Date      50 non-null     object 
 1   Category  50 non-null     object 
 2   Value     47 non-null     float64
 3   Product   50 non-null     object 
 4   Sales     46 non-null     float64
 5   Region    50 non-null     object 
dtypes: float64(2), object(4)
memory usage: 2.5+ KB


In [8]:
df.isnull()

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [9]:
df[df.isnull()]

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,,,,,,
1,,,,,,
2,,,,,,
3,,,,,,
4,,,,,,
5,,,,,,
6,,,,,,
7,,,,,,
8,,,,,,
9,,,,,,


In [10]:
df.isnull().any(axis=1)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11     True
12    False
13    False
14    False
15     True
16    False
17     True
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28     True
29    False
30    False
31    False
32    False
33     True
34    False
35     True
36    False
37     True
38    False
39    False
40    False
41    False
42    False
43    False
44    False
45    False
46    False
47    False
48    False
49    False
dtype: bool

In [11]:
df.isnull().any()

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [12]:
df.isnull().sum()

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [14]:
df_filled = df.fillna(0)
df_filled

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North
5,2023-01-06,B,54.0,Product3,192.0,West
6,2023-01-07,A,16.0,Product1,936.0,East
7,2023-01-08,C,89.0,Product1,488.0,West
8,2023-01-09,C,37.0,Product3,772.0,West
9,2023-01-10,A,22.0,Product2,834.0,West


### Filling Missing Values

In [15]:
df['sales_filled'] = df['Sales'].fillna(df['Sales'].mean())
df

Unnamed: 0,Date,Category,Value,Product,Sales,Region,sales_filled
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0
5,2023-01-06,B,54.0,Product3,192.0,West,192.0
6,2023-01-07,A,16.0,Product1,936.0,East,936.0
7,2023-01-08,C,89.0,Product1,488.0,West,488.0
8,2023-01-09,C,37.0,Product3,772.0,West,772.0
9,2023-01-10,A,22.0,Product2,834.0,West,834.0


In [16]:
df.dtypes

Date             object
Category         object
Value           float64
Product          object
Sales           float64
Region           object
sales_filled    float64
dtype: object

### Renaming Columns

In [17]:

df.rename(columns={'Sales': 'sales', 'Quantity': 'quantity'}, inplace=True)
df.head()

Unnamed: 0,Date,Category,Value,Product,sales,Region,sales_filled
0,2023-01-01,A,28.0,Product1,754.0,East,754.0
1,2023-01-02,B,39.0,Product3,110.0,North,110.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0
3,2023-01-04,B,8.0,Product1,522.0,East,522.0
4,2023-01-05,B,26.0,Product3,869.0,North,869.0


### Changing Data Types

In [20]:
df['new_value'] = df['Value'].fillna(df['Value'].mean()).astype(int)
df.head()

Unnamed: 0,Date,Category,Value,Product,sales,Region,sales_filled,new_value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26


In [22]:
df['new_sales'] = df['sales'].apply(lambda x: x * 0.8 if x > 100 else x)
df.head()

Unnamed: 0,Date,Category,Value,Product,sales,Region,sales_filled,new_value,new_sales
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,603.2
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,88.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,318.4
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,417.6
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,695.2


### Aggregating and Grouping Data

In [23]:
df.head()

Unnamed: 0,Date,Category,Value,Product,sales,Region,sales_filled,new_value,new_sales
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,603.2
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,88.0
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,318.4
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,417.6
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,695.2


In [24]:
grouped_mean = df.groupby('Product')['Value'].mean()
print(grouped_mean)

Product
Product1    46.214286
Product2    52.800000
Product3    55.166667
Name: Value, dtype: float64


In [26]:
grouped_sum = df.groupby(['Product', 'Region'])['Value'].sum()
print(grouped_sum)


Product   Region
Product1  East      292.0
          North       9.0
          South     100.0
          West      246.0
Product2  East       56.0
          North     127.0
          South     181.0
          West      428.0
Product3  East      202.0
          North     203.0
          South     215.0
          West      373.0
Name: Value, dtype: float64


In [27]:
grouped_sum = df.groupby(['Product', 'Region'])['Value'].mean()
print(grouped_sum)

Product   Region
Product1  East      41.714286
          North      4.500000
          South     50.000000
          West      82.000000
Product2  East      28.000000
          North     63.500000
          South     60.333333
          West      53.500000
Product3  East      50.500000
          North     40.600000
          South     71.666667
          West      62.166667
Name: Value, dtype: float64


In [29]:
## Aggregate Multiple Functions

grouped_agg = df.groupby('Region')['Value'].agg(['mean', 'sum', 'count'])  # It returns each region's mean, sum, and count
print(grouped_agg)


             mean     sum  count
Region                          
East    42.307692   550.0     13
North   37.666667   339.0      9
South   62.000000   496.0      8
West    61.588235  1047.0     17


### Merging and Joining DataFrames

In [37]:
# Create Sample DataFrames

df1 = pd.DataFrame({
    'Key': ['A', 'B', 'C'],
    'Value1': [100, 200, 300]
})

df2 = pd.DataFrame({
    'Key': ['A', 'B', 'D'],
    'Value2': [150, 250, 350]
})


In [38]:
df1

Unnamed: 0,Key,Value1
0,A,100
1,B,200
2,C,300


In [39]:
df2

Unnamed: 0,Key,Value2
0,A,150
1,B,250
2,D,350


In [None]:
## Merge DataFrames on the Key columns

merged = pd.merge(df1, df2, on='Key', how='outer') # Include all keys from both DataFrames
merged


Unnamed: 0,Key,Value1,Value2
0,A,100.0,150.0
1,B,200.0,250.0
2,C,300.0,
3,D,,350.0


In [None]:
merged = pd.merge(df1, df2, on='Key', how='inner') # Only keys present in both DataFrames
merged

Unnamed: 0,Key,Value1,Value2
0,A,100,150
1,B,200,250


In [42]:
merged = pd.merge(df1, df2, on='Key', how='left') # Only keys from df1
merged

Unnamed: 0,Key,Value1,Value2
0,A,100,150.0
1,B,200,250.0
2,C,300,


In [43]:
merged = pd.merge(df1, df2, on='Key', how='right') # Only keys from df2
merged

Unnamed: 0,Key,Value1,Value2
0,A,100.0,150
1,B,200.0,250
2,D,,350
