## DATA WRANGLING

In [3]:
import pandas as pd
import numpy as np

In [11]:
# Sample table 1: people (ids 1-4) with their ages.
data1 = dict(
    id=[1, 2, 3, 4],
    name=['Alice', 'Bob', 'Charlie', 'David'],
    age=[24, 27, 22, 32],
)
df1 = pd.DataFrame(data1)

# Sample table 2: people (ids 3-6) with their salaries.
# Ids 3 and 4 appear in both tables, which is what the join examples rely on.
data2 = dict(
    id=[3, 4, 5, 6],
    name=['Charlie', 'David', 'Edward', 'Frank'],
    salary=[50000, 60000, 70000, 80000],
)
df2 = pd.DataFrame(data2)

# Show both frames, each under its own heading line.
print("DataFrame 1:", df1, "\nDataFrame 2:", df2, sep="\n")


DataFrame 1:
   id     name  age
0   1    Alice   24
1   2      Bob   27
2   3  Charlie   22
3   4    David   32

DataFrame 2:
   id     name  salary
0   3  Charlie   50000
1   4    David   60000
2   5   Edward   70000
3   6    Frank   80000


In [15]:
#1. Merge two DataFrames on a single key.

# Default (inner) merge on 'id'. Other columns the two frames share are not
# join keys, so they come back with _x / _y suffixes.
df_merged_single_key = df1.merge(df2, on='id')
print("\nMerged on single key (id):", df_merged_single_key, sep="\n")



Merged on single key (id):
   id   name_x  age   name_y  salary
0   3  Charlie   22  Charlie   50000
1   4    David   32    David   60000


In [17]:
#2. Merge two DataFrames on multiple keys.

# Give each frame a department column so there is a second key to join on.
# NOTE: this adds 'dept' to df1 and df2 in place.
df1['dept'] = ['HR', 'IT', 'Finance', 'IT']
df2['dept'] = ['Finance', 'IT', 'HR', 'Finance']

# Rows are paired only where both 'id' and 'dept' agree.
df_merged_multiple_keys = df1.merge(df2, on=['id', 'dept'])
print("\nMerged on multiple keys (id and dept):", df_merged_multiple_keys, sep="\n")



Merged on multiple keys (id and dept):
   id   name_x  age     dept   name_y  salary
0   3  Charlie   22  Finance  Charlie   50000
1   4    David   32       IT    David   60000


In [21]:
#3. Perform an outer join, inner join, left join, and right join.
# All four merges use 'id' as the key; only the `how` strategy differs.

# Outer: every id found in either frame.
df_outer_join = df1.merge(df2, how='outer', on='id')
print("\nOuter Join:", df_outer_join, sep="\n")

# Inner: only ids present in both frames.
df_inner_join = df1.merge(df2, how='inner', on='id')
print("\nInner Join:", df_inner_join, sep="\n")

# Left: every id from df1.
df_left_join = df1.merge(df2, how='left', on='id')
print("\nLeft Join:", df_left_join, sep="\n")

# Right: every id from df2.
df_right_join = df1.merge(df2, how='right', on='id')
print("\nRight Join:", df_right_join, sep="\n")



Outer Join:
   id   name_x   age   dept_x   name_y   salary   dept_y
0   1    Alice  24.0       HR      NaN      NaN      NaN
1   2      Bob  27.0       IT      NaN      NaN      NaN
2   3  Charlie  22.0  Finance  Charlie  50000.0  Finance
3   4    David  32.0       IT    David  60000.0       IT
4   5      NaN   NaN      NaN   Edward  70000.0       HR
5   6      NaN   NaN      NaN    Frank  80000.0  Finance

Inner Join:
   id   name_x  age   dept_x   name_y  salary   dept_y
0   3  Charlie   22  Finance  Charlie   50000  Finance
1   4    David   32       IT    David   60000       IT

Left Join:
   id   name_x  age   dept_x   name_y   salary   dept_y
0   1    Alice   24       HR      NaN      NaN      NaN
1   2      Bob   27       IT      NaN      NaN      NaN
2   3  Charlie   22  Finance  Charlie  50000.0  Finance
3   4    David   32       IT    David  60000.0       IT

Right Join:
   id   name_x   age   dept_x   name_y  salary   dept_y
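0   3  Charlie  22.0  Finance  Charlie   50000  Finance
1   4    David  32.0       IT    David   60000       IT
2   5      NaN   NaN      NaN   Edward   70000       HR
3   6      NaN   NaN      NaN    Frank   80000  Finance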

In [25]:
#4. Concatenate two DataFrames along rows.
# Stack df2 underneath df1 and renumber the row index from 0.
df_concat_rows = pd.concat((df1, df2), axis=0, ignore_index=True)
print("\nConcatenate along rows:", df_concat_rows, sep="\n")



Concatenate along rows:
   id     name   age     dept   salary
0   1    Alice  24.0       HR      NaN
1   2      Bob  27.0       IT      NaN
2   3  Charlie  22.0  Finance      NaN
3   4    David  32.0       IT      NaN
4   3  Charlie   NaN  Finance  50000.0
5   4    David   NaN       IT  60000.0
6   5   Edward   NaN       HR  70000.0
7   6    Frank   NaN  Finance  80000.0


In [27]:
#5. Concatenate two DataFrames along columns.
# Place df2 to the right of df1, aligning rows by index label.
df_concat_cols = pd.concat((df1, df2), axis='columns')
print("\nConcatenate along columns:", df_concat_cols, sep="\n")



Concatenate along columns:
   id     name  age     dept  id     name  salary     dept
0   1    Alice   24       HR   3  Charlie   50000  Finance
1   2      Bob   27       IT   4    David   60000       IT
2   3  Charlie   22  Finance   5   Edward   70000       HR
3   4    David   32       IT   6    Frank   80000  Finance


In [29]:
#6. Concatenate a list of DataFrames.

# df1 appears twice on purpose, so its rows show up twice in the result.
df_list = [df1, df2, df1]

# Stack every frame in the list vertically and renumber the rows from 0.
df_concat_list = pd.concat(df_list, axis=0, ignore_index=True)
print("\nConcatenate list of DataFrames:", df_concat_list, sep="\n")



Concatenate list of DataFrames:
    id     name   age     dept   salary
0    1    Alice  24.0       HR      NaN
1    2      Bob  27.0       IT      NaN
2    3  Charlie  22.0  Finance      NaN
3    4    David  32.0       IT      NaN
4    3  Charlie   NaN  Finance  50000.0
5    4    David   NaN       IT  60000.0
6    5   Edward   NaN       HR  70000.0
7    6    Frank   NaN  Finance  80000.0
8    1    Alice  24.0       HR      NaN
9    2      Bob  27.0       IT      NaN
10   3  Charlie  22.0  Finance      NaN
11   4    David  32.0       IT      NaN


In [31]:
#7. Reshape data using the melt function to go from wide to long format.

# Keep 'id' as the identifier; unpivot 'name' and 'age' into
# (variable, value) pairs, one row per id per melted column.
df_melted = df1.melt(id_vars=['id'], value_vars=['name', 'age'])
print("\nMelted DataFrame (wide to long):", df_melted, sep="\n")



Melted DataFrame (wide to long):
   id variable    value
0   1     name    Alice
1   2     name      Bob
2   3     name  Charlie
3   4     name    David
4   1      age       24
5   2      age       27
6   3      age       22
7   4      age       32


In [35]:
#8. Create a pivot table to summarize data.

# One row per department, holding the mean of 'age' within it.
df_pivot = pd.pivot_table(df1, values='age', index='dept', aggfunc='mean')
print("\nPivot table summarizing average age by department:", df_pivot, sep="\n")



Pivot table summarizing average age by department:
          age
dept         
Finance  22.0
HR       24.0
IT       29.5


In [37]:
#9. Group data by one or more columns and perform aggregation functions (e.g., sum, mean, count).
# For each department, compute the total, average, and number of 'age' values.
df_grouped = df1.groupby(by='dept').aggregate(dict(age=['sum', 'mean', 'count']))
print("\nGrouped data with aggregation functions (sum, mean, count):", df_grouped, sep="\n")



Grouped data with aggregation functions (sum, mean, count):
        age            
        sum  mean count
dept                   
Finance  22  22.0     1
HR       24  24.0     1
IT       59  29.5     2


In [39]:
#10. Apply multiple aggregation functions to grouped data.
# For each department, compute the total, average, largest, and smallest 'age'.
df_grouped_multi = df1.groupby(by='dept').aggregate(dict(age=['sum', 'mean', 'max', 'min']))
print("\nGrouped data with multiple aggregation functions:", df_grouped_multi, sep="\n")



Grouped data with multiple aggregation functions:
        age              
        sum  mean max min
dept                     
Finance  22  22.0  22  22
HR       24  24.0  24  24
IT       59  29.5  32  27


In [41]:
#11. Use the groupby function to group data and apply custom functions.
# Define a custom function
def custom_agg(series):
    """Return the spread of ``series``: its maximum minus its minimum."""
    lowest, highest = series.min(), series.max()
    return highest - lowest

# Per department, reduce the 'age' column with custom_agg (max minus min).
df_grouped_custom = df1.groupby(by='dept').aggregate(dict(age=custom_agg))
print("\nGrouped data with custom aggregation function (range of ages):", df_grouped_custom, sep="\n")



Grouped data with custom aggregation function (range of ages):
         age
dept        
Finance    0
HR         0
IT         5
