In [1]:
# Data handling.
import pandas as pd
import numpy as np
# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [3]:
# Load the ratings file with default parsing (the info() output below shows
# Date arriving as object dtype) and preview the first rows.
icecream = pd.read_csv("Ice Cream Ratings.csv")
icecream.head()

Unnamed: 0,Date,Flavor Rating,Texture Rating,Overall Rating
0,1/1/2022,0.22309,0.04022,0.600129
1,1/2/2022,0.635886,0.938476,0.106264
2,1/3/2022,0.442323,0.044154,0.598112
3,1/4/2022,0.389128,0.549676,0.489353
4,1/5/2022,0.386887,0.519439,0.98828


In [6]:
icecream.shape

(7, 4)

In [8]:
icecream.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Date            7 non-null      object 
 1   Flavor Rating   7 non-null      float64
 2   Texture Rating  7 non-null      float64
 3   Overall Rating  7 non-null      float64
dtypes: float64(3), object(1)
memory usage: 352.0+ bytes


In [9]:
# Re-read the same file, this time asking pandas to parse "Date" as datetimes.
icecream = pd.read_csv("Ice Cream Ratings.csv", parse_dates = ["Date"])
icecream

Unnamed: 0,Date,Flavor Rating,Texture Rating,Overall Rating
0,2022-01-01,0.22309,0.04022,0.600129
1,2022-01-02,0.635886,0.938476,0.106264
2,2022-01-03,0.442323,0.044154,0.598112
3,2022-01-04,0.389128,0.549676,0.489353
4,2022-01-05,0.386887,0.519439,0.98828
5,2022-01-06,0.877984,0.193588,0.832827
6,2022-01-07,0.140995,0.32511,0.105147


In [10]:
icecream.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   Date            7 non-null      datetime64[ns]
 1   Flavor Rating   7 non-null      float64       
 2   Texture Rating  7 non-null      float64       
 3   Overall Rating  7 non-null      float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 352.0 bytes


## The `parse_dates` parameter tells `read_csv` to convert the listed column(s) to the `datetime64[ns]` data type while loading.

In [11]:
# set_index returns a new DataFrame; the result is only displayed here, so
# `icecream` itself still has "Date" as an ordinary column afterwards.
icecream.set_index("Date")

Unnamed: 0_level_0,Flavor Rating,Texture Rating,Overall Rating
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,0.22309,0.04022,0.600129
2022-01-02,0.635886,0.938476,0.106264
2022-01-03,0.442323,0.044154,0.598112
2022-01-04,0.389128,0.549676,0.489353
2022-01-05,0.386887,0.519439,0.98828
2022-01-06,0.877984,0.193588,0.832827
2022-01-07,0.140995,0.32511,0.105147


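### Why set Date as the index?
- Dates are unique most of the time, so using them as the index makes it easy to select rows by date.
- Note: `set_index` returns a new DataFrame; assign the result (e.g. `icecream_by_date = icecream.set_index("Date")`) if you want to keep it.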

# Merging and Joining

In [12]:
# Load the first LOTR table and preview it.
data = pd.read_csv("LOTR.csv")
data.head()

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [13]:
data.shape

(4, 3)

In [14]:
# Load the second LOTR table; it shares the FellowshipID and FirstName columns
# with the first one.
data2 = pd.read_csv("LOTR 2.csv")
data2.head()

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [15]:
# Shape of the second dataset that was just loaded (data2, not data).
data2.shape

(5, 3)

## Merge LOTR and LOTR2 dataset

In [16]:
# No `on`/`how` given: pandas joins on every column name the two frames share
# (here FellowshipID and FirstName) and performs an inner join.
pd.merge(data, data2)

Unnamed: 0,FellowshipID,FirstName,Skills,Age
0,1001,Frodo,Hiding,50
1,1002,Samwise,Gardening,39


## merge() performs an inner join by default, matching on every column the two datasets share (here `FellowshipID` and `FirstName`).
## What is an inner join?
- It keeps only the rows whose key values appear in both datasets.

In [17]:
# Joining on FellowshipID only: FirstName exists in both frames, so pandas
# keeps both copies and tells them apart with the _x / _y suffixes.
pd.merge(data, data2, on ="FellowshipID", how = "inner")

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50
1,1002,Samwise,Gardening,Samwise,39


## Notes:
- The datasets you want to join need at least one common (key) column.
- The `on` parameter of merge() names the common column(s) to join on.
- The `how` parameter specifies the type of join you want to perform.

- Types of joins: the four types covered here are
1. Inner (default)
2. Outer
3. Right
4. Left

(pandas also accepts `how="cross"` for a Cartesian product.)

In [18]:
## outer join: keep every FellowshipID found in either frame; columns coming
## from the frame that has no matching row are left as NaN

pd.merge(data, data2, on ="FellowshipID", how = "outer")

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50.0
1,1002,Samwise,Gardening,Samwise,39.0
2,1003,Gandalf,Spells,,
3,1004,Pippin,Fireworks,,
4,1006,,,Legolas,2931.0
5,1007,,,Elrond,6520.0
6,1008,,,Barromir,51.0


In [19]:
## left join

In [20]:
data

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [21]:
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [22]:
# Left join with `data` on the left: all four rows of `data` are kept.
pd.merge(data, data2, on = "FellowshipID", how = "left")

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50.0
1,1002,Samwise,Gardening,Samwise,39.0
2,1003,Gandalf,Spells,,
3,1004,Pippin,Fireworks,,


In [23]:
# Same left join with the frames swapped: now all five rows of `data2` are kept.
pd.merge(data2, data, on = "FellowshipID", how = "left")

Unnamed: 0,FellowshipID,FirstName_x,Age,FirstName_y,Skills
0,1001,Frodo,50,Frodo,Hiding
1,1002,Samwise,39,Samwise,Gardening
2,1006,Legolas,2931,,
3,1007,Elrond,6520,,
4,1008,Barromir,51,,


#### In a left join the order of the datasets matters, i.e. `(data, data2)` and `(data2, data)` produce different outputs. Every row of the left dataset is kept; matching rows from the right dataset are joined to it, and left rows with no match get NaN in the right dataset's columns.

# Right join

In [24]:
# Right join: every row of the right-hand frame (`data2`) is kept.
pd.merge(data, data2, on = "FellowshipID", how = "right")

Unnamed: 0,FellowshipID,FirstName_x,Skills,FirstName_y,Age
0,1001,Frodo,Hiding,Frodo,50
1,1002,Samwise,Gardening,Samwise,39
2,1006,,,Legolas,2931
3,1007,,,Elrond,6520
4,1008,,,Barromir,51


In [25]:
# Same right join with the frames swapped: now every row of `data` is kept.
pd.merge(data2, data, on = "FellowshipID", how = 'right')

Unnamed: 0,FellowshipID,FirstName_x,Age,FirstName_y,Skills
0,1001,Frodo,50.0,Frodo,Hiding
1,1002,Samwise,39.0,Samwise,Gardening
2,1003,,,Gandalf,Spells
3,1004,,,Pippin,Fireworks


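#### The order of the datasets in merge() matters for a right join too, i.e. `(data, data2)` and `(data2, data)` give different outputs. Every row of the right dataset is kept; matching rows from the left dataset are joined to it, and right rows with no match get NaN in the left dataset's columns.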

In [26]:
data

Unnamed: 0,FellowshipID,FirstName,Skills
0,1001,Frodo,Hiding
1,1002,Samwise,Gardening
2,1003,Gandalf,Spells
3,1004,Pippin,Fireworks


In [27]:
data2

Unnamed: 0,FellowshipID,FirstName,Age
0,1001,Frodo,50
1,1002,Samwise,39
2,1006,Legolas,2931
3,1007,Elrond,6520
4,1008,Barromir,51


In [28]:
# Joining on both shared columns, so FirstName appears once and no _x / _y
# suffixes are needed.
pd.merge(data, data2, on = ["FellowshipID", "FirstName"], how = "inner")

Unnamed: 0,FellowshipID,FirstName,Skills,Age
0,1001,Frodo,Hiding,50
1,1002,Samwise,Gardening,39


# Sample data

In [32]:
# Employee records stored column-wise: each key is a column name and each list
# holds that column's values in row order.
dic = dict(
    Name=["Anil", "Sam", "Riya", "Reema", "Seema", "Sunil", "Ashok", "Alok", "Sita"],
    Age=[23, 28, 35, 47, 56, 22, 30, 40, 59],
    Gender=["M", "M", "F", "F", "F", "M", "M", "M", "F"],
    Salary=[25000, 38000, 42000, 49000, 57000, 68000, 76000, 85000, 92000],
)
dic

{'Name': ['Anil',
  'Sam',
  'Riya',
  'Reema',
  'Seema',
  'Sunil',
  'Ashok',
  'Alok',
  'Sita'],
 'Age': [23, 28, 35, 47, 56, 22, 30, 40, 59],
 'Gender': ['M', 'M', 'F', 'F', 'F', 'M', 'M', 'M', 'F'],
 'Salary': [25000, 38000, 42000, 49000, 57000, 68000, 76000, 85000, 92000]}

In [33]:
# Build a DataFrame from the dict: keys become column names.
# NOTE: this rebinds `data`, which held the LOTR table in the merge section.
data = pd.DataFrame(dic)
data

Unnamed: 0,Name,Age,Gender,Salary
0,Anil,23,M,25000
1,Sam,28,M,38000
2,Riya,35,F,42000
3,Reema,47,F,49000
4,Seema,56,F,57000
5,Sunil,22,M,68000
6,Ashok,30,M,76000
7,Alok,40,M,85000
8,Sita,59,F,92000


## Create a new column in the dataset that shows each employee's salary after a 10% hike

In [None]:
# Element-by-element version: the lambda is called once per Salary value.
# The next cell computes the same column with a single vectorised multiply.
data['hiked_salary'] = data['Salary'].apply(lambda x : x * 1.1)

In [34]:
# Vectorised 10% raise: scale the whole Salary column in one operation.
data["hiked_salary"] = data["Salary"].mul(1.1)

In [35]:
data

Unnamed: 0,Name,Age,Gender,Salary,hiked_salary
0,Anil,23,M,25000,27500.0
1,Sam,28,M,38000,41800.0
2,Riya,35,F,42000,46200.0
3,Reema,47,F,49000,53900.0
4,Seema,56,F,57000,62700.0
5,Sunil,22,M,68000,74800.0
6,Ashok,30,M,76000,83600.0
7,Alok,40,M,85000,93500.0
8,Sita,59,F,92000,101200.0


## Create an "Age bracket" column from the Age column using these conditions:

- 0 - 25 = Young Professional
- 26 - 40 = Associate or Mid Senior
- 41 - 60 = Senior to CXO
- Over 60 = Retired

In [36]:
def age_bracket(x):
    """Map an age in years to its career-stage label.

    Brackets (upper bounds are inclusive, so there are no gaps between them):
        0 up to 25       -> "Young Professional"
        above 25 to 40   -> "Associate or Mid Senior"
        above 40 to 60   -> "Senior CXO"
        above 60         -> "Retired"

    Parameters
    ----------
    x : int or float
        Age in years. Fractional ages are allowed.

    Returns
    -------
    str or None
        The bracket label, or None when ``x`` is negative or NaN, since no
        bracket is defined for such values.
    """
    # `not x >= 0` is True for negative numbers and also for NaN (every
    # comparison with NaN is False), so both are treated as "no bracket"
    # instead of falling through to "Retired".
    if not x >= 0:
        return None
    # Each test only needs the upper bound: earlier returns already excluded
    # the lower brackets, which also closes the 25-26 / 40-41 gaps that
    # fractional ages used to fall into.
    if x <= 25:
        return "Young Professional"
    if x <= 40:
        return "Associate or Mid Senior"
    if x <= 60:
        return "Senior CXO"
    return "Retired"

## Apply function:

- apply(function)

In [37]:
# Call age_bracket once per Age value and store the returned labels in a new
# column.
data["Age bracket"] = data['Age'].apply(age_bracket)

In [38]:
data

Unnamed: 0,Name,Age,Gender,Salary,hiked_salary,Age bracket
0,Anil,23,M,25000,27500.0,Young Professional
1,Sam,28,M,38000,41800.0,Associate or Mid Senior
2,Riya,35,F,42000,46200.0,Associate or Mid Senior
3,Reema,47,F,49000,53900.0,Senior CXO
4,Seema,56,F,57000,62700.0,Senior CXO
5,Sunil,22,M,68000,74800.0,Young Professional
6,Ashok,30,M,76000,83600.0,Associate or Mid Senior
7,Alok,40,M,85000,93500.0,Associate or Mid Senior
8,Sita,59,F,92000,101200.0,Senior CXO


## Task: implement the same logic as above using pd.cut() or pd.qcut()

In [39]:
# An integer `bins` splits the observed Age range into that many equal-width
# intervals.
pd.cut(data['Age'], bins = 4)

0    (21.963, 31.25]
1    (21.963, 31.25]
2      (31.25, 40.5]
3      (40.5, 49.75]
4      (49.75, 59.0]
5    (21.963, 31.25]
6    (21.963, 31.25]
7      (31.25, 40.5]
8      (49.75, 59.0]
Name: Age, dtype: category
Categories (4, interval[float64, right]): [(21.963, 31.25] < (31.25, 40.5] < (40.5, 49.75] < (49.75, 59.0]]

In [44]:
# Bracket edges follow the age-bracket rules: (0-25], (25-40], (40-60], (60+).
# np.inf keeps the last bracket open-ended so ages above 100 are still
# labelled "R" instead of becoming NaN.
bin_edges = [0, 25, 40, 60, np.inf]
# One label per interval, in the same order as the edges.
bin_values = ["YP", "A/MS", "CXO", "R"]
# include_lowest=True makes the first interval include its left edge, so an
# age of exactly 0 falls into "YP" rather than being left unlabelled.
data["Age_bins"] = pd.cut(
    data["Age"],
    bins=bin_edges,
    labels=bin_values,
    include_lowest=True,
)
data

Unnamed: 0,Name,Age,Gender,Salary,hiked_salary,Age bracket,Age_bins
0,Anil,23,M,25000,27500.0,Young Professional,YP
1,Sam,28,M,38000,41800.0,Associate or Mid Senior,A/MS
2,Riya,35,F,42000,46200.0,Associate or Mid Senior,A/MS
3,Reema,47,F,49000,53900.0,Senior CXO,CXO
4,Seema,56,F,57000,62700.0,Senior CXO,CXO
5,Sunil,22,M,68000,74800.0,Young Professional,YP
6,Ashok,30,M,76000,83600.0,Associate or Mid Senior,A/MS
7,Alok,40,M,85000,93500.0,Associate or Mid Senior,A/MS
8,Sita,59,F,92000,101200.0,Senior CXO,CXO


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   Name          9 non-null      object  
 1   Age           9 non-null      int64   
 2   Gender        9 non-null      object  
 3   Salary        9 non-null      int64   
 4   hiked_salary  9 non-null      float64 
 5   Age bracket   9 non-null      object  
 6   Age_bins      9 non-null      category
dtypes: category(1), float64(1), int64(2), object(3)
memory usage: 773.0+ bytes


In [47]:
# qcut places the bin edges at quantiles: 4 asks for quartiles, so each bin
# holds roughly the same number of rows.
pd.qcut(data['Age'], 4)

0    (21.999, 28.0]
1    (21.999, 28.0]
2      (28.0, 35.0]
3      (35.0, 47.0]
4      (47.0, 59.0]
5    (21.999, 28.0]
6      (28.0, 35.0]
7      (35.0, 47.0]
8      (47.0, 59.0]
Name: Age, dtype: category
Categories (4, interval[float64, right]): [(21.999, 28.0] < (28.0, 35.0] < (35.0, 47.0] < (47.0, 59.0]]

## With an integer number of bins, cut() creates equal-width intervals, whereas qcut() places the edges at quantiles, so the interval widths vary but each bin holds roughly the same number of values.

In [49]:
icecream

Unnamed: 0,Date,Flavor Rating,Texture Rating,Overall Rating
0,2022-01-01,0.22309,0.04022,0.600129
1,2022-01-02,0.635886,0.938476,0.106264
2,2022-01-03,0.442323,0.044154,0.598112
3,2022-01-04,0.389128,0.549676,0.489353
4,2022-01-05,0.386887,0.519439,0.98828
5,2022-01-06,0.877984,0.193588,0.832827
6,2022-01-07,0.140995,0.32511,0.105147


In [60]:
# The .dt accessor exposes datetime components; it is available here because
# Date was parsed as datetime64 when the file was re-read with parse_dates.
icecream["Date"].dt.month

0    1
1    1
2    1
3    1
4    1
5    1
6    1
Name: Date, dtype: int64

## df.pivot_table
## df.pivot
## pd.crosstab