<a href="https://colab.research.google.com/github/YesInAJiffy/Python/blob/main/1.%20Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


* .head() returns the first few rows (the “head” of the DataFrame).
* .info() shows information on each of the columns, such as the data type and number of missing values.
* .shape returns the number of rows and columns of the DataFrame.
* .describe() calculates a few summary statistics for each column.

In [None]:
#Information on the Dataframe

import pandas as pd

# Create a sample DataFrame: three people with their age and salary
data = {'Name': ['Alice', 'Bob', 'Charlie'],
        'Age': [25, 30, 22],
        'Salary': [50000, 60000, 45000]}

df = pd.DataFrame(data)

# Display information about the DataFrame.
# .info() writes its report straight to stdout and returns None, so it is
# called bare; wrapping it in print() would emit an extra "None" line.
df.info()

# First two rows only
print(df.head(2))

# (number of rows, number of columns)
print(df.shape)

# Summary statistics for the numeric columns
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   Salary  3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes
None
    Name  Age  Salary
0  Alice   25   50000
1    Bob   30   60000
(3, 3)
             Age        Salary
count   3.000000      3.000000
mean   25.666667  51666.666667
std     4.041452   7637.626158
min    22.000000  45000.000000
25%    23.500000  47500.000000
50%    25.000000  50000.000000
75%    27.500000  55000.000000
max    30.000000  60000.000000


In [None]:
#SORTING
import pandas as pd

# Three-person sample table, one list per column
data = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 22],
    Salary=[50000, 60000, 45000],
)
df = pd.DataFrame(data)

# Order the rows by 'Salary', highest first; df itself is left untouched
df_sorted = df.sort_values('Salary', ascending=False)

# Show the reordered rows (the original index labels travel with each row)
print(df_sorted)


      Name  Age  Salary
1      Bob   30   60000
0    Alice   25   50000
2  Charlie   22   45000


In [None]:
#SORTING ON MULTIPLE COLUMNS
import pandas as pd

# Four-person sample table, one list per column
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 22, 28],
    Salary=[50000, 60000, 45000, 55000],
)
df = pd.DataFrame(data)

# Primary key 'Age' ascending; ties on age are broken by 'Salary' descending.
# The two lists line up position by position.
sort_keys = ['Age', 'Salary']
sort_ascending = [True, False]
df_sorted = df.sort_values(sort_keys, ascending=sort_ascending)

# Show the reordered rows
print(df_sorted)


      Name  Age  Salary
2  Charlie   22   45000
0    Alice   25   50000
3    David   28   55000
1      Bob   30   60000


In [None]:
#Select subset from a dataframe
import pandas as pd

# Four-person sample table, one list per column
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, 30, 22, 28],
    Salary=[50000, 60000, 45000, 55000],
)
df = pd.DataFrame(data)

# Row selection: a boolean mask keeps only the rows where 'Age' exceeds 25
older_than_25 = df['Age'] > 25
subset_df = df.loc[older_than_25]
print(subset_df)

# Column selection: keep every row, but only the 'Name' and 'Salary' columns
selected_columns_df = df.loc[:, ['Name', 'Salary']]
print(selected_columns_df)


    Name  Age  Salary
1    Bob   30   60000
3  David   28   55000
      Name  Salary
0    Alice   50000
1      Bob   60000
2  Charlie   45000
3    David   55000


In [1]:
# AGG function to aggregate
import pandas as pd

# Six rows split across two groups, 'A' and 'B'
data = dict(
    Group=['A', 'A', 'B', 'B', 'A', 'B'],
    Value1=[10, 20, 30, 40, 50, 60],
    Value2=[100, 200, 300, 400, 500, 600],
)
df = pd.DataFrame(data)

# Per-column aggregation plan:
#   Value1 -> total within each group
#   Value2 -> average and largest value within each group
aggregations = {
    'Value1': 'sum',
    'Value2': ['mean', 'max'],
}

# One output row per group; columns become a (column, statistic) MultiIndex
grouped = df.groupby('Group')
result = grouped.agg(aggregations)

print(result)


      Value1      Value2     
         sum        mean  max
Group                        
A         80  266.666667  500
B        130  433.333333  600


In [None]:
#Quantile
import pandas as pd
import numpy as np

# Fixed seed so the sample, and therefore the quantiles printed below,
# are identical on every Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Create a sample DataFrame: 10 random integers in [1, 100) (upper bound exclusive)
data = {'Value': rng.integers(1, 100, 10)}
df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Calculate the 25th, 50th (median), and 75th percentiles (quantiles).
# Passing a list returns a Series indexed by the requested quantile levels.
quantiles = [0.25, 0.5, 0.75]
result_quantiles = df['Value'].quantile(quantiles)

print("\nQuantiles:")
print(result_quantiles)


In [None]:
#Cumulative Max
import pandas as pd

# Six sample values wrapped in a named Series
data = [3, 1, 7, 2, 8, 4]
series = pd.Series(data, name='Values')

# Show the input first
print("Original Series:")
print(series)

# Running maximum: each position holds the largest value seen so far
cumulative_max = series.cummax()

print("\nCumulative Maximum:")
print(cumulative_max)


In [None]:
#Cumsum example
import pandas as pd

# Six sample values wrapped in a named Series
data = [3, 1, 7, 2, 8, 4]
series = pd.Series(data, name='Values')

# Show the input first
print("Original Series:")
print(series)

# Running total: each position holds the sum of all values up to and including it
cumulative_sum = series.cumsum()

print("\nCumulative Sum:")
print(cumulative_sum)


In [3]:
#Dropping Duplicate
import pandas as pd

# Sample table in which rows 2 and 4 repeat rows 0 and 1 exactly
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    Age=[25, 30, 25, 35, 30],
    City=['New York', 'San Francisco', 'New York', 'Los Angeles', 'San Francisco'],
)
df = pd.DataFrame(data)

# Show the input, duplicates included
print("Original DataFrame:")
print(df)

# A row counts as a duplicate only when every column matches an earlier row;
# the first occurrence is the one that is kept.
df_no_duplicates = df.drop_duplicates(keep='first')

# Show what is left; surviving rows keep their original index labels
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)


Original DataFrame:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2    Alice   25       New York
3  Charlie   35    Los Angeles
4      Bob   30  San Francisco

DataFrame after removing duplicates:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
3  Charlie   35    Los Angeles


In [2]:
#Dropping duplicate based on a column
import pandas as pd

# Sample table in which four of the five rows share Age == 25
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob'],
    Age=[25, 30, 25, 25, 25],
    City=['New York', 'San Francisco', 'New York', 'Los Angeles', 'San Francisco'],
)
df = pd.DataFrame(data)

# Show the input
print("Original DataFrame:")
print(df)

# Only the 'Age' column is compared here: for each distinct age the first
# row is kept and later rows with the same age are dropped, even if their
# other columns differ.
df_no_duplicates = df.drop_duplicates(subset=["Age"])

# Show what is left; surviving rows keep their original index labels
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)


Original DataFrame:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2    Alice   25       New York
3  Charlie   25    Los Angeles
4      Bob   25  San Francisco

DataFrame after removing duplicates:
    Name  Age           City
0  Alice   25       New York
1    Bob   30  San Francisco


In [17]:
#Dropping on two columns
import pandas as pd

# Sample table: rows 0 and 2 share both Name and Age; the two 'Bob' rows
# have different ages, so they are distinct on the (Name, Age) pair.
data = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie', 'Bob', 'Rnady'],
    Age=[25, 30, 25, 25, 25, 21],
    City=['New York', 'San Francisco', 'New York', 'Los Angeles', 'San Francisco', "NY"],
)
df = pd.DataFrame(data)

# Show the input
print("Original DataFrame:")
print(df)

# Rows are compared on the (Name, Age) pair only; the first row of each
# distinct pair is kept.
dedupe_keys = ["Name", "Age"]
df_no_duplicates = df.drop_duplicates(subset=dedupe_keys)

# Show what is left; surviving rows keep their original index labels
print("\nDataFrame after removing duplicates:")
print(df_no_duplicates)

# The remaining cells all tally the surviving 'Name' values
names = df_no_duplicates["Name"]

# Default call: absolute counts, most frequent first
print("\nCount the names occurences")
print(names.value_counts())

# sort=True is already the default, so this output matches the one above
print("\nCount the names occurences")
print(names.value_counts(sort=True))

# normalize=True reports each name's share of the total instead of a count
print("\nCount the names occurences")
print(names.value_counts(normalize=True))


Original DataFrame:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
2    Alice   25       New York
3  Charlie   25    Los Angeles
4      Bob   25  San Francisco
5    Rnady   21             NY

DataFrame after removing duplicates:
      Name  Age           City
0    Alice   25       New York
1      Bob   30  San Francisco
3  Charlie   25    Los Angeles
4      Bob   25  San Francisco
5    Rnady   21             NY

Count the names occurences
Bob        2
Alice      1
Charlie    1
Rnady      1
Name: Name, dtype: int64

Count the names occurences
Bob        2
Alice      1
Charlie    1
Rnady      1
Name: Name, dtype: int64

Count the names occurences
Bob        0.4
Alice      0.2
Charlie    0.2
Rnady      0.2
Name: Name, dtype: float64
