In [None]:
# Q1. List any five functions of the pandas library with execution.




# 1. read_csv(): This function is used to read data from a CSV file and create a DataFrame.


import pandas as pd

# Load the contents of 'data.csv' into a DataFrame
data = pd.read_csv(filepath_or_buffer='data.csv')

# Print the loaded table as plain text
print(data)


# 2. head(): This function returns the first n rows of a DataFrame (5 by default), which is handy for a quick look at the data.


import pandas as pd

# Build a four-row frame, one inner list per person (name, age)
data = pd.DataFrame(
    [['Alice', 25], ['Bob', 30], ['Charlie', 35], ['David', 40]],
    columns=['Name', 'Age'],
)

# Print only the top two rows
print(data.head(n=2))


# 3. info(): This function prints a concise summary of the DataFrame, including the column names, data types, non-null counts, and memory usage.


import pandas as pd

# Create a DataFrame; the None in 'Age' is a missing value, so info() reports
# one fewer non-null entry for that column than for the others.
data = pd.DataFrame({'Name': ['Alice', 'Bob', 'Charlie'],
                     'Age': [25, 30, None],
                     'Salary': [50000, 60000, 70000]})

# Display the summary information of the DataFrame.
# info() writes the summary to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
data.info()


# 4. groupby(): This function is used to group the data in a DataFrame based on one or more columns.


import pandas as pd

# Build the frame row by row: (name, department, salary)
data = pd.DataFrame(
    [
        ['Alice', 'HR', 50000],
        ['Bob', 'IT', 60000],
        ['Charlie', 'HR', 55000],
        ['David', 'IT', 65000],
    ],
    columns=['Name', 'Department', 'Salary'],
)

# Average the 'Salary' values within each 'Department' group
grouped_data = data.groupby('Department')['Salary'].agg('mean')

# Print one mean salary per department
print(grouped_data)


# 5. describe(): This function provides descriptive statistics of the DataFrame, such as count, mean, standard deviation, minimum, and maximum values.


import pandas as pd

# Columns A, B and C hold 1..5 scaled by 1, 10 and 100 respectively
data = pd.DataFrame({
    column: [scale * step for step in range(1, 6)]
    for column, scale in (('A', 1), ('B', 10), ('C', 100))
})

# Print count, mean, std, min, quartiles and max for every column
print(data.describe())


# These are just a few examples of the many functions available in the pandas library for data manipulation and analysis.

In [6]:
# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
# DataFrame with a new index that starts from 1 and increments by 2 for each row.



# To re-index a Pandas DataFrame with a new index that starts from 1 and increments by 2 for each row, you can use the `reset_index()` function and manipulate the resulting index. Here's a Python function that accomplishes this:


import pandas as pd

def reindex_with_increment(df):
    """Return a copy of ``df`` whose index is 1, 3, 5, ... in row order.

    The existing index is discarded, not kept as a column, and the
    DataFrame passed in is left untouched.
    """
    # reset_index hands back a new frame numbered 0, 1, 2, ...
    renumbered = df.reset_index(drop=True)

    # Position k becomes label 2k + 1, i.e. start at 1 and step by 2
    renumbered.index = renumbered.index * 2 + 1

    return renumbered


# In this function, the `reset_index()` function is used to reset the index of the DataFrame to the default numerical index. The `drop=True` argument ensures that the old index is dropped and not added as a new column in the DataFrame.

# Then, the `index` attribute of the DataFrame is reassigned with a new index that starts from 1 and increments by 2. The expression `(df.index * 2) + 1` calculates the new index values based on the original index.

# Here's an example usage of the function:


# Sample frame given row by row: each inner list is one row's A, B and C values
df = pd.DataFrame(
    [[10, 40, 70],
     [20, 50, 80],
     [30, 60, 90]],
    columns=['A', 'B', 'C'],
)

# Replace the default 0, 1, 2 index with 1, 3, 5
reindexed_df = reindex_with_increment(df)

# Print the re-indexed frame
print(reindexed_df)





# In the output, you can see that the DataFrame has been re-indexed with a new index starting from 1 and incrementing by 2 for each row.

    A   B   C
1  10  40  70
3  20  50  80
5  30  60  90


In [5]:
# Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
# iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
# function should print the sum to the console.
# For example, if the 'Values' column of df contains the values [10, 20, 30, 40, 50], your function should
# calculate and print the sum of the first three values, which is 60.


import pandas as pd

def calculate_sum_of_values(df):
    """Print the sum of the first three entries of ``df['Values']``.

    If the column has fewer than three rows, whatever is there is summed.
    Nothing is returned.
    """
    # Positional slice of the leading three rows of the column
    leading_three = df['Values'].iloc[:3]

    print("Sum of the first three values:", leading_three.sum())

    
# Sample frame whose 'Values' column runs 10, 20, ..., 50
df = pd.DataFrame({'Values': list(range(10, 60, 10))})

# Prints the total of the first three entries (10 + 20 + 30)
calculate_sum_of_values(df)


Sum of the first three values: 60


In [7]:
# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
# 'Word_Count' that contains the number of words in each row of the 'Text' column.



import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' entry.

    Words are the whitespace-separated tokens produced by ``str.split()``.
    Non-string entries are converted with ``str()`` before counting.
    Missing entries (``None`` / ``NaN``) get a count of 0 instead of being
    counted as the one-word string their ``str()`` form would produce.

    Args:
        df: DataFrame with a 'Text' column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the 'Word_Count' column.
    """
    text = df['Text']

    # Count on the stringified value so non-string cells do not raise
    counts = text.apply(lambda value: len(str(value).split()))

    # str(None) -> 'None' and str(NaN) -> 'nan' would each register as one
    # word, so force the count to 0 wherever the source cell is missing
    df['Word_Count'] = counts.mask(text.isna(), 0)

    return df

# Three short sentences in a single 'Text' column
df = pd.DataFrame(
    ['Hello, how are you?', 'I am doing well.', 'Python is great!'],
    columns=['Text'],
)

# Attach the per-row word counts
df = add_word_count_column(df)

# Print the frame together with its new 'Word_Count' column
print(df)


                  Text  Word_Count
0  Hello, how are you?           4
1     I am doing well.           4
2     Python is great!           3


In [8]:
# Q5. How are DataFrame.size() and DataFrame.shape() different?


# The `DataFrame.size` and `DataFrame.shape` are both attributes in Pandas that provide information about the dimensions of a DataFrame, but they represent different aspects of the DataFrame's structure.

# 1. DataFrame.size:
#    - `DataFrame.size` returns the total number of elements in the DataFrame.
#    - It represents the product of the number of rows and columns in the DataFrame.
#    - The returned value is an integer.
#    - It includes both the NaN (missing) and non-NaN elements in the DataFrame.
#    - It can be calculated as `DataFrame.shape[0] * DataFrame.shape[1]`.

# 2. DataFrame.shape:
#    - `DataFrame.shape` returns a tuple representing the dimensions of the DataFrame.
#    - The returned tuple contains two elements: the number of rows and the number of columns, respectively.
#    - It provides an overview of the DataFrame's structure.
#    - The returned values are integers.
#    - It does not include the actual data elements of the DataFrame, only the dimensions.

# Here's an example to demonstrate the difference:


import pandas as pd

# Three columns of three values each
data = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
df = pd.DataFrame(data)

# size is the total cell count; shape is the (rows, columns) tuple
size, shape = df.size, df.shape

print("Size:", size)
print("Shape:", shape)




# In this example, the DataFrame has 3 rows and 3 columns. The `DataFrame.size` returns 9, which is the total number of elements (3 rows x 3 columns). The `DataFrame.shape` returns (3, 3), indicating that the DataFrame has 3 rows and 3 columns.

Size: 9
Shape: (3, 3)


In [None]:
# Q6. Which function of pandas do we use to read an excel file?



# To read an Excel file in pandas, you can use the `read_excel()` function. This function is specifically designed to read data from Excel files and create a DataFrame.

# Here's an example of how to use the `read_excel()` function:


import pandas as pd

# Load 'data.xlsx' into a DataFrame using read_excel's default options
df = pd.read_excel(io='data.xlsx')

# Print the loaded table as plain text
print(df)


# In this example, `read_excel()` is used to read the Excel file named 'data.xlsx' and create a DataFrame called `df`. By default, the function reads the first sheet of the workbook and uses its first row as the column headers.

# You can specify additional parameters in the `read_excel()` function to customize the reading process, such as the sheet name, specific columns to read, data range, etc. Here's an example that demonstrates reading a specific sheet from the Excel file:


import pandas as pd

# Load only the worksheet named 'Sheet1' from 'data.xlsx'
df = pd.read_excel(io='data.xlsx', sheet_name='Sheet1')

# Print the loaded table as plain text
print(df)


# In this case, the `sheet_name` parameter is set to 'Sheet1' to read only the data from that particular sheet.

# Remember to ensure that you have the necessary dependencies installed, such as `pandas` and `openpyxl` (the engine pandas uses to read `.xlsx` files), before using the `read_excel()` function.

In [10]:
# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
# addresses in the format 'username@domain.com'. Write a Python function that creates a new column
# 'Username' in df that contains only the username part of each email address.




# To create a new column 'Username' in a Pandas DataFrame `df` that contains only the username part of each email address in the 'Email' column, you can use the `str.split()` method along with the `str.get()` method. Here's a Python function that accomplishes this:


import pandas as pd

def extract_username(df):
    """Add a 'Username' column holding the text before the '@' of each 'Email'.

    The DataFrame is modified in place and the same object is returned.
    """
    # Each address becomes a list of the pieces found between '@' characters
    email_pieces = df['Email'].str.split('@')

    # The leading piece of every list is the username
    df['Username'] = email_pieces.str.get(0)

    return df


# In this function, we use the `str.split()` method to split each email address in the 'Email' column into a list of strings, using '@' as the separator. Then, we use the `str.get()` method to retrieve the first element of each list, which represents the username part of the email address.

# Here's an example usage of the function:


# Three addresses in a single 'Email' column
df = pd.DataFrame(
    ['john@example.com', 'jane@example.com', 'bob@example.com'],
    columns=['Email'],
)

# Derive the 'Username' column from the addresses
df = extract_username(df)

# Print the frame together with its new 'Username' column
print(df)


# In the output, you can see that a new column 'Username' has been added to the DataFrame, containing only the username part of each email address from the 'Email' column.

              Email Username
0  john@example.com     john
1  jane@example.com     jane
2   bob@example.com      bob


In [11]:
# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
# all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
# function should return a new DataFrame that contains only the selected rows.
# For example, if df contains the following values:

#   A B C
# 0 3 5 1
# 1 8 2 7
# 2 6 9 4
# 3 2 3 5
# 4 9 1 2



# To select rows from a Pandas DataFrame `df` where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10, you can use boolean indexing. Here's a Python function that achieves this:


import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where 'A' is above 5 and 'B' is below 10.

    Both comparisons are strict. The result is a new DataFrame that keeps
    the original index labels of the matching rows.
    """
    # One boolean Series per condition, combined element-wise with &
    a_above_five = df['A'] > 5
    b_below_ten = df['B'] < 10

    return df[a_above_five & b_below_ten]


# In this function, we use boolean indexing with the conditions `(df['A'] > 5)` and `(df['B'] < 10)` to filter the DataFrame `df`. The `&` operator is used to combine the conditions with an element-wise AND operation.

# Here's an example usage of the function with the provided DataFrame values:


# Sample values from the question, one list per column
data = dict(
    A=[3, 8, 6, 2, 9],
    B=[5, 2, 9, 3, 1],
    C=[1, 7, 4, 5, 2],
)
df = pd.DataFrame(data)

# Keep only the rows with A > 5 and B < 10
selected_df = select_rows(df)

# Print the matching rows
print(selected_df)




# In the output, you can see that the function has returned a new DataFrame containing only the rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10.

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [12]:
# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
# median, and standard deviation of the values in the 'Values' column.




import pandas as pd

def calculate_stats(df):
    """Return ``(mean, median, standard deviation)`` of ``df['Values']``.

    The standard deviation is whatever ``Series.std()`` yields with its
    default arguments.
    """
    values = df['Values']

    return values.mean(), values.median(), values.std()


# Sample frame whose 'Values' column runs 10, 20, ..., 50
df = pd.DataFrame({'Values': list(range(10, 60, 10))})

# Unpack the three statistics returned for the column
mean, median, std_dev = calculate_stats(df)

# Report each statistic on its own line
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std_dev)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


In [13]:
# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
# create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
# for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
# should include the current day.


import pandas as pd

def calculate_moving_average(df):
    """Add a 'MovingAverage' column: trailing 7-row mean of 'Sales'.

    The window covers the current row and up to six rows before it, in the
    frame's existing row order. Rows with fewer than seven available values
    are averaged over what is there (``min_periods=1``). The 'Date' column
    is not consulted. The DataFrame is modified in place and returned.
    """
    trailing_window = df['Sales'].rolling(window=7, min_periods=1)
    df['MovingAverage'] = trailing_window.mean()

    return df


# Ten consecutive days starting 2023-06-01, with sales of 10, 20, ..., 100
df = pd.DataFrame({
    'Date': pd.date_range('2023-06-01', periods=10),
    'Sales': list(range(10, 101, 10)),
})

# Attach the trailing seven-row average of 'Sales'
df = calculate_moving_average(df)

# Print the frame together with its new 'MovingAverage' column
print(df)


        Date  Sales  MovingAverage
0 2023-06-01     10           10.0
1 2023-06-02     20           15.0
2 2023-06-03     30           20.0
3 2023-06-04     40           25.0
4 2023-06-05     50           30.0
5 2023-06-06     60           35.0
6 2023-06-07     70           40.0
7 2023-06-08     80           50.0
8 2023-06-09     90           60.0
9 2023-06-10    100           70.0


In [14]:
# Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
# column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
# Monday, Tuesday) corresponding to each date in the 'Date' column.



import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the English day name of each 'Date'.

    Args:
        df: DataFrame with a 'Date' column that ``pd.to_datetime`` can
            parse. It is modified in place: 'Date' is converted to
            datetime and 'Weekday' is added.

    Returns:
        The same DataFrame object.
    """
    # Convert 'Date' column to datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # day_name() is the dedicated accessor and yields English names by
    # default, whereas strftime('%A') follows the process locale
    df['Weekday'] = df['Date'].dt.day_name()

    return df



# Three consecutive dates, supplied as strings in a single 'Date' column
df = pd.DataFrame(
    ['2023-06-01', '2023-06-02', '2023-06-03'],
    columns=['Date'],
)

# Attach the day name for every date
df = add_weekday_column(df)

# Print the frame together with its new 'Weekday' column
print(df)


        Date   Weekday
0 2023-06-01  Thursday
1 2023-06-02    Friday
2 2023-06-03  Saturday


In [15]:
# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
# function to select all rows where the date is between '2023-01-01' and '2023-01-31'.




import pandas as pd

def select_rows_between_dates(df, start='2023-01-01', end='2023-01-31'):
    """Return the rows whose 'Date' falls on or between two calendar dates.

    Both bounds are inclusive by calendar day: any timestamp on the ``end``
    date is selected, whatever its time of day. Any time-of-day component
    in ``start`` or ``end`` is ignored.

    Args:
        df: DataFrame with a 'Date' column that ``pd.to_datetime`` can
            parse. The column is converted to datetime in place.
        start: First date to include (anything ``pd.Timestamp`` accepts).
        end: Last date to include (anything ``pd.Timestamp`` accepts).

    Returns:
        A new DataFrame containing only the matching rows, with their
        original index labels.
    """
    # Convert 'Date' column to datetime format
    df['Date'] = pd.to_datetime(df['Date'])

    # Compare against a half-open range [start 00:00, day after end 00:00).
    # A plain between(start, end) treats `end` as midnight and would drop
    # every timestamp later on the final day.
    lower = pd.Timestamp(start).normalize()
    upper = pd.Timestamp(end).normalize() + pd.Timedelta(days=1)
    in_range = (df['Date'] >= lower) & (df['Date'] < upper)

    return df[in_range]


# Four dates: one before January 2023, two inside it, one after
df = pd.DataFrame(
    ['2022-12-31', '2023-01-01', '2023-01-15', '2023-02-01'],
    columns=['Date'],
)

# Keep only the rows dated within January 2023
selected_df = select_rows_between_dates(df)

# Print the matching rows
print(selected_df)


        Date
1 2023-01-01
2 2023-01-15
