Q1. List any five functions of the pandas library with execution.

In [2]:
import pandas as pd

# Create a sample DataFrame
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 22, 28],
        'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']}
df = pd.DataFrame(data)

# 1. head(): Display the first few rows of the DataFrame
print(df.head())




      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   22      Chicago
3    David   28      Houston


In [3]:
# 2. describe(): Generate summary statistics of the DataFrame
print(df.describe())



         Age
count   4.00
mean   26.25
std     3.50
min    22.00
25%    24.25
50%    26.50
75%    28.50
max    30.00


In [4]:
# 3. info(): Display information about the DataFrame, including data types and non-null counts
print(df.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
dtypes: int64(1), object(2)
memory usage: 224.0+ bytes
None


In [5]:
# 4. groupby(): Group data based on a column and perform aggregate operations
grouped = df.groupby('City')['Age'].mean()
print(grouped)



City
Chicago        22.0
Houston        28.0
Los Angeles    30.0
New York       25.0
Name: Age, dtype: float64


In [6]:
# 5. sort_values(): Sort the DataFrame by one or more columns
sorted_df = df.sort_values(by='Age', ascending=False)
print(sorted_df)

      Name  Age         City
1      Bob   30  Los Angeles
3    David   28      Houston
0    Alice   25     New York
2  Charlie   22      Chicago


Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
DataFrame with a new index that starts from 1 and increments by 2 for each row.

In [7]:
import pandas as pd

def reindex_with_increment(df):
    new_index = pd.RangeIndex(start=1, step=2, stop=len(df)*2)
    df = df.set_index(new_index)
    return df

# Example DataFrame
data = {'A': [10, 20, 30, 40],
        'B': [50, 60, 70, 80],
        'C': [90, 100, 110, 120]}
df = pd.DataFrame(data)

# Re-index the DataFrame using the function
new_df = reindex_with_increment(df)
print(new_df)


    A   B    C
1  10  50   90
3  20  60  100
5  30  70  110
7  40  80  120


Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
function should print the sum to the console.

In [8]:
import pandas as pd

def calculate_sum_of_first_three(df):
    # Get the first three values from the 'Values' column
    first_three_values = df['Values'].iloc[:3]
    
    # Calculate the sum of the first three values
    sum_first_three = first_three_values.sum()
    
    # Print the sum to the console
    print("Sum of the first three values:", sum_first_three)

# Example DataFrame
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Call the function to calculate and print the sum
calculate_sum_of_first_three(df)


Sum of the first three values: 60


Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
'Word_Count' that contains the number of words in each row of the 'Text' column.

In [9]:
import pandas as pd

def add_word_count_column(df):
    # Split each row in the 'Text' column into words and calculate word counts
    df['Word_Count'] = df['Text'].apply(lambda x: len(x.split()))
    return df

# Example DataFrame
data = {'Text': ["This is a sample text.",
                 "Another example with more words.",
                 "Short text."]}
df = pd.DataFrame(data)

# Call the function to add the 'Word_Count' column
df_with_word_count = add_word_count_column(df)
print(df_with_word_count)


                               Text  Word_Count
0            This is a sample text.           5
1  Another example with more words.           5
2                       Short text.           2


Q5. How are DataFrame.size() and DataFrame.shape() different?

In [10]:
import pandas as pd

data = {'A': [1, 2, 3],
        'B': [4, 5, 6]}
df = pd.DataFrame(data)

size_of_df = df.size
print(size_of_df)  # Output: 6 (3 rows * 2 columns = 6 elements)

shape_of_df = df.shape
print(shape_of_df)  # Output: (3, 2) (3 rows, 2 columns)

6
(3, 2)


Q6. Which function of pandas do we use to read an excel file?

Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
addresses in the format 'username@domain.com'. Write a Python function that creates a new column
'Username' in df that contains only the username part of each email address.

In [11]:
import pandas as pd

def extract_username(df):
    # Extract username from email addresses using a lambda function
    df['Username'] = df['Email'].apply(lambda email: email.split('@')[0])
    return df

# Example DataFrame
data = {'Email': ['user1@example.com',
                  'user2@example.com',
                  'user3@example.com']}
df = pd.DataFrame(data)

# Call the function to extract usernames and create the 'Username' column
df_with_username = extract_username(df)
print(df_with_username)


               Email Username
0  user1@example.com    user1
1  user2@example.com    user2
2  user3@example.com    user3


In [12]:
import pandas as pd

def filter_dataframe(df):
    selected_rows = df[(df['A'] > 5) & (df['B'] < 10)]
    return selected_rows

# Example DataFrame
data = {'A': [3, 8, 6, 2, 9],
        'B': [5, 2, 9, 3, 1],
        'C': [1, 7, 4, 5, 2]}
df = pd.DataFrame(data)

# Call the function to filter the DataFrame
selected_df = filter_dataframe(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
median, and standard deviation of the values in the 'Values' column.

In [13]:
import pandas as pd

def calculate_stats(df):
    # Calculate mean, median, and standard deviation
    mean_value = df['Values'].mean()
    median_value = df['Values'].median()
    std_deviation = df['Values'].std()
    
    return mean_value, median_value, std_deviation

# Example DataFrame
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Call the function to calculate statistics
mean_val, median_val, std_dev = calculate_stats(df)

# Print the calculated statistics
print("Mean:", mean_val)
print("Median:", median_val)
print("Standard Deviation:", std_dev)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
should include the current day.

In [14]:
import pandas as pd

def calculate_moving_average(df):
    window_size = 7
    
    # Calculate moving average using rolling window
    df['MovingAverage'] = df['Sales'].rolling(window=window_size, min_periods=1).mean()
    
    return df

# Example DataFrame
data = {'Date': pd.date_range(start='2023-08-01', periods=10),
        'Sales': [10, 15, 20, 25, 30, 35, 40, 45, 50, 55]}
df = pd.DataFrame(data)

# Call the function to calculate and add the 'MovingAverage' column
df_with_moving_average = calculate_moving_average(df)
print(df_with_moving_average)


        Date  Sales  MovingAverage
0 2023-08-01     10           10.0
1 2023-08-02     15           12.5
2 2023-08-03     20           15.0
3 2023-08-04     25           17.5
4 2023-08-05     30           20.0
5 2023-08-06     35           22.5
6 2023-08-07     40           25.0
7 2023-08-08     45           30.0
8 2023-08-09     50           35.0
9 2023-08-10     55           40.0


In [17]:
import pandas as pd

def add_weekday_column(df):
    # Convert 'Date' column to datetime format if it's not already
    df['Date'] = pd.to_datetime(df['Date'])
   
    # Add 'Weekday' column with weekday names
    df['Weekday'] = df['Date'].dt.strftime('%A')
    
    return df

# Example DataFrame
data = {'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']}
df = pd.DataFrame(data)

# Call the function to add the 'Weekday' column
df_with_weekday = add_weekday_column(df)
print(df_with_weekday)


        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

In [19]:
import pandas as pd

def filter_by_date(df):
    # Convert 'Date' column to datetime format if it's not already
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Filter rows based on date range using .between()
    filtered_df = df[df['Date'].between('2023-01-01', '2023-01-31', inclusive=both)]
    
    return filtered_df

# Example DataFrame
data = {'Date': ['2023-01-01', '2023-01-15', '2023-01-31', '2023-02-15', '2023-03-01']}
df = pd.DataFrame(data)

# Call the function to filter by date range
filtered_by_date_df = filter_by_date(df)
print(filtered_by_date_df)


NameError: name 'both' is not defined