In [2]:
# Import pandas
import pandas as pd

# Example DataFrame
data = {'Name': ['John', 'Alice', 'Bob', 'Emily', 'Michael'],
        'Age': [28, 24, 22, 29, 35],
        'City': ['New York', 'Los Angeles', 'Chicago', 'San Francisco', 'Boston']}

df = pd.DataFrame(data)

# 1. head(): Returns the first n rows of the DataFrame (default n=5).
print(df.head(3))

# 2. tail(): Returns the last n rows of the DataFrame (default n=5).
print(df.tail(2))

# 3. describe(): Generates descriptive statistics of the DataFrame.
print(df.describe())

# 4. info(): Provides a concise summary of the DataFrame, including data types and non-null values.
print(df.info())

# 5. groupby(): Group DataFrame using a mapper or by a Series of columns and perform operations on the grouped data.
grouped = df.groupby('City')
print(grouped.mean())


    Name  Age         City
0   John   28     New York
1  Alice   24  Los Angeles
2    Bob   22      Chicago
      Name  Age           City
3    Emily   29  San Francisco
4  Michael   35         Boston
             Age
count   5.000000
mean   27.600000
std     5.029911
min    22.000000
25%    24.000000
50%    28.000000
75%    29.000000
max    35.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes
None
                Age
City               
Boston         35.0
Chicago        22.0
Los Angeles    24.0
New York       28.0
San Francisco  29.0


  print(grouped.mean())


In [3]:
import pandas as pd

def reindex_with_increment(df):
    new_index = pd.RangeIndex(start=1, stop=len(df)*2, step=2)
    df_reindexed = df.copy()
    df_reindexed.index = new_index
    return df_reindexed

# Example usage:
data = {'A': [10, 20, 30, 40, 50],
        'B': [100, 200, 300, 400, 500],
        'C': [1000, 2000, 3000, 4000, 5000]}
df = pd.DataFrame(data)

reindexed_df = reindex_with_increment(df)
print(reindexed_df)


    A    B     C
1  10  100  1000
3  20  200  2000
5  30  300  3000
7  40  400  4000
9  50  500  5000


In [4]:
import pandas as pd

def sum_first_three_values(df):
    first_three_values_sum = df['Values'].head(3).sum()
    print("Sum of the first three values:", first_three_values_sum)

# Example usage:
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

sum_first_three_values(df)


Sum of the first three values: 60


In [5]:
import pandas as pd

def count_words(df):
    df['Word_Count'] = df['Text'].apply(lambda x: len(str(x).split()))
    return df

# Example usage:
data = {'Text': ['Hello, how are you?', 'This is a pandas tutorial', 'Python is great']}
df = pd.DataFrame(data)

df = count_words(df)
print(df)


                        Text  Word_Count
0        Hello, how are you?           4
1  This is a pandas tutorial           5
2            Python is great           3


In [8]:
#Difference between DataFrame.size() and DataFrame.shape():

#DataFrame.size: It returns the total number of elements in the DataFrame, which is the product of the number of rows and columns.
#DataFrame.shape: It returns a tuple representing the dimensions of the DataFrame, with the number of rows and columns.

import pandas as pd

data = {'A': [1, 2, 3],
        'B': [4, 5, 6]}
df = pd.DataFrame(data)

print("DataFrame.size:", df.size)      

print("DataFrame.shape:", df.shape)    



DataFrame.size: 6
DataFrame.shape: (3, 2)


In [None]:
#The function to read an Excel file in pandas is pd.read_excel().

import pandas as pd

# Assuming 'data.xlsx' is the name of the Excel file
df = pd.read_excel('data.xlsx')
print(df)

In [9]:
import pandas as pd

def extract_username(df):
    df['Username'] = df['Email'].str.split('@').str[0]
    return df

# Example usage:
data = {'Email': ['john.doe@example.com', 'alice.smith@example.com', 'bob@gmail.com']}
df = pd.DataFrame(data)

df = extract_username(df)
print(df)


                     Email     Username
0     john.doe@example.com     john.doe
1  alice.smith@example.com  alice.smith
2            bob@gmail.com          bob


In [10]:
import pandas as pd

def select_rows(df):
    selected_rows = df[(df['A'] > 5) & (df['B'] < 10)]
    return selected_rows

# Example usage:
data = {'A': [3, 8, 6, 2, 9],
        'B': [5, 2, 9, 3, 1],
        'C': [1, 7, 4, 5, 2]}
df = pd.DataFrame(data)

selected_df = select_rows(df)
print(selected_df)


   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [11]:
import pandas as pd

def calculate_stats(df):
    mean_value = df['Values'].mean()
    median_value = df['Values'].median()
    std_value = df['Values'].std()
    return mean_value, median_value, std_value

# Example usage:
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

mean, median, std = calculate_stats(df)
print("Mean:", mean)
print("Median:", median)
print("Standard Deviation:", std)


Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


In [12]:
import pandas as pd

def calculate_moving_average(df):
    df['MovingAverage'] = df['Sales'].rolling(window=7, min_periods=1).mean()
    return df

# Example usage:
data = {'Sales': [10, 15, 20, 25, 30, 35, 40, 45, 50]}
df = pd.DataFrame(data)

df = calculate_moving_average(df)
print(df)


   Sales  MovingAverage
0     10           10.0
1     15           12.5
2     20           15.0
3     25           17.5
4     30           20.0
5     35           22.5
6     40           25.0
7     45           30.0
8     50           35.0


In [13]:
import pandas as pd

def add_weekday_column(df):
    df['Weekday'] = pd.to_datetime(df['Date']).dt.strftime('%A')
    return df

# Example usage:
data = {'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']}
df = pd.DataFrame(data)

df = add_weekday_column(df)
print(df)


         Date    Weekday
0  2023-01-01     Sunday
1  2023-01-02     Monday
2  2023-01-03    Tuesday
3  2023-01-04  Wednesday
4  2023-01-05   Thursday


In [14]:
import pandas as pd

def select_rows_between_dates(df):
    start_date = '2023-01-01'
    end_date = '2023-01-31'
    mask = (df['Date'] >= start_date) & (df['Date'] <= end_date)
    selected_rows = df[mask]
    return selected_rows

# Example usage:
data = {'Date': ['2023-01-01', '2023-01-15', '2023-01-30', '2023-02-05']}
df = pd.DataFrame(data)

selected_df = select_rows_between_dates(df)
print(selected_df)


         Date
0  2023-01-01
1  2023-01-15
2  2023-01-30


In [None]:
#The first and foremost necessary library to use the basic functions of pandas is import pandas as pd. 
#This import statement allows you to access the pandas library in your Python code using the alias 'pd'.
#It is a convention to use 'pd' as the alias for pandas,
#but you can choose any other valid alias if you prefer.