In [1]:
import pandas as pd


 Pandas is one of the most widely used libraries in Python for data manipulation and analysis. It provides powerful data structures like ``DataFrame`` and ``Series`` that make it easy to work with structured data, whether it comes from a CSV file, a database, or another source.

# Creating a DataFrame
 - A DataFrame is a 2-dimensional labeled data structure (like a table). It’s the primary object you'll work with in pandas.
 - You can create a DataFrame by passing a dictionary where the keys are the column names and the values are lists holding each column's values (one entry per row).

In [2]:

# Creating a DataFrame from a dictionary: each key becomes a column name and
# each list supplies that column's values, one entry per row.
# pandas is already imported as `pd` in the first cell, so it is not
# imported a second time here.
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df = pd.DataFrame(data)
print(df)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


# Basic DataFrame Operations



In [4]:
# Preview the top of the frame: head() returns up to 5 rows by default,
# head(n) returns up to n rows.
default_preview = df.head()
print(default_preview)
print("--------------------")
two_row_preview = df.head(2)
print(two_row_preview)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
--------------------
    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles


In [5]:
# tail() with no argument returns up to the last 5 rows
last_rows = df.tail()
print(last_rows)


      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [6]:
# shape is a (rows, columns) tuple
n_rows, n_cols = df.shape
print((n_rows, n_cols))


(3, 3)


In [7]:
# Show the column labels first, then the row labels (the index)
for axis_labels in (df.columns, df.index):
    print(axis_labels)


Index(['Name', 'Age', 'City'], dtype='object')
RangeIndex(start=0, stop=3, step=1)


# Selecting Data

In [8]:
# Two equivalent ways to pull out one column as a Series:
names_by_key = df['Name']   # key-like access, works for any column name
names_by_attr = df.Name     # attribute-like access, valid for simple column names
print(names_by_key)
print(names_by_attr)


0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object
0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object


# Selecting multiple columns:

In [None]:
# Passing a list of column names returns a DataFrame with just those columns
wanted_columns = ['Name', 'Age']
print(df[wanted_columns])


# Selecting rows by index:

In [10]:
divider = "________________"

# Position-based selection: iloc[0] is the first row, returned as a Series
first_by_position = df.iloc[0]
print(first_by_position)
print(divider)

# Position-based slice: iloc[:2] covers positions 0 and 1 (the end is exclusive)
first_two_rows = df.iloc[:2]
print(first_two_rows)
print(divider)

# Label-based selection: loc[0] looks up the row whose index label is 0
first_by_label = df.loc[0]
print(first_by_label)

Name       Alice
Age           25
City    New York
Name: 0, dtype: object
________________
    Name  Age         City
0  Alice   25     New York
1    Bob   30  Los Angeles
________________
Name       Alice
Age           25
City    New York
Name: 0, dtype: object


# Boolean Indexing (Conditional Selection):

In [11]:
# Element-wise comparison: produces a boolean Series, True where Age exceeds 30.
# Left as the last bare expression so the notebook displays the result.
df['Age'].gt(30)


Unnamed: 0,Age
0,False
1,False
2,True


In [12]:
# Index the frame with a boolean mask to keep only rows where Age exceeds 30
is_over_30 = df['Age'] > 30
print(df[is_over_30])

      Name  Age     City
2  Charlie   35  Chicago


In [13]:
# Row labels of the rows that pass the Age > 30 filter
rows_over_30 = df[df['Age'] > 30]
print(rows_over_30.index)


Index([2], dtype='int64')


In [14]:
# Same matching row labels, converted from an Index to a plain Python list
rows_over_30 = df[df['Age'] > 30]
matching_labels = rows_over_30.index.tolist()
print(matching_labels)


[2]


# Modifying Data

In [15]:
# Add a new column by assigning a list with one value per existing row
country_values = ['USA', 'USA', 'USA']
df['Country'] = country_values
print(df)


      Name  Age         City Country
0    Alice   25     New York     USA
1      Bob   30  Los Angeles     USA
2  Charlie   35      Chicago     USA


In [16]:
# Row labels of the two people being edited
alice_label, bob_label = 0, 1

# .at assigns a single cell addressed by (row label, column label)
df.at[alice_label, 'Age'] = 26
# .loc given one row label and one column label also assigns a single cell
df.loc[bob_label, 'City'] = 'San Francisco'
print(df)


      Name  Age           City Country
0    Alice   26       New York     USA
1      Bob   30  San Francisco     USA
2  Charlie   35        Chicago     USA


# Dropping columns or rows:

In [17]:
# Remove the 'Country' column, then the row whose index label is 1.
# drop() returns a new frame each time, so the chained result is bound back to df.
df = df.drop(columns='Country').drop(index=1)

print(df)


      Name  Age      City
0    Alice   26  New York
2  Charlie   35   Chicago


# Handling Missing Data

In [18]:
# Boolean frame: True where a value is missing (NaN), False otherwise
missing_mask = df.isnull()
print(missing_mask)
# Summing the mask column-wise counts the missing values in each column
print(missing_mask.sum())


    Name    Age   City
0  False  False  False
2  False  False  False
Name    0
Age     0
City    0
dtype: int64


# Filling missing values:

In [19]:
# Replace any missing ages with the mean of the 'Age' column
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)


# Dropping rows with missing values:

In [None]:
# Keep only complete rows: a row is dropped if any of its values is NaN
# (axis=0 and how='any' are the defaults, spelled out here for clarity)
df = df.dropna(axis=0, how='any')


# Aggregating Data
- You can group data by one or more columns and then aggregate with various functions like sum, mean, count, etc.

In [21]:
# Example data: Alice and Bob each appear twice, so two cities have two rows
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob'],
    'Age': [25, 30, 35, 26, 31],
    'City': ['New York', 'Los Angeles', 'Chicago', 'New York', 'Los Angeles']
}

df = pd.DataFrame(data)

# Group by 'City' and calculate the average age.
# The column is selected with [['Age']] before aggregating: the first
# positional parameter of GroupBy.mean() is `numeric_only`, so mean('Age')
# never selected a column -- it only worked because a non-empty string is
# truthy. The double brackets keep the result a DataFrame with an 'Age' column.
grouped = df.groupby('City')[['Age']].mean()
print(grouped)


              Age
City             
Chicago      35.0
Los Angeles  30.5
New York     25.5


In [22]:
# Build the per-city grouping once and reuse it for both aggregations
people_by_city = df.groupby('City')

# Total of 'Age' within each city
sum_ages = people_by_city['Age'].sum()
print(sum_ages)

# Number of rows (people) in each city
count_people = people_by_city.size()
print(count_people)


City
Chicago        35
Los Angeles    61
New York       51
Name: Age, dtype: int64
City
Chicago        1
Los Angeles    2
New York       2
dtype: int64


# Merging and Joining DataFrames

In [None]:
# Two small tables that share the 'Name' key column
data1 = dict(Name=['Alice', 'Bob', 'Charlie'], Age=[25, 30, 35])
data2 = dict(Name=['Alice', 'Bob', 'Charlie'],
             City=['New York', 'Los Angeles', 'Chicago'])

df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)

# Combine the rows of both frames whose 'Name' values match
merged_df = df1.merge(df2, on='Name')
print(merged_df)


# Sorting Data

In [23]:
# Sort by a single column, highest Age first.
# Stored under its own name: previously this result was assigned to
# `df_sorted` and immediately overwritten by the multi-column sort below,
# so it was never visible.
df_sorted_by_age = df.sort_values('Age', ascending=False)
print(df_sorted_by_age)

# Sort by multiple columns: City ascending, then Age descending within a city.
# `df_sorted` ends up holding this result, exactly as before.
df_sorted = df.sort_values(by=['City', 'Age'], ascending=[True, False])
print(df_sorted)


# Reading and Writing Data

In [25]:

# Write the frame to CSV; index=False leaves the row labels out of the file
df.to_csv(path_or_buf='output.csv', index=False)


In [26]:
# Load the CSV file back into a DataFrame, replacing the current df
csv_path = 'output.csv'
df = pd.read_csv(csv_path)

In [27]:
# Save the frame as an Excel workbook; index=False leaves the row labels out
df.to_excel(excel_writer='output.xlsx', index=False)

In [29]:
# Load the Excel workbook back into a DataFrame, replacing the current df.
# Excel support needs an engine package installed (e.g. `openpyxl`).
excel_path = 'output.xlsx'
df = pd.read_excel(excel_path)

# Time Series Data
- Pandas has powerful functionality for working with time series data.

## Converting a column to datetime:


In [31]:
# Convert the values of a 'Date' column to pandas datetime values.
# Left commented out: the `df` loaded in the cells above only has the
# Name/Age/City columns, so there is no 'Date' column to convert here.
# Uncomment when working with a frame that has one.
#df['Date'] = pd.to_datetime(df['Date'])


## Setting a column as the index (for time series):

In [None]:
# Make 'Date' the row index of the frame.
# Two changes from a bare `df.set_index('Date', inplace=True)`:
#   * the frame loaded in the cells above has only Name/Age/City, so the
#     unconditional call raised KeyError on a fresh Restart & Run All;
#     the column check turns that into an explicit message instead;
#   * the result is reassigned rather than mutated in place, so re-running
#     the cell after 'Date' has become the index does not fail either.
if 'Date' in df.columns:
    df = df.set_index('Date')
else:
    print("No 'Date' column in df; skipping set_index('Date').")


## Resampling (e.g., to monthly data):

In [33]:
# Group the rows into calendar-month bins and average each bin.
# Left commented out: resample() operates on a datetime-like index, which
# the `df` loaded in the cells above (Name/Age/City) does not have.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' (month end) -- confirm against the installed pandas version.
#monthly_data = df.resample('M').mean()  # Resample by month and get the mean
