In [1]:
import pandas as pd

#Creating a Dataframe from dictionary
data = {
    'Name': ['Anna', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}

df = pd.DataFrame(data)
print(df)


      Name  Age         City
0     Anna   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [5]:
#Inspecting the data
# Display the first 5 rows of the DataFrame
print(df.head())

# Display information about the DataFrame (data types, non-null values, etc.)
print(df.info())

# Get summary statistics for numerical columns
print(df.describe())

      Name  Age         City
0     Anna   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes
None
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0


In [7]:
# Selecting columns
# Select one column
ages = df['Age']
print(ages)

# Select multiple columns
subset = df[['Name', 'City']]
print(subset)

0    25
1    30
2    35
Name: Age, dtype: int64
      Name         City
0     Anna     New York
1      Bob  Los Angeles
2  Charlie      Chicago


In [9]:
#Filtering the rows
# Filter rows where Age is greater than 30
filtered_df = df[df['Age'] > 20]
print(filtered_df)


      Name  Age         City
0     Anna   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [11]:
#Adding a new column
# Add a new column 'Country' with the value 'USA' for all rows
df['Country'] = 'USA'
print(df)

      Name  Age         City Country
0     Anna   25     New York     USA
1      Bob   30  Los Angeles     USA
2  Charlie   35      Chicago     USA


In [13]:
#Modifying data
# Increase everyone's age by 1 year
df['Age'] = df['Age'] + 1
print(df)

      Name  Age         City Country
0     Anna   26     New York     USA
1      Bob   31  Los Angeles     USA
2  Charlie   36      Chicago     USA


In [15]:
# Handling Missing Data
# Check for missing values
print(df.isnull().sum())

# Fill missing values with a default value
df['Age'].fillna(0, inplace=True)

Name       0
Age        0
City       0
Country    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(0, inplace=True)


In [17]:
#Grouping and Aggregating data
# Group data by 'City' and calculate the average 'Age'
grouped_df = df.groupby('City')['Age'].mean()
print(grouped_df)

City
Chicago        36.0
Los Angeles    31.0
New York       26.0
Name: Age, dtype: float64


In [19]:
#Sorting the data
# Sort by 'Age' in descending order
sorted_df = df.sort_values(by='Age', ascending=False)
print(sorted_df)

      Name  Age         City Country
2  Charlie   36      Chicago     USA
1      Bob   31  Los Angeles     USA
0     Anna   26     New York     USA


In [None]:
# Save DataFrame to a CSV file
df.to_csv('output.csv', index=False)