In [1]:
## Data Frame Basics 

In [None]:
import pandas as pd
import numpy as np

In [9]:
# 1. Creating DataFrames
# From a dictionary
data_dict = {
    'Name': ['John', 'Anna', 'Peter', 'Linda'],
    'Age': [28, 34, 29, 42],
    'City': ['New York', 'Paris', 'Berlin', 'London'],
    'Salary': [65000, 70000, 62000, 85000]
}
df1 = pd.DataFrame(data_dict)

In [11]:
# From a list of lists
data_list = [
    ['John', 28, 'New York', 65000],
    ['Anna', 34, 'Paris', 70000],
    ['Peter', 29, 'Berlin', 62000],
    ['Linda', 42, 'London', 85000]
]
df2 = pd.DataFrame(data_list, columns=['Name', 'Age', 'City', 'Salary'])

In [15]:
 #Basic Information
df1.shape  # Returns tuple of (rows, columns)

(4, 4)

In [17]:
df1.info()  # Summary of DataFrame including dtypes and non-null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   City    4 non-null      object
 3   Salary  4 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 260.0+ bytes


In [19]:
df1.describe()  # Statistical summary of numerical columns

Unnamed: 0,Age,Salary
count,4.0,4.0
mean,33.25,70500.0
std,6.396614,10214.368964
min,28.0,62000.0
25%,28.75,64250.0
50%,31.5,67500.0
75%,36.0,73750.0
max,42.0,85000.0


In [21]:
df1.head(2)  # First 2 rows

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000
1,Anna,34,Paris,70000


In [23]:
df1.tail(2)  # Last 2 rows

Unnamed: 0,Name,Age,City,Salary
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [25]:
df1.columns.tolist()  # List of column names

['Name', 'Age', 'City', 'Salary']

In [27]:
df1.dtypes  # Data types of each column

Name      object
Age        int64
City      object
Salary     int64
dtype: object

In [29]:
##  Selection and Indexing

In [31]:
# Select a single column
df1['Name']  # Returns Series

0     John
1     Anna
2    Peter
3    Linda
Name: Name, dtype: object

In [33]:
# Select multiple columns
df1[['Name', 'Age']]  # Returns DataFrame

Unnamed: 0,Name,Age
0,John,28
1,Anna,34
2,Peter,29
3,Linda,42


In [35]:
# Select by position (iloc)
df1.iloc[0]  # First row, all columns
df1.iloc[0:2, [0, 2]]  # First 2 rows, columns 0 and 2

Unnamed: 0,Name,City
0,John,New York
1,Anna,Paris


In [37]:
# Select by label (loc)
df1.loc[df1['Name'] == 'John']  # Select rows where Name is 'John'

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65000


In [39]:
# Boolean indexing
df1[df1['Age'] > 30]  # Select rows where Age > 30

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70000
3,Linda,42,London,85000


In [41]:
## Data Manipulation

In [45]:
# Add a new column
df1['Experience'] = [3, 8, 4, 12]
df1

Unnamed: 0,Name,Age,City,Salary,Experience
0,John,28,New York,65000,3
1,Anna,34,Paris,70000,8
2,Peter,29,Berlin,62000,4
3,Linda,42,London,85000,12


In [47]:
# Modify values
df1.loc[0, 'Salary'] = 68000  # Change John's salary
df1

Unnamed: 0,Name,Age,City,Salary,Experience
0,John,28,New York,68000,3
1,Anna,34,Paris,70000,8
2,Peter,29,Berlin,62000,4
3,Linda,42,London,85000,12


In [49]:
# Delete column
df_temp = df1.copy()
df_temp = df_temp.drop('Experience', axis=1)  # axis=1 for columns, axis=0 for rows
df_temp

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,68000
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,85000


In [51]:
# Rename columns
df_temp = df1.rename(columns={'Name': 'Employee', 'City': 'Location'})
df_temp

Unnamed: 0,Employee,Age,Location,Salary,Experience
0,John,28,New York,68000,3
1,Anna,34,Paris,70000,8
2,Peter,29,Berlin,62000,4
3,Linda,42,London,85000,12


In [53]:
## Sorting

In [55]:
# Sort by Age (ascending)
df1.sort_values('Age')

Unnamed: 0,Name,Age,City,Salary,Experience
0,John,28,New York,68000,3
2,Peter,29,Berlin,62000,4
1,Anna,34,Paris,70000,8
3,Linda,42,London,85000,12


In [57]:
# Sort by Salary (descending)
df1.sort_values('Salary', ascending=False)

Unnamed: 0,Name,Age,City,Salary,Experience
3,Linda,42,London,85000,12
1,Anna,34,Paris,70000,8
0,John,28,New York,68000,3
2,Peter,29,Berlin,62000,4


In [59]:
# Sort by multiple columns
df1.sort_values(['City', 'Age'], ascending=[True, False])  # City ascending, Age descending

Unnamed: 0,Name,Age,City,Salary,Experience
2,Peter,29,Berlin,62000,4
3,Linda,42,London,85000,12
0,John,28,New York,68000,3
1,Anna,34,Paris,70000,8


In [61]:
## Grouping and Aggregation

In [65]:
# Create a larger dataset for grouping
cities = ['New York', 'Paris', 'Berlin', 'London', 'Tokyo']
departments = ['HR', 'IT', 'Finance', 'Marketing', 'Sales']
data = {
    'Name': [f'Person_{i}' for i in range(1, 11)],
    'Age': np.random.randint(25, 55, 10),
    'City': np.random.choice(cities, 10),
    'Department': np.random.choice(departments, 10),
    'Salary': np.random.randint(50000, 100000, 10)
}
df_group = pd.DataFrame(data)
df_group

Unnamed: 0,Name,Age,City,Department,Salary
0,Person_1,34,London,HR,66299
1,Person_2,30,New York,IT,89762
2,Person_3,47,Berlin,HR,80396
3,Person_4,42,London,IT,88573
4,Person_5,32,New York,Finance,55055
5,Person_6,43,New York,HR,77365
6,Person_7,48,London,IT,51970
7,Person_8,26,Paris,Finance,93727
8,Person_9,48,Tokyo,Finance,62242
9,Person_10,52,Paris,Finance,91925


In [67]:
# Group by Department and calculate mean salary
df_group.groupby('Department')['Salary'].mean()

# Group by Department and City, calculate multiple aggregations
df_group.groupby(['Department', 'City']).agg({
    'Salary': ['mean', 'min', 'max'],
    'Age': ['mean', 'count']
})

Unnamed: 0_level_0,Unnamed: 1_level_0,Salary,Salary,Salary,Age,Age
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,min,max,mean,count
Department,City,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Finance,New York,55055.0,55055,55055,32.0,1
Finance,Paris,92826.0,91925,93727,39.0,2
Finance,Tokyo,62242.0,62242,62242,48.0,1
HR,Berlin,80396.0,80396,80396,47.0,1
HR,London,66299.0,66299,66299,34.0,1
HR,New York,77365.0,77365,77365,43.0,1
IT,London,70271.5,51970,88573,45.0,2
IT,New York,89762.0,89762,89762,30.0,1
