In [1]:
import pandas as pd

In [2]:
# Sample records in column-oriented form: each key is a column label and
# each list holds that column's values in row order.
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[65089, 70808, 62080, 83080],
)

### Creating a DataFrame

In [3]:
# Build a DataFrame from the column dictionary; the dict keys become the
# column labels.
df = pd.DataFrame(data=data)
df  # last expression in the cell, so the frame is rendered as output

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [11]:
# Build a DataFrame from a list of rows (one inner list per record).
# No column labels are supplied here, so the columns are numbered 0..3.
data_list = [
    ['John', 28, 'New York', 65980],
    ['Anna', 34, 'Paris', 70000],
    ['Peter', 29, 'Berlin', 62000],
    ['Linda', 42, 'London', 65000],
]
df2 = pd.DataFrame(data=data_list)
df2

Unnamed: 0,0,1,2,3
0,John,28,New York,65980
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,65000


In [None]:
# Rebuild the frame from the same row list, this time with explicit
# column labels instead of the default 0..3 numbering.
col = ["Name", "Age", "City", "Salary"]
df2 = pd.DataFrame(data=data_list, columns=col)
df2

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65980
1,Anna,34,Paris,70000
2,Peter,29,Berlin,62000
3,Linda,42,London,65000


### Selection and Indexing of Columns

In [None]:
# df[0] raises an error here: [] on a DataFrame looks up a column label,
# and df has no column labelled 0.

# Select the City column (the result is a Series).
df["City"]

0    New York
1       Paris
2      Berlin
3      London
Name: City, dtype: object

In [4]:
# Attribute-style column access; the output matches df["City"].
df.City

0    New York
1       Paris
2      Berlin
3      London
Name: City, dtype: object

In [21]:
# Select the Name and Age columns together: indexing with a list of labels
# returns a DataFrame holding just those columns.

df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,John,28
1,Anna,34
2,Peter,29
3,Linda,42


### Creating a New Column

In [7]:
# Add a new column by assigning a list with one value per row.
df["Id"] = [101, 202, 303, 404]

In [39]:
# Display df to confirm the Id column was added.
df

Unnamed: 0,Name,Age,City,Salary,Id
0,John,28,New York,65089,101
1,Anna,34,Paris,70808,202
2,Peter,29,Berlin,62080,303
3,Linda,42,London,83080,404


### Removing Columns

In [40]:
# axis=0 refers to rows (index labels); axis=1 refers to columns.

df.drop("Id", axis=1) # Returns a new DataFrame without "Id"; df itself is not modified.

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [None]:
df.drop(3, axis=0) # Drops the row labelled 3 (axis=0 = rows). This also returns a copy; df itself is not modified.

Unnamed: 0,Name,Age,City,Salary,Id
0,John,28,New York,65089,101
1,Anna,34,Paris,70808,202
2,Peter,29,Berlin,62080,303


In [41]:
# Permanently remove the Id column by rebinding df to the result of drop()
# rather than mutating the frame with inplace=True.
# errors="ignore" keeps the cell re-runnable: once Id is gone, running the
# cell again is a no-op instead of raising a KeyError.
df = df.drop(columns="Id", errors="ignore")
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [43]:
# Show df again: the Id column stays removed in later cells.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [48]:
# Rebuild the four-person sample table from scratch under a separate
# name (data_frame), independent of the df used in earlier cells.
data = dict(
    Name=['John', 'Anna', 'Peter', 'Linda'],
    Age=[28, 34, 29, 42],
    City=['New York', 'Paris', 'Berlin', 'London'],
    Salary=[65089, 70808, 62080, 83080],
)

data_frame = pd.DataFrame(data=data)
data_frame

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [None]:
# Drop several columns in one call by passing a list of labels.
# The result is a new frame; data_frame itself is not modified.
data_frame.drop(columns=["Name", "Age"])

Unnamed: 0,City,Salary
0,New York,65089
1,Paris,70808
2,Berlin,62080
3,London,83080


### Selecting Rows

In [50]:
# Current contents of df, shown for reference before selecting rows.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [None]:
# df[0] raises an error: plain [] looks up column labels, not rows.
df.loc[1] # loc selects by index label; this returns the row labelled 1 as a Series

Name       Anna
Age          34
City      Paris
Salary    70808
Name: 1, dtype: object

In [55]:
# Select multiple rows by passing a list of index labels to loc.
df.loc[ [0, 1] ]

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808


In [56]:
# Select a row by integer position with iloc; position 0 is the first row.
df.iloc[0]

Name          John
Age             28
City      New York
Salary       65089
Name: 0, dtype: object

### Selecting a Subset of Rows and Columns

In [57]:
# Current contents of df, shown for reference before taking a subset.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [None]:
# Select rows 0 and 1 and the City/Salary columns in a single .loc call
# (row labels first, column labels second) instead of chaining two []
# lookups, which builds a throwaway intermediate frame.
df.loc[[0, 1], ["City", "Salary"]]

Unnamed: 0,City,Salary
0,New York,65089
1,Paris,70808


In [8]:
# An empty label list selects no rows: the result keeps df's columns but has zero rows.
df.loc[[]]

Unnamed: 0,Name,Age,City,Salary,Id


### Conditional Selection

In [59]:
# Current contents of df, shown for reference before filtering.
df

Unnamed: 0,Name,Age,City,Salary
0,John,28,New York,65089
1,Anna,34,Paris,70808
2,Peter,29,Berlin,62080
3,Linda,42,London,83080


In [60]:
# Keep only the rows where Age is greater than 30.
# The comparison yields a boolean mask; .loc returns the rows where it is True.

df.loc[df["Age"] > 30]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70808
3,Linda,42,London,83080


In [62]:
# Rows where Age is greater than 30 AND City is Paris.
# Each comparison is parenthesised because & binds more tightly than > and ==.

df.loc[
    (df["Age"] > 30)
    & (df["City"] == "Paris")
]

Unnamed: 0,Name,Age,City,Salary
1,Anna,34,Paris,70808


# Practice

In [64]:
# Practice dataset: 20 employees, 10 columns, stored column by column.
# Every list has one entry per employee, in the same row order.
data = {
    "Name": [
        "John", "Anna", "Peter", "Linda", "Michael",
        "Sarah", "David", "Emily", "James", "Olivia",
        "Robert", "Sophia", "William", "Isabella", "Thomas",
        "Charlotte", "Christopher", "Amelia", "Daniel", "Mia",
    ],
    "Age": [
        28, 34, 29, 42, 31,
        26, 45, 29, 38, 27,
        33, 24, 40, 35, 29,
        31, 36, 28, 32, 30,
    ],
    "City": [
        "New York", "Paris", "Berlin", "London", "Tokyo",
        "Sydney", "Toronto", "Madrid", "Rome", "Moscow",
        "Seoul", "Beijing", "Dubai", "Mumbai", "Cairo",
        "Rio", "Chicago", "Los Angeles", "Houston", "Miami",
    ],
    "Salary": [
        65089, 70808, 62080, 83080, 75000,
        68000, 90000, 72000, 71000, 69000,
        85000, 67000, 92000, 73000, 64000,
        88000, 77000, 82000, 79000, 86000,
    ],
    "Department": [
        "IT", "HR", "IT", "Finance", "Marketing",
        "IT", "Finance", "HR", "IT", "Marketing",
        "Finance", "IT", "HR", "Marketing", "IT",
        "Finance", "HR", "IT", "Marketing", "Finance",
    ],
    "Experience": [
        5, 8, 4, 15, 6,
        3, 18, 5, 10, 4,
        12, 2, 14, 9, 4,
        11, 13, 6, 8, 7,
    ],
    "Education": [
        "Bachelor", "Master", "Bachelor", "PhD", "Bachelor",
        "Bachelor", "Master", "Bachelor", "Master", "Bachelor",
        "PhD", "Bachelor", "Master", "Bachelor", "Bachelor",
        "Master", "PhD", "Bachelor", "Master", "Bachelor",
    ],
    "Gender": [
        "Male", "Female", "Male", "Female", "Male",
        "Female", "Male", "Female", "Male", "Female",
        "Male", "Female", "Male", "Female", "Male",
        "Female", "Male", "Female", "Male", "Female",
    ],
    "Marital Status": [
        "Single", "Married", "Single", "Married", "Single",
        "Married", "Married", "Single", "Married", "Single",
        "Married", "Single", "Married", "Single", "Married",
        "Single", "Married", "Single", "Married", "Single",
    ],
    "Job Title": [
        "Developer", "Manager", "Analyst", "Director", "Specialist",
        "Developer", "Director", "Manager", "Developer", "Specialist",
        "Director", "Analyst", "Manager", "Specialist", "Analyst",
        "Director", "Manager", "Developer", "Specialist", "Director",
    ],
}
dataFrame = pd.DataFrame(data=data)
dataFrame

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Education,Gender,Marital Status,Job Title
0,John,28,New York,65089,IT,5,Bachelor,Male,Single,Developer
1,Anna,34,Paris,70808,HR,8,Master,Female,Married,Manager
2,Peter,29,Berlin,62080,IT,4,Bachelor,Male,Single,Analyst
3,Linda,42,London,83080,Finance,15,PhD,Female,Married,Director
4,Michael,31,Tokyo,75000,Marketing,6,Bachelor,Male,Single,Specialist
5,Sarah,26,Sydney,68000,IT,3,Bachelor,Female,Married,Developer
6,David,45,Toronto,90000,Finance,18,Master,Male,Married,Director
7,Emily,29,Madrid,72000,HR,5,Bachelor,Female,Single,Manager
8,James,38,Rome,71000,IT,10,Master,Male,Married,Developer
9,Olivia,27,Moscow,69000,Marketing,4,Bachelor,Female,Single,Specialist


In [69]:
# Select rows 10-15 and three columns in a single .loc call
# (row labels first, column labels second) instead of chaining two []
# lookups, which builds a throwaway intermediate frame.
dataFrame.loc[[10, 11, 12, 13, 14, 15], ["Name", "Age", "Department"]]

Unnamed: 0,Name,Age,Department
10,Robert,33,Finance
11,Sophia,24,IT
12,William,40,HR
13,Isabella,35,Marketing
14,Thomas,29,IT
15,Charlotte,31,Finance


In [None]:
# People older than 20 who work in the IT department.
# Both conditions must hold, so the two boolean masks are combined with &.

dataFrame.loc[
    (dataFrame["Age"] > 20)
    & (dataFrame["Department"] == "IT")
]

Unnamed: 0,Name,Age,City,Salary,Department,Experience,Education,Gender,Marital Status,Job Title
0,John,28,New York,65089,IT,5,Bachelor,Male,Single,Developer
2,Peter,29,Berlin,62080,IT,4,Bachelor,Male,Single,Analyst
5,Sarah,26,Sydney,68000,IT,3,Bachelor,Female,Married,Developer
8,James,38,Rome,71000,IT,10,Master,Male,Married,Developer
11,Sophia,24,Beijing,67000,IT,2,Bachelor,Female,Single,Analyst
14,Thomas,29,Cairo,64000,IT,4,Bachelor,Male,Married,Analyst
17,Amelia,28,Los Angeles,82000,IT,6,Bachelor,Female,Single,Developer
