In [None]:
0.75

In [None]:
# using pandas
import pandas as pd

In [213]:
# create a pandas Series
series = pd.Series([10, 20, 30, 40, 50])

print("series: \n", series)

print("=================================")

print("Type of series ", type(series))

series: 
 0    10
1    20
2    30
3    40
4    50
dtype: int64
Type of series  <class 'pandas.Series'>


In [214]:
# create another pandas Series with custom index
series_1 = pd.Series([5, 10, 15, 20, 25], index=['a', 'b', 'c', 'd', 'e'])

print("series 1: \n", series_1)

series 1: 
 a     5
b    10
c    15
d    20
e    25
dtype: int64


In [215]:
# create a DataFrame from list
data = [[1, 2, 3, 'a'],
        [4, 5, 6, 'b'],
        [7, 8, 9, 'c']]

In [216]:
# create DataFrame with specified column names
df = pd.DataFrame(data, columns=["Column1", "Column2", "Column3", "Column4"])

# display the DataFrame
display(df) # display is used in Jupyter Notebooks, not like print -> print(df), but works too

Unnamed: 0,Column1,Column2,Column3,Column4
0,1,2,3,a
1,4,5,6,b
2,7,8,9,c


In [217]:
# create another DataFrame from a dictionary
data_2 = {"Name": ["Alice", "Bob", "Charlie", "David"],
          "Age": [24, 27, 22, 32],
          "City": ["New York", "Los Angeles", "Chicago", "Houston"]}

df_2 = pd.DataFrame(data_2, columns=["Name", "Age", "City"])

# display the DataFrame
display(df_2)

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,27,Los Angeles
2,Charlie,22,Chicago
3,David,32,Houston


In [218]:
# get the columns of the DataFrame
df_2.columns

Index(['Name', 'Age', 'City'], dtype='str')

In [219]:
# get the index of the DataFrame
df_2.index

RangeIndex(start=0, stop=4, step=1)

In [220]:
# get the values of the DataFrame as a 2D array
df_2.values

array([['Alice', 24, 'New York'],
       ['Bob', 27, 'Los Angeles'],
       ['Charlie', 22, 'Chicago'],
       ['David', 32, 'Houston']], dtype=object)

In [221]:
# get the data types of each column
df_2.dtypes

Name      str
Age     int64
City      str
dtype: object

In [222]:
# priority of data types: object < string < float < int < bool
mixed_data = {'A': [1, 2, 3],
              'B': [4.0, 5.5, 6.1],
              'C': ['x', 'y', 'z'],
              'D': [True, False, True]}

df_mixed = pd.DataFrame(mixed_data)

print("Data type of mixed vector: \n", df_mixed.dtypes) # Output: object why? Because of column C

Data type of mixed vector: 
 A      int64
B    float64
C        str
D       bool
dtype: object


In [223]:
# get a specific column's data type
df['Column1'].dtype

dtype('int64')

In [224]:
# convert a column to a different data type
df["Column1"] = df["Column1"].astype("category")

df["Column1"].dtype

CategoricalDtype(categories=[1, 4, 7], ordered=False, categories_dtype=int64)

In [225]:
# get the first 5 rows of the DataFrame
df.head(1) # get first 1 row

Unnamed: 0,Column1,Column2,Column3,Column4
0,1,2,3,a


In [226]:
# get the last 5 rows of the DataFrame
df.tail(2) # last 2 rows

Unnamed: 0,Column1,Column2,Column3,Column4
1,4,5,6,b
2,7,8,9,c


In [227]:
# get sample rows from the DataFrame
df.sample(3) # 3 random rows

Unnamed: 0,Column1,Column2,Column3,Column4
1,4,5,6,b
2,7,8,9,c
0,1,2,3,a


In [228]:
# get summary statistics of the DataFrame
df.describe() # for numerical columns by default

Unnamed: 0,Column2,Column3
count,3.0,3.0
mean,5.0,6.0
std,3.0,3.0
min,2.0,3.0
25%,3.5,4.5
50%,5.0,6.0
75%,6.5,7.5
max,8.0,9.0


In [229]:
# get summary statistics of the categorical columns
df.describe(include="category") # include='category' to get stats for categorical columns only, # include='all' to get stats for all columns

Unnamed: 0,Column1
count,3
unique,3
top,1
freq,1


In [230]:
# perform operations on a specific column
df["Column2"].unique().sum() # sum of unique values in Column2, used in numerical analysis, operations-> (min, max, mean, median, mode, sum, count, std, var, etc.)

np.int64(15)

In [231]:
# get the data type of a specific column
df["Column1"].dtype

# get the index of the maximum value in Column1
df["Column1"].idxmax()  # get the index of the maximum value in Column1, works for numerical columns only

2

In [232]:
# get info about the DataFrame
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   Column1  3 non-null      category
 1   Column2  3 non-null      int64   
 2   Column3  3 non-null      int64   
 3   Column4  3 non-null      str     
dtypes: category(1), int64(2), str(1)
memory usage: 231.0 bytes


In [233]:
# get the count of unique values in Column1
df["Column1"].value_counts()

Column1
1    1
4    1
7    1
Name: count, dtype: int64

In [234]:
# define a function to duplicate the values
def duplicate_function(x):
    return x * 2

df["Column1"].apply(duplicate_function) # apply function to each element in Column1, or use df["Column1"].apply(lambda x: x * 2)

0     2
1     8
2    14
Name: Column1, dtype: category
Categories (3, int64): [2, 8, 14]

In [235]:
# get the first element in Column1
df["Column1"].iloc[0] # get the first element in Column1 using (integer-location) based indexing

df["Column1"].loc[2]  # get the element at index 2 in Column1 using (label) based indexing

df["Column1"].iat[0]  # get the first element in Column1 using integer based indexing, faster than iloc for single element access

df["Column1"].loc[0]  # get the first element in Column1 using label based indexing

np.int64(1)

In [236]:
# use slicing to get first 3 elements in Column2
df["Column2"].iloc[0:3]  # slicing using iloc

df.iloc[1:4, 0:2] # get rows 1 to 3 and columns 0 to 1 using iloc

df.iloc[ : , : -1] # get all rows and all columns except the last one (:) for rows, (: -1) for columns except last

Unnamed: 0,Column1,Column2,Column3
0,1,2,3
1,4,5,6
2,7,8,9


In [237]:
# create a new column by performing operations on an existing column
new_col = df_2["Age"] + 5  # create a new column by adding 5 to the Age column

print("New column with Age + 5: \n", new_col)

New column with Age + 5: 
 0    29
1    32
2    27
3    37
Name: Age, dtype: int64


In [238]:
# add the new column to the DataFrame
df_2 = df_2.assign(Age_in_5_Years = df_2["Age"] + 5) # or use df_2["Age_in_5_Years"] = df_2["Age"] + 5

# display the updated DataFrame
display(df_2)

Unnamed: 0,Name,Age,City,Age_in_5_Years
0,Alice,24,New York,29
1,Bob,27,Los Angeles,32
2,Charlie,22,Chicago,27
3,David,32,Houston,37


In [239]:
# drop a column from the DataFrame
df_2 = df_2.drop(columns=["Age_in_5_Years"])  # drop the Age_in_5_Years column or use df_2.drop("Age_in_5_Years", axis=1, inplace=True)

# display the DataFrame after dropping the column
display(df_2)

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,27,Los Angeles
2,Charlie,22,Chicago
3,David,32,Houston


In [240]:
# drop a row from the DataFrame
df_2.drop(df_2.index[2], axis=0, inplace=True) # drop the row with index 2

# display the DataFrame after dropping the row
display(df_2)

Unnamed: 0,Name,Age,City
0,Alice,24,New York
1,Bob,27,Los Angeles
3,David,32,Houston


In [241]:
# get null count for each column
null = df_2.isnull().sum()

# format the null count as a DataFrame
null_df = pd.DataFrame(null, columns=["Null_Count"])

# display the DataFrame
display(null_df)

Unnamed: 0,Null_Count
Name,0
Age,0
City,0
