In [None]:
import numpy as np
import pandas as pd

In [None]:
# pandas provides two primary data structures for handling data:
# Series and DataFrame.
# A Series is a one-dimensional labeled array capable of holding any data type.
# A DataFrame is a two-dimensional labeled data structure whose columns can have different types.

In [2]:
# Build a Series from a plain list; np.nan stands in for a missing value.
# No index is passed, so pandas supplies the default integer labels.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
# Six consecutive timestamps beginning on 2023-01-01, defined by a start
# date plus a number of periods; used later as a DataFrame row index.
dates = pd.date_range(start="20230101", periods=6)

# Display the resulting DatetimeIndex
dates

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# Create a DataFrame of standard-normal random numbers, with the date range
# as the row index and the letters A-D as column labels.
# A seeded Generator is used so that "Restart Kernel -> Run All" reproduces
# the same values every time; the previous unseeded np.random.randn call
# produced different numbers on each run.
rng = np.random.default_rng(seed=42)
df = pd.DataFrame(
    rng.standard_normal((len(dates), 4)),  # one row per date, four columns
    index=dates,
    columns=list("ABCD"),
)

# Display the DataFrame
df

Unnamed: 0,A,B,C,D
2023-01-01,-0.30702,-0.859978,-1.633374,-0.45851
2023-01-02,0.39295,0.026672,0.980754,-0.791025
2023-01-03,1.015471,0.039823,-0.661513,1.763645
2023-01-04,-0.654969,-1.674629,-2.056228,0.934604
2023-01-05,-0.090786,-0.41703,0.858619,1.074061
2023-01-06,1.06308,-0.986553,-0.400404,1.354427


In [None]:
# Build a DataFrame from a dict: each key becomes a column label and each
# value supplies that column's data. The values deliberately mix scalars,
# a Series, a NumPy array and a Categorical.
column_data = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}
df2 = pd.DataFrame(column_data)

# Display the DataFrame
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [None]:
# Show the dtype of each column in df2; every column carries its own dtype
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [20]:
# Display the first 3 rows of df
df.head(3)

Unnamed: 0,A,B,C,D
2023-01-01,-0.30702,-0.859978,-1.633374,-0.45851
2023-01-02,0.39295,0.026672,0.980754,-0.791025
2023-01-03,1.015471,0.039823,-0.661513,1.763645


In [19]:
# Display the last 3 rows of df
df.tail(3)

Unnamed: 0,A,B,C,D
2023-01-04,-0.654969,-1.674629,-2.056228,0.934604
2023-01-05,-0.090786,-0.41703,0.858619,1.074061
2023-01-06,1.06308,-0.986553,-0.400404,1.354427


In [None]:
# Show the row labels of df (the DatetimeIndex built from the date range)
df.index

DatetimeIndex(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04',
               '2023-01-05', '2023-01-06'],
              dtype='datetime64[ns]', freq='D')

In [23]:
# Show the column labels of df
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [26]:
# Convert df to a NumPy array of its values; the index and column labels
# are not part of the resulting array.
df.to_numpy()

array([[-0.3070197 , -0.85997847, -1.63337384, -0.45851002],
       [ 0.39294996,  0.02667187,  0.98075423, -0.79102457],
       [ 1.01547053,  0.03982292, -0.66151265,  1.76364491],
       [-0.65496875, -1.67462927, -2.05622841,  0.93460399],
       [-0.09078552, -0.41702954,  0.85861948,  1.07406068],
       [ 1.0630796 , -0.98655273, -0.4004038 ,  1.35442722]])

In [29]:
# Show df2's column dtypes again (this repeats an earlier cell): the columns
# of df2 do not share a single dtype.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [31]:
# Convert df2 to a NumPy array. Its columns have different dtypes, and the
# recorded result below is an array with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [33]:
# Show summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.236454,-0.645283,-0.485357,0.6462
std,0.708767,0.662587,1.247364,1.029807
min,-0.654969,-1.674629,-2.056228,-0.791025
25%,-0.252961,-0.954909,-1.390409,-0.110232
50%,0.151082,-0.638504,-0.530958,1.004332
75%,0.85984,-0.084253,0.543864,1.284336
max,1.06308,0.039823,0.980754,1.763645


In [35]:
# Swap the axes: the dates become column labels and A-D become row labels
df.transpose()

Unnamed: 0,2023-01-01,2023-01-02,2023-01-03,2023-01-04,2023-01-05,2023-01-06
A,-0.30702,0.39295,1.015471,-0.654969,-0.090786,1.06308
B,-0.859978,0.026672,0.039823,-1.674629,-0.41703,-0.986553
C,-1.633374,0.980754,-0.661513,-2.056228,0.858619,-0.400404
D,-0.45851,-0.791025,1.763645,0.934604,1.074061,1.354427


In [37]:
# Reorder the columns by their labels in descending order (D, C, B, A);
# sorting along the column axis leaves the row order untouched.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2023-01-01,-0.45851,-1.633374,-0.859978,-0.30702
2023-01-02,-0.791025,0.980754,0.026672,0.39295
2023-01-03,1.763645,-0.661513,0.039823,1.015471
2023-01-04,0.934604,-2.056228,-1.674629,-0.654969
2023-01-05,1.074061,0.858619,-0.41703,-0.090786
2023-01-06,1.354427,-0.400404,-0.986553,1.06308


In [38]:
# Sort the rows of df by the values in column B (ascending, as the output shows)
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2023-01-04,-0.654969,-1.674629,-2.056228,0.934604
2023-01-06,1.06308,-0.986553,-0.400404,1.354427
2023-01-01,-0.30702,-0.859978,-1.633374,-0.45851
2023-01-05,-0.090786,-0.41703,0.858619,1.074061
2023-01-02,0.39295,0.026672,0.980754,-0.791025
2023-01-03,1.015471,0.039823,-0.661513,1.763645
