Create a DataFrame with duplicate rows and remove duplicates.

In [1]:
import pandas as pd
# Rows 1 and 2 hold the same (X, Y) pair, so the later copy is dropped;
# the surviving rows keep their original index labels.
data = {
    'X': [1, 2, 2, 3],
    'Y': [4, 5, 5, 6],
}
df = pd.DataFrame(data).drop_duplicates()
print(df)

   X  Y
0  1  4
1  2  5
3  3  6


Create a DataFrame with a hierarchical index (MultiIndex).

In [2]:
import pandas as pd
# One label per row for each index level: outer level 'Group', inner level 'Number'.
group_labels = ['X', 'X', 'Y', 'Y']
number_labels = [1, 2, 1, 2]
arrays = [group_labels, number_labels]
index = pd.MultiIndex.from_arrays(arrays, names=('Group', 'Number'))

# A single 'Value' column, aligned row-by-row with the two-level index.
data = {'Value': [10, 20, 30, 40]}
df = pd.DataFrame(data=data, index=index)
print(df)

              Value
Group Number       
X     1          10
      2          20
Y     1          30
      2          40


Calculate the difference between consecutive rows in a DataFrame.

In [3]:
import pandas as pd
data = {
    'X': [1, 3, 6, 10],
}
df = pd.DataFrame(data)

# Each row minus the row directly above it; the first row has no
# predecessor, so its difference is NaN.
x_column = df['X']
df['Difference'] = x_column.diff(periods=1)
print(df)

    X  Difference
0   1         NaN
1   3         2.0
2   6         3.0
3  10         4.0


Create a DataFrame with hierarchical columns.

In [4]:
import pandas as pd
# One label per column for each header level: outer level 'Group', inner level 'Type'.
group_labels = ['X', 'X', 'Y', 'Y']
type_labels = ['C1', 'C2', 'C1', 'C2']
arrays = [group_labels, type_labels]
columns = pd.MultiIndex.from_arrays(arrays, names=('Group', 'Type'))

# Three rows of four consecutive integers: 1-4, 5-8 and 9-12.
data = [list(range(first, first + 4)) for first in (1, 5, 9)]
df = pd.DataFrame(data=data, columns=columns)
print(df)

Group  X       Y    
Type  C1  C2  C1  C2
0      1   2   3   4
1      5   6   7   8
2      9  10  11  12


Filter rows based on the length of strings in a column.

In [5]:
import pandas as pd
data = {
    'X': ['foo', 'bar', 'baz', 'qux'],
}
df = pd.DataFrame(data)

# Keep only rows whose string is longer than 3 characters. Every sample
# string here is exactly 3 characters long, so the mask is all False and
# the result is an empty DataFrame.
string_lengths = df['X'].str.len()
is_longer_than_three = string_lengths > 3
filtered_df = df[is_longer_than_three]
print(filtered_df)

Empty DataFrame
Columns: [X]
Index: []


Calculate the percentage change between rows in a DataFrame.

In [6]:
import pandas as pd
data = {
    'X': [1, 2, 3, 4],
}
df = pd.DataFrame(data)

# Fractional change relative to the previous row, e.g. 1 -> 2 is 1.0 (+100%).
# The first row has nothing to compare against, so it is NaN.
x_column = df['X']
df['Pct_Change'] = x_column.pct_change(periods=1)
print(df)

   X  Pct_Change
0  1         NaN
1  2    1.000000
2  3    0.500000
3  4    0.333333


Create a DataFrame from a dictionary of Series.

In [7]:
import pandas as pd
# Each Series becomes one column; the dict key is used as the column label.
x_series = pd.Series([1, 2, 3])
y_series = pd.Series([4, 5, 6])
data = {'X': x_series, 'Y': y_series}
df = pd.DataFrame(data=data)
print(df)

   X  Y
0  1  4
1  2  5
2  3  6


Filter rows based on whether a column value is in a list.

In [8]:
import pandas as pd
data = {
    'X': [1, 2, 3, 4],
    'Y': [5, 6, 7, 8],
}
df = pd.DataFrame(data)

# Boolean mask that is True where X is one of the wanted values;
# indexing with it keeps just those rows (with their original labels).
wanted_values = [2, 3]
is_wanted = df['X'].isin(wanted_values)
filtered_df = df[is_wanted]
print(filtered_df)

   X  Y
1  2  6
2  3  7


Calculate the z-score of the values in a DataFrame column.

In [9]:
import pandas as pd
import numpy as np
data = {
    'X': [1, 2, 3, 4],
    'Y': [4, 5, 6, 7],
}
df = pd.DataFrame(data)

# z-score of column X: distance from the mean in units of the population
# standard deviation (ddof=0).
# NOTE(review): the result is stored as 'zscore_A' although the source
# column is 'X' - confirm whether the label is intentional.
x_column = df['X']
x_mean = x_column.mean()
x_std = x_column.std(ddof=0)
df['zscore_A'] = (x_column - x_mean) / x_std
print(df)

   X  Y  zscore_A
0  1  4 -1.341641
1  2  5 -0.447214
2  3  6  0.447214
3  4  7  1.341641


Create a DataFrame with random integers and calculate descriptive statistics.

In [10]:
import pandas as pd
import numpy as np
# 5 rows x 3 columns of random integers from 1 up to (but not including) 100.
# The generator is not seeded, so the values - and the statistics printed
# below - differ from run to run.
data = np.random.randint(low=1, high=100, size=(5, 3))
df = pd.DataFrame(data=data, columns=['X', 'Y', 'Z'])

# count, mean, std, min, quartiles and max for each column.
summary = df.describe()
print(summary)

               X          Y          Z
count   5.000000   5.000000   5.000000
mean   39.000000  39.600000  30.400000
std    27.613403  30.311714  24.172298
min     4.000000   8.000000  10.000000
25%    26.000000  11.000000  10.000000
50%    35.000000  41.000000  29.000000
75%    53.000000  61.000000  34.000000
max    77.000000  77.000000  69.000000
