#### Goal - 1) Column & Data Structure Management
- Renaming columns 
- Dropping irrelevant/meaningless columns 
- Detecting constant/single-unique columns

In [1]:
# Sample Dataset

import pandas as pd
import numpy as np

# Assemble the demo data column by column, then build the DataFrame from it.
# The columns deliberately mix dtypes and cardinalities so the later cells
# have constant, binary, multi-valued, datetime and categorical examples.
sample_columns = {
    'constant_numeric': [100] * 5,                      # one repeated number
    'varied_numeric': [10, 20, 30, 40, 50],             # all values distinct
    'binary_col': [True, False, True, False, True],     # two distinct values
    'trinary_cat': ['High', 'Medium', 'Low', 'Medium', 'High'],  # three distinct values
    'constant_str': ['same'] * 5,                       # one repeated string
    'some_dates': pd.to_datetime(
        ['2023-01-01', '2023-01-02', '2023-01-01', '2023-01-03', '2023-01-01']
    ),
    'categorical_col': pd.Series(['A', 'B', 'A', 'C', 'B'], dtype='category'),
    'object_col': ['alpha', 'beta', 'gamma', 'alpha', 'beta'],
}
df = pd.DataFrame(sample_columns)

df

Unnamed: 0,constant_numeric,varied_numeric,binary_col,trinary_cat,constant_str,some_dates,categorical_col,object_col
0,100,10,True,High,same,2023-01-01,A,alpha
1,100,20,False,Medium,same,2023-01-02,B,beta
2,100,30,True,Low,same,2023-01-01,A,gamma
3,100,40,False,Medium,same,2023-01-03,C,alpha
4,100,50,True,High,same,2023-01-01,B,beta


#### 1-1) Renaming columns = rename(), df.columns = [...], str.replace(), str.lower(), str.title(), str.strip() 

In [2]:
# Rename selected columns through an {old name: new name} mapping.
# inplace=True changes df itself instead of returning a renamed copy.
new_names = {'some_dates': 'Date', 'trinary_cat': 'Trinary Category'}
df.rename(columns=new_names, inplace=True)
df

Unnamed: 0,constant_numeric,varied_numeric,binary_col,Trinary Category,constant_str,Date,categorical_col,object_col
0,100,10,True,High,same,2023-01-01,A,alpha
1,100,20,False,Medium,same,2023-01-02,B,beta
2,100,30,True,Low,same,2023-01-01,A,gamma
3,100,40,False,Medium,same,2023-01-03,C,alpha
4,100,50,True,High,same,2023-01-01,B,beta


In [3]:
# To rename all columns manually at once, assign a list with one name per column:
# df.columns = ['A', 'B', 'C']

In [4]:
# Replace parts of the column names, applied in order:
#   'col' -> 'Column', 'str' -> 'String', '_' -> ' '
# regex=False makes every pattern a plain literal substring; the default of
# `regex` is not the same across pandas versions, so it is stated explicitly.
df.columns = (
    df.columns
    .str.replace('col', 'Column', regex=False)
    .str.replace('str', 'String', regex=False)
    .str.replace('_', ' ', regex=False)
)

df

Unnamed: 0,constant numeric,varied numeric,binary Column,Trinary Category,constant String,Date,categorical Column,object Column
0,100,10,True,High,same,2023-01-01,A,alpha
1,100,20,False,Medium,same,2023-01-02,B,beta
2,100,30,True,Low,same,2023-01-01,A,gamma
3,100,40,False,Medium,same,2023-01-03,C,alpha
4,100,50,True,High,same,2023-01-01,B,beta


In [5]:
# Title-case every column name, then trim surrounding whitespace
# (strip() covers both lstrip() for leading and rstrip() for trailing spaces).
title_cased = df.columns.str.title()
df.columns = title_cased.str.strip()
df

Unnamed: 0,Constant Numeric,Varied Numeric,Binary Column,Trinary Category,Constant String,Date,Categorical Column,Object Column
0,100,10,True,High,same,2023-01-01,A,alpha
1,100,20,False,Medium,same,2023-01-02,B,beta
2,100,30,True,Low,same,2023-01-01,A,gamma
3,100,40,False,Medium,same,2023-01-03,C,alpha
4,100,50,True,High,same,2023-01-01,B,beta


#### 1-2) Dropping irrelevant/meaningless columns = drop(), dropna() (with axis=1), Manual column Selection

In [6]:
# Drop specific columns by name.
# `columns=` already targets the column axis, so no extra `axis=1` is needed.
# The result is assigned back to df rather than mutating it in place.
df = df.drop(columns=['Constant String'])
df

Unnamed: 0,Constant Numeric,Varied Numeric,Binary Column,Trinary Category,Date,Categorical Column,Object Column
0,100,10,True,High,2023-01-01,A,alpha
1,100,20,False,Medium,2023-01-02,B,beta
2,100,30,True,Low,2023-01-01,A,gamma
3,100,40,False,Medium,2023-01-03,C,alpha
4,100,50,True,High,2023-01-01,B,beta


In [7]:
# Tour of the dropna() options. The sample frame has no NaN values, so none
# of these calls removes anything here. Every result is assigned back to df
# so that each call actually takes effect.

# drop columns in which every value is NaN
df = df.dropna(axis=1, how='all')

# drop columns that contain at least one NaN
df = df.dropna(axis=1, how='any')  # axis=1 for columns

# drop rows in which every value is NaN
df = df.dropna(axis=0, how='all')  # axis=0 for rows

# drop rows that contain at least one NaN
df = df.dropna(axis=0, how='any')

# keep only the columns that have at least 3 non-NaN values
df = df.dropna(axis=1, thresh=3)  # thresh value can be changed

# look only at the listed columns when deciding which rows to drop
df = df.dropna(subset=['Varied Numeric', 'Categorical Column'], how='all')

# Manual column selection
#df = df[['Binary Column','Trinary Category']]
df

Unnamed: 0,Constant Numeric,Varied Numeric,Binary Column,Trinary Category,Date,Categorical Column,Object Column
0,100,10,True,High,2023-01-01,A,alpha
1,100,20,False,Medium,2023-01-02,B,beta
2,100,30,True,Low,2023-01-01,A,gamma
3,100,40,False,Medium,2023-01-03,C,alpha
4,100,50,True,High,2023-01-01,B,beta


#### 1-3) Detecting constant/single-unique columns

In [8]:
# Count the distinct values held by each column:
#   'Constant Numeric' repeats a single value      -> 1
#   'Binary Column' holds two different values     -> 2
#   'Trinary Category' holds three different values -> 3
df.nunique(axis=0)

Constant Numeric      1
Varied Numeric        5
Binary Column         2
Trinary Category      3
Date                  3
Categorical Column    3
Object Column         3
dtype: int64

In [9]:
# Boolean mask: True for columns holding exactly two distinct values
df.nunique().eq(2)

Constant Numeric      False
Varied Numeric        False
Binary Column          True
Trinary Category      False
Date                  False
Categorical Column    False
Object Column         False
dtype: bool

In [10]:
# Variance (var) and standard deviation (std) of the numeric columns.
# A constant column shows up with variance 0 and std 0.
for label, stat_fn in (('Variance', df.var), ('Standard Deviation', df.std)):
    print(label)
    print(stat_fn(numeric_only=True))
    print()

# Conditional variance or std check.
# 50 is only an example value; comparing against 0 instead would flag the
# constant columns.
print("Condional var/std")
df.var(numeric_only=True) == 50

Variance
Constant Numeric      0.0
Varied Numeric      250.0
Binary Column         0.3
dtype: float64

Standard Deviation
Constant Numeric     0.000000
Varied Numeric      15.811388
Binary Column        0.547723
dtype: float64

Condional var/std


Constant Numeric    False
Varied Numeric      False
Binary Column       False
dtype: bool

In [11]:
df.describe()  # no include/exclude given: numeric columns are summarized (the datetime column 'Date' also appears in the output below)

# For numeric columns it reports 8 statistics: count, mean, std, min, 25%, 50%, 75%, max
# Useful for manually checking count, std, etc.; std of 0 points to a constant numeric column
# For categorical columns it reports count, unique, top, freq; pass include='all' to see them

Unnamed: 0,Constant Numeric,Varied Numeric,Date
count,5.0,5.0,5
mean,100.0,30.0,2023-01-01 14:24:00
min,100.0,10.0,2023-01-01 00:00:00
25%,100.0,20.0,2023-01-01 00:00:00
50%,100.0,30.0,2023-01-01 00:00:00
75%,100.0,40.0,2023-01-02 00:00:00
max,100.0,50.0,2023-01-03 00:00:00
std,0.0,15.811388,


In [12]:
# include='all' summarizes every column, whatever its dtype
df.describe(include="all")

Unnamed: 0,Constant Numeric,Varied Numeric,Binary Column,Trinary Category,Date,Categorical Column,Object Column
count,5.0,5.0,5,5,5,5,5
unique,,,2,3,,3,3
top,,,True,High,,A,alpha
freq,,,3,2,,2,2
mean,100.0,30.0,,,2023-01-01 14:24:00,,
min,100.0,10.0,,,2023-01-01 00:00:00,,
25%,100.0,20.0,,,2023-01-01 00:00:00,,
50%,100.0,30.0,,,2023-01-01 00:00:00,,
75%,100.0,40.0,,,2023-01-02 00:00:00,,
max,100.0,50.0,,,2023-01-03 00:00:00,,


#### other include parameters available

Values accepted by the `include` parameter:
- 'all' (All columns, regardless of type)
- 'number' (Only numeric columns (int, float))
- 'object' (Only string/object columns)
- 'category' (Only categorical dtype columns)
- 'datetime' (Only datetime columns)
- 'bool' (Only boolean columns)
- A list of types (mixed), e.g., include=['number', 'object']

When `include` is omitted, numeric columns are summarized; in the `df.describe()` output above, the datetime column `Date` is summarized as well.

In [13]:
# `exclude` works like `include` in reverse: the listed dtypes are left out.
# Excluding 'number' leaves only the non-numeric columns in the summary.
df.describe(exclude="number")

Unnamed: 0,Binary Column,Trinary Category,Date,Categorical Column,Object Column
count,5,5,5,5,5
unique,2,3,,3,3
top,True,High,,A,alpha
freq,3,2,,2,2
mean,,,2023-01-01 14:24:00,,
min,,,2023-01-01 00:00:00,,
25%,,,2023-01-01 00:00:00,,
50%,,,2023-01-01 00:00:00,,
75%,,,2023-01-02 00:00:00,,
max,,,2023-01-03 00:00:00,,


In [14]:
# Leave out both the numeric and the object columns
excluded_dtypes = ['number', 'object']
df.describe(exclude=excluded_dtypes)

Unnamed: 0,Binary Column,Date,Categorical Column
count,5,5,5
unique,2,,3
top,True,,A
freq,3,,2
mean,,2023-01-01 14:24:00,
min,,2023-01-01 00:00:00,
25%,,2023-01-01 00:00:00,
50%,,2023-01-01 00:00:00,
75%,,2023-01-02 00:00:00,
max,,2023-01-03 00:00:00,


In [15]:
# Distinct-value count per column, checked again before dropping constant columns
df.nunique(axis=0)

Constant Numeric      1
Varied Numeric        5
Binary Column         2
Trinary Category      3
Date                  3
Categorical Column    3
Object Column         3
dtype: int64

In [16]:
# Remove every column that holds exactly one unique value.
# Handy when the frame has many columns; the `== 1` condition can be swapped
# for any other rule on the unique counts.

unique_counts = df.nunique()
constant_columns = unique_counts[unique_counts == 1].index
df = df.drop(columns=constant_columns)
df.nunique()

Varied Numeric        5
Binary Column         2
Trinary Category      3
Date                  3
Categorical Column    3
Object Column         3
dtype: int64