In [1]:
import pandas as pd
from sklearn.datasets import load_boston


In [2]:

# Load the Boston Housing Dataset
boston = load_boston()

# Create a DataFrame from the dataset
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# Add the target column (median house prices) to the DataFrame
df['MEDV'] = boston.target

# Display the first few rows of the DataFrame
print(df.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [3]:
# Filtering the 'CRIM' column
filtered_single_column = df['CRIM']

# Display the filtered_single_column
print(filtered_single_column.head())

0    0.00632
1    0.02731
2    0.02729
3    0.03237
4    0.06905
Name: CRIM, dtype: float64


In [4]:
# Filtering using multiple columns 'CRIM' and 'ZN'
filtered_columns = df[['CRIM', 'ZN']]

print(filtered_columns.head())

      CRIM    ZN
0  0.00632  18.0
1  0.02731   0.0
2  0.02729   0.0
3  0.03237   0.0
4  0.06905   0.0


In [5]:
# Filtering rows where 'CHAS' is equal to 1
filtered_by_chas = df[df['CHAS'] == 1]

print(filtered_by_chas.head())

        CRIM   ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
142  3.32105  0.0  19.58   1.0  0.871  5.403  100.0  1.3216  5.0  403.0   
152  1.12658  0.0  19.58   1.0  0.871  5.012   88.0  1.6102  5.0  403.0   
154  1.41385  0.0  19.58   1.0  0.871  6.129   96.0  1.7494  5.0  403.0   
155  3.53501  0.0  19.58   1.0  0.871  6.152   82.6  1.7455  5.0  403.0   
160  1.27346  0.0  19.58   1.0  0.605  6.250   92.6  1.7984  5.0  403.0   

     PTRATIO       B  LSTAT  MEDV  
142     14.7  396.90  26.82  13.4  
152     14.7  343.28  12.12  15.3  
154     14.7  321.02  15.12  17.0  
155     14.7   88.01  15.02  15.6  
160     14.7  338.92   5.50  27.0  


In [6]:
# Filter rows where 'RM' (average number of rooms per dwelling) is greater than 6
filtered_greater_than = df[df['RM'] > 6]

print(filtered_greater_than.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [7]:
# Filter rows where 'TAX' (full-value property-tax rate per $10,000) is less than 300
filtered_less_than = df[df['TAX'] < 300]

print(filtered_less_than.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [8]:
# Filter rows where 'NOX' (nitric oxides concentration) is greater than or equal to 0.5
filtered_greater_equal = df[df['NOX'] >= 0.5]

print(filtered_greater_equal.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900  1.0  296.0   
6  0.08829  12.5   7.87   0.0  0.524  6.012   66.6  5.5605  5.0  311.0   
7  0.14455  12.5   7.87   0.0  0.524  6.172   96.1  5.9505  5.0  311.0   
8  0.21124  12.5   7.87   0.0  0.524  5.631  100.0  6.0821  5.0  311.0   
9  0.17004  12.5   7.87   0.0  0.524  6.004   85.9  6.5921  5.0  311.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
6     15.2  395.60  12.43  22.9  
7     15.2  396.90  19.15  27.1  
8     15.2  386.63  29.93  16.5  
9     15.2  386.71  17.10  18.9  


In [9]:
# Filter rows where 'DIS' (weighted distances to five Boston employment centers) is less than or equal to 2.0
filtered_less_equal = df[df['DIS'] <= 2.0]

print(filtered_less_equal.head())

        CRIM   ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
123  0.15038  0.0  25.65   0.0  0.581  5.856  97.0  1.9444  2.0  188.0   
125  0.16902  0.0  25.65   0.0  0.581  5.986  88.4  1.9929  2.0  188.0   
126  0.38735  0.0  25.65   0.0  0.581  5.613  95.6  1.7572  2.0  188.0   
127  0.25915  0.0  21.89   0.0  0.624  5.693  96.0  1.7883  4.0  437.0   
128  0.32543  0.0  21.89   0.0  0.624  6.431  98.8  1.8125  4.0  437.0   

     PTRATIO       B  LSTAT  MEDV  
123     19.1  370.31  25.41  17.3  
125     19.1  385.02  14.81  21.4  
126     19.1  359.29  27.26  15.7  
127     21.2  392.11  17.19  16.2  
128     21.2  396.90  15.39  18.0  


In [10]:
# Filter houses with more than 6 rooms and located near an employment center
filtered_and = df[(df['RM'] > 6) & (df['DIS'] <= 2.0)]

print(filtered_and.head())

        CRIM   ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
128  0.32543  0.0  21.89   0.0  0.624  6.431   98.8  1.8125  4.0  437.0   
137  0.35233  0.0  21.89   0.0  0.624  6.454   98.4  1.8498  4.0  437.0   
139  0.54452  0.0  21.89   0.0  0.624  6.151   97.9  1.6687  4.0  437.0   
140  0.29090  0.0  21.89   0.0  0.624  6.174   93.6  1.6119  4.0  437.0   
145  2.37934  0.0  19.58   0.0  0.871  6.130  100.0  1.4191  5.0  403.0   

     PTRATIO       B  LSTAT  MEDV  
128     21.2  396.90  15.39  18.0  
137     21.2  394.08  14.59  17.1  
139     21.2  396.90  18.46  17.8  
140     21.2  388.08  24.16  14.0  
145     14.7  172.91  27.80  13.8  


In [11]:
# Filter houses with more than 6 rooms or located near an employment center
filtered_or = df[(df['RM'] > 6) | (df['DIS'] <= 2.0)]

print(filtered_or.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [12]:
# Filter houses that do not have more than 6 rooms
filtered_not = df[~(df['RM'] > 6)]

print(filtered_not.head())

       CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS  RAD    TAX  \
8   0.21124  12.5   7.87   0.0  0.524  5.631  100.0  6.0821  5.0  311.0   
12  0.09378  12.5   7.87   0.0  0.524  5.889   39.0  5.4509  5.0  311.0   
13  0.62976   0.0   8.14   0.0  0.538  5.949   61.8  4.7075  4.0  307.0   
15  0.62739   0.0   8.14   0.0  0.538  5.834   56.5  4.4986  4.0  307.0   
16  1.05393   0.0   8.14   0.0  0.538  5.935   29.3  4.4986  4.0  307.0   

    PTRATIO       B  LSTAT  MEDV  
8      15.2  386.63  29.93  16.5  
12     15.2  390.50  15.71  21.7  
13     21.0  396.90   8.26  20.4  
15     21.0  395.62   8.47  19.9  
16     21.0  386.85   6.58  23.1  


In [13]:
# Using .query() to filter data
filtered_query = df.query('RM > 6 and MEDV > 25')

print(filtered_query.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   
5  0.02985   0.0   2.18   0.0  0.458  6.430  58.7  6.0622  3.0  222.0   
7  0.14455  12.5   7.87   0.0  0.524  6.172  96.1  5.9505  5.0  311.0   

   PTRATIO       B  LSTAT  MEDV  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  
5     18.7  394.12   5.21  28.7  
7     15.2  396.90  19.15  27.1  


In [14]:
# Using .between() to filter data
filtered_between = df[df['TAX'].between(200, 400)]

print(filtered_between.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [15]:
# Using .isin() to filter data
age_values = [65.2, 90.0]
filtered_isin_age = df[df['AGE'].isin(age_values)]

print(filtered_isin_age.head())

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS   RAD    TAX  \
0    0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900   1.0  296.0   
104  0.13960   0.0   8.56   0.0  0.520  6.167  90.0  2.4210   5.0  384.0   
119  0.14476   0.0  10.01   0.0  0.547  5.731  65.2  2.7592   6.0  432.0   
460  4.81213   0.0  18.10   0.0  0.713  6.701  90.0  2.5975  24.0  666.0   

     PTRATIO       B  LSTAT  MEDV  
0       15.3  396.90   4.98  24.0  
104     20.9  392.69  12.33  20.1  
119     17.8  391.50  13.61  19.3  
460     20.2  255.23  16.42  16.4  


In [16]:
# Apply .where() condition
filtered_where = df.where(df['RM'] > 7)

print(filtered_where.head())

      CRIM   ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  PTRATIO  \
0      NaN  NaN    NaN   NaN    NaN    NaN   NaN     NaN  NaN    NaN      NaN   
1      NaN  NaN    NaN   NaN    NaN    NaN   NaN     NaN  NaN    NaN      NaN   
2  0.02729  0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0     17.8   
3      NaN  NaN    NaN   NaN    NaN    NaN   NaN     NaN  NaN    NaN      NaN   
4  0.06905  0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0     18.7   

        B  LSTAT  MEDV  
0     NaN    NaN   NaN  
1     NaN    NaN   NaN  
2  392.83   4.03  34.7  
3     NaN    NaN   NaN  
4  396.90   5.33  36.2  


In [17]:
# Create a sample DataFrame with datetime index
date_rng = pd.date_range(start='2023-01-01', end='2023-01-10', freq='D')
values = range(len(date_rng))
df = pd.DataFrame({'value': values}, index=date_rng)

print("Original DataFrame:")
print(df)

Original DataFrame:
            value
2023-01-01      0
2023-01-02      1
2023-01-03      2
2023-01-04      3
2023-01-05      4
2023-01-06      5
2023-01-07      6
2023-01-08      7
2023-01-09      8
2023-01-10      9


In [18]:
# Filtering by Date Indexing
filtered_date_index = df['2023-01-03':'2023-01-07']
print("Filtered DataFrame using Date Indexing:")
print(filtered_date_index)

Filtered DataFrame using Date Indexing:
            value
2023-01-03      2
2023-01-04      3
2023-01-05      4
2023-01-06      5
2023-01-07      6


In [19]:
# Filter data using .loc[] and a date condition
filtered_loc = df.loc[df.index >= '2023-01-05']

print(filtered_loc)

            value
2023-01-05      4
2023-01-06      5
2023-01-07      6
2023-01-08      7
2023-01-09      8
2023-01-10      9


In [20]:
# Filter data using .query() and a date condition
filtered_query = df.query('index >= "2023-01-06"')

print(filtered_query)

            value
2023-01-06      5
2023-01-07      6
2023-01-08      7
2023-01-09      8
2023-01-10      9


In [21]:
# Create a sample DataFrame with datetime index
date_rng = pd.date_range(start='2023-01-01 08:00', periods=10, freq='H')
values = range(len(date_rng))
df = pd.DataFrame({'value': values}, index=date_rng)

df

Unnamed: 0,value
2023-01-01 08:00:00,0
2023-01-01 09:00:00,1
2023-01-01 10:00:00,2
2023-01-01 11:00:00,3
2023-01-01 12:00:00,4
2023-01-01 13:00:00,5
2023-01-01 14:00:00,6
2023-01-01 15:00:00,7
2023-01-01 16:00:00,8
2023-01-01 17:00:00,9


In [22]:
# Filter data using .between_time() for a time interval
filtered_between_time = df.between_time('10:00', '16:00')

print(filtered_between_time)

                     value
2023-01-01 10:00:00      2
2023-01-01 11:00:00      3
2023-01-01 12:00:00      4
2023-01-01 13:00:00      5
2023-01-01 14:00:00      6
2023-01-01 15:00:00      7
2023-01-01 16:00:00      8


In [23]:
# Create a sample DataFrame with student exam scores
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Math': [85, np.nan, 72, 90, 65],
        'English': [90, 78, np.nan, 85, 92]}
df = pd.DataFrame(data)
df

NameError: name 'np' is not defined

In [None]:
# Using .isna() to filter rows with missing scores
missing_scores_mask = df.isna()
missing_scores_filtered = df[missing_scores_mask.any(axis=1)]

print("Original DataFrame:")
print(df)
print("\nMask for Missing Scores:")
print(missing_scores_mask)
print("\nRows with Missing Scores:")
print(missing_scores_filtered)

In [None]:
# Using .notna() to filter rows without missing scores
non_missing_scores_mask = df.notna()
non_missing_scores_filtered = df[non_missing_scores_mask.all(axis=1)]

print("Original DataFrame:")
print(df)
print("\nMask for Non-Missing Scores:")
print(non_missing_scores_mask)
print("\nRows without Missing Scores:")
print(non_missing_scores_filtered)