In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris





In [2]:
# --- When to Use This Technique ---
# This initial data exploration is the very first step in any data analysis or machine learning project.
# It's used to get a basic understanding of the dataset's structure, content, and quality.
#
# --- Data Requirements ---
# This process works best with structured data, typically in a tabular format (like a CSV or database table).
# No specific data types are required, as the techniques are designed to reveal those types.



In [3]:
# 1. Load a sample dataset that ships with scikit-learn
# The Iris dataset records flower measurements for three iris species.
iris_data = load_iris()
# Build the DataFrame in a single expression:
#   - iris_data.data supplies the measurement values,
#   - iris_data.feature_names supplies the column labels,
#   - indexing target_names by the integer target codes yields one species name per row,
#     which is attached as the 'species' column for context.
df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names).assign(
    species=iris_data.target_names[iris_data.target]
)

print("--- 1. Dataset Loaded: Iris Flower Dataset ---")
print("The dataset is now loaded into a Pandas DataFrame.\n")




--- 1. Dataset Loaded: Iris Flower Dataset ---
The dataset is now loaded into a Pandas DataFrame.



In [4]:
# 2. Which columns (features) does the dataset contain?
# df.columns holds the label of every column; it is printed on its own line
# directly beneath the section header.
print("--- 2. Column Names (.columns) ---", df.columns, sep="\n")
# Two blank lines separate this section from whatever is printed next.
print("\n")




--- 2. Column Names (.columns) ---
Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'species'],
      dtype='object')




In [5]:
# 3. How large is the dataset?
# df.shape is a (rows, columns) tuple: position 0 is the number of observations,
# position 1 is the number of columns.
print("--- 3. Dataset Dimensions (.shape) ---")
rows = df.shape[0]
columns = df.shape[1]
print(f"Number of rows (observations): {rows}")
print(f"Number of columns (features): {columns}")
print("\n")




--- 3. Dataset Dimensions (.shape) ---
Number of rows (observations): 150
Number of columns (features): 5




In [6]:
# 4. What is the data type of each column, and are any values missing?
# .info() gives a concise summary of the DataFrame: the index range, every column
# with its non-null count and dtype, and the approximate memory usage.
print("--- 4. Data Types and Non-Null Values (.info) ---")
# .info() writes its report straight to standard output and returns None,
# so it is called as a bare statement rather than wrapped in print().
df.info()
print("\n")




--- 4. Data Types and Non-Null Values (.info) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   species            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB




In [7]:
# 5. Peek at a sample of the data
# .head(n) returns the first n rows and .tail(n) the last n rows.
# n=5 matches the default, spelled out here so the row count is visible in the code.
print("--- 5. First 5 Rows (.head) ---")
print(df.head(n=5))
print("\n")

# Looking at the end of the frame as well shows whether the rows are ordered
# (for example, grouped by species).
print("--- Last 5 Rows (.tail) ---")
print(df.tail(n=5))
print("\n")




--- 5. First 5 Rows (.head) ---
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

  species  
0  setosa  
1  setosa  
2  setosa  
3  setosa  
4  setosa  


--- Last 5 Rows (.tail) ---
     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
145                6.7               3.0                5.2               2.3   
146                6.3               2.5                5.0               1.9   
147                6.5               3.0                5.2               2.0   
148                6.2               3.4                5.4        

In [8]:
# 6. Summarise the numerical columns
# .describe() reports count, mean, std, min, the quartiles, and max for each
# numeric column; the table is printed on the line after the section header.
print(
    "--- 6. Descriptive Statistics for Numerical Columns (.describe) ---",
    df.describe(),
    sep="\n",
)
print("\n")



--- 6. Descriptive Statistics for Numerical Columns (.describe) ---
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)  
count        150.000000  
mean           1.199333  
std            0.762238  
min            0.100000  
25%            0.300000  
50%            1.300000  
75%            1.800000  
max            2.500000  




In [9]:
# 7. Read the dataset's documentation (its data dictionary)
# scikit-learn bundles a text description with each built-in dataset under the
# "DESCR" key of the object returned by the loader.
print("--- 7. Dataset Documentation (Data Dictionary) ---")
print(iris_data["DESCR"])

--- 7. Dataset Documentation (Data Dictionary) ---
.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

:Number of Instances: 150 (50 in each of three classes)
:Number of Attributes: 4 numeric, predictive attributes and the class
:Attribute Information:
    - sepal length in cm
    - sepal width in cm
    - petal length in cm
    - petal width in cm
    - class:
            - Iris-Setosa
            - Iris-Versicolour
            - Iris-Virginica

:Summary Statistics:

                Min  Max   Mean    SD   Class Correlation
sepal length:   4.3  7.9   5.84   0.83    0.7826
sepal width:    2.0  4.4   3.05   0.43   -0.4194
petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

:Missing Attribute Values: None
:Class Distribution: 33.3% for each of 3 classes.
:Creator: R.A. Fisher
:Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
:Date: July, 1988

The famous Iris database, first use