In [1]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

In [2]:
# Load the Iris dataset with load_iris()'s default arguments. The returned
# object's .data, .feature_names, .target and .target_names attributes are
# what the DataFrame-building cell reads next.
iris = load_iris()

In [3]:
# Build the working DataFrame: one column per measurement, followed by the
# integer class label ('target') and its readable name ('species').
species_by_code = dict(enumerate(iris.target_names))  # {0: name0, 1: name1, ...}
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
data['species'] = data['target'].map(species_by_code)


In [4]:
# Print a heading and, on the following lines, the first five rows
print("Dataset Preview:", data.head(), sep="\n")

Dataset Preview:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target species  
0       0  setosa  
1       0  setosa  
2       0  setosa  
3       0  setosa  
4       0  setosa  


In [5]:
# Applying NumPy functions
print("\n--- NumPy Functions ---")
# Convert the measurement columns to a NumPy array. The label columns are
# removed by name rather than by position, so the result stays correct even
# if columns are later added to or reordered in `data`.
np_data = data.drop(columns=['target', 'species']).to_numpy()
# Every reduction below uses axis=0, i.e. it collapses the rows and yields
# one value per feature column.
# Mean of each feature
mean_features = np.mean(np_data, axis=0)
print("Mean of features:", mean_features)
# Standard deviation of each feature.
# NOTE: np.std defaults to the population formula (ddof=0), so these values
# are slightly smaller than the sample std (ddof=1) that DataFrame.describe()
# reports for the same columns.
std_features = np.std(np_data, axis=0)
print("Standard deviation of features:", std_features)
# Min and Max values of each feature
min_features = np.min(np_data, axis=0)
max_features = np.max(np_data, axis=0)
print("Min values of features:", min_features)
print("Max values of features:", max_features)
# Column-wise sum of each feature
sum_features = np.sum(np_data, axis=0)
print("Sum of features:", sum_features)


--- NumPy Functions ---
Mean of features: [5.84333333 3.05733333 3.758      1.19933333]
Standard deviation of features: [0.82530129 0.43441097 1.75940407 0.75969263]
Min values of features: [4.3 2.  1.  0.1]
Max values of features: [7.9 4.4 6.9 2.5]
Sum of features: [876.5 458.6 563.7 179.9]


In [6]:
# Applying Pandas functions
print("\n--- Pandas Functions ---")
# Summary statistics of the numeric columns (the text 'species' column is
# left out by describe()'s default behaviour)
print("Summary statistics:")
print(data.describe())
# Group by species and average the measurement columns only. 'target' is
# dropped first: it is the class label, not a feature, and its per-species
# mean would just repeat the label value.
print("\nMean features grouped by species:")
print(data.drop(columns='target').groupby('species').mean())

# Count missing values in every column
print("\nNull value check:")
print(data.isnull().sum())

# Pairwise correlation between the numeric columns
print("\nCorrelation matrix:")
# Keep every numeric column (the measurements and the integer 'target');
# the text 'species' column cannot be correlated and is excluded
numerical_data = data.select_dtypes(include=np.number)
print(numerical_data.corr())
# Save the full DataFrame (labels included) to a CSV file in the working
# directory, without writing the row index
data.to_csv("iris_dataset.csv", index=False)
print("\nDataset saved to 'iris_dataset.csv'")


--- Pandas Functions ---
Summary statistics:
       sepal length (cm)  sepal width (cm)  petal length (cm)  \
count         150.000000        150.000000         150.000000   
mean            5.843333          3.057333           3.758000   
std             0.828066          0.435866           1.765298   
min             4.300000          2.000000           1.000000   
25%             5.100000          2.800000           1.600000   
50%             5.800000          3.000000           4.350000   
75%             6.400000          3.300000           5.100000   
max             7.900000          4.400000           6.900000   

       petal width (cm)      target  
count        150.000000  150.000000  
mean           1.199333    1.000000  
std            0.762238    0.819232  
min            0.100000    0.000000  
25%            0.300000    0.000000  
50%            1.300000    1.000000  
75%            1.800000    2.000000  
max            2.500000    2.000000  

Mean features grouped by 

In [None]:

# NumPy: Array Creation
print("\n--- NumPy Array Creation ---")
arr_from_list = np.array([1, 2, 3, 4, 5])  # from an existing Python list
arr_range = np.arange(10)                  # the integers 0 through 9
arr_zeros = np.zeros((2, 3))               # 2x3 array filled with 0.0
# Use a seeded generator so that Restart & Run All reproduces the same
# "random" values instead of a different array on every run.
rng = np.random.default_rng(42)
arr_random = rng.random((3, 3))            # 3x3 floats drawn from [0, 1)

print("Array from list:", arr_from_list)
print("Array with range:", arr_range)
# "\n" puts each 2-D array on its own lines below the label
print("Array of zeros:\n", arr_zeros)
print("Random array:\n", arr_random)


In [None]:

# NumPy: Dimension and Size Check
print("\n--- NumPy Dimensions and Size ---")
# Report each structural attribute of arr_random next to its label:
# shape (extent per axis), ndim (number of axes), size (total element count)
for label, attr in (("Shape:", "shape"), ("Dimensions:", "ndim"), ("Size:", "size")):
    print(label, getattr(arr_random, attr))


In [None]:

# NumPy: Array Manipulation
print("\n--- NumPy Array Manipulation ---")
# View the 1-D range as 2 rows x 5 columns
arr_reshaped = arr_range.reshape(2, 5)
# Join the reshaped array with itself side by side (column-wise)
arr_stacked = np.hstack((arr_reshaped, arr_reshaped))
# array_split (unlike np.split) accepts a length that does not divide evenly
# into 3 and returns a list of sub-arrays
arr_split = np.array_split(arr_range, 3)

# "\n" puts each 2-D array on its own lines below the label
print("Reshaped array:\n", arr_reshaped)
print("Horizontally stacked array:\n", arr_stacked)
print("Split array:", arr_split)


In [None]:

# NumPy: Arithmetic Operations
print("\n--- NumPy Arithmetic Operations ---")
arr1, arr2 = np.array([1, 2, 3]), np.array([4, 5, 6])

# Each operator is applied element by element to the two equal-length arrays
elementwise_results = {
    "Addition:": arr1 + arr2,
    "Subtraction:": arr1 - arr2,
    "Element-wise multiplication:": arr1 * arr2,
    "Element-wise division:": arr1 / arr2,
}
for label, result in elementwise_results.items():
    print(label, result)


In [None]:

# NumPy: Indexing, Slicing, and Masking
print("\n--- NumPy Indexing, Slicing, and Masking ---")
leading_five = arr_range[:5]   # basic slice: positions 0 through 4
above_five = arr_range > 5     # boolean mask with one flag per element

print("Original array:", arr_range)
print("First 5 elements:", leading_five)
# Indexing with the mask keeps only the elements whose flag is True
print("Elements greater than 5:", arr_range[above_five])


In [None]:

# NumPy: Shape Manipulation
print("\n--- NumPy Shape Manipulation ---")
# Collapse the 2-D array into a 1-D copy
flattened = arr_reshaped.flatten()
# Lay the same elements out as 5 rows x 2 columns
reshaped_again = flattened.reshape(5, 2)
print("Flattened array:", flattened)
# "\n" puts the 2-D array on its own lines below the label
print("Reshaped array:\n", reshaped_again)


In [None]:

# Pandas: Import/Export Data
print("\n--- Pandas Import/Export ---")
# Write the DataFrame to the working directory in two more formats:
#   - JSON Lines: one JSON object per row (orient="records" + lines=True)
#   - Excel workbook without the row index (needs an Excel writer engine,
#     e.g. openpyxl, to be installed)
data.to_json(path_or_buf="iris_dataset.json", orient="records", lines=True)
data.to_excel(excel_writer="iris_dataset.xlsx", index=False)
print("Dataset saved to 'iris_dataset.json' and 'iris_dataset.xlsx'")


In [None]:

# Pandas: Data Cleanup
print("\n--- Pandas Data Cleanup ---")
# Work on a copy so the original `data` stays intact for later cells
data_with_nan = data.copy()
# Introduce a NaN in the first row of the first column to have something to clean
data_with_nan.iloc[0, 0] = np.nan
# "\n" puts the table on its own lines below the label
print("Data with NaN value:\n", data_with_nan.head())

# Handle missing values: drop every row that contains at least one NaN
data_cleaned = data_with_nan.dropna()
print("Cleaned data:\n", data_cleaned.head())

# Check for duplicates: count rows of the original data that exactly repeat
# an earlier row
print("Duplicates check:", data.duplicated().sum())


In [None]:

# Pandas: Indexing and Grouping
print("\n--- Pandas Indexing and Grouping ---")
# For each species, compute mean / max / min of every other column; the
# result has two-level column labels of the form (column, statistic)
grouped = data.groupby("species").agg(["mean", "max", "min"])
# "\n" puts the table on its own lines below the label
print("Grouped data:\n", grouped)


In [None]:

# Pandas: Short Filtering
print("\n--- Pandas Short Filtering ---")
# Boolean-mask filter: keep only rows whose sepal length is strictly above 5.0
filtered_data = data[data["sepal length (cm)"] > 5.0]
# "\n" puts the table on its own lines below the label
print("Filtered data (sepal length > 5.0):\n", filtered_data.head())


In [None]:

# Pandas: Merge DataFrames
print("\n--- Pandas Merging ---")
# Two small frames that share the "key" column but only overlap on keys 2 and 3
df1 = pd.DataFrame({"key": [1, 2, 3], "value1": ["A", "B", "C"]})
df2 = pd.DataFrame({"key": [2, 3, 4], "value2": ["X", "Y", "Z"]})

# Inner join: keep only the keys present in both frames
merged = pd.merge(df1, df2, on="key", how="inner")
# "\n" puts the table on its own lines below the label
print("Merged DataFrame:\n", merged)


In [None]:

# Pandas: Statistical Visualization
# (matplotlib.pyplot is imported as `plt` in the notebook's import cell)

print("\n--- Pandas Statistical Visualization ---")
# Average only the measurement columns per species. 'target' is the class
# label, so leaving it in would draw a spurious extra bar under a chart
# titled "Mean Features".
species_means = data.drop(columns="target").groupby("species").mean()

# Explicit figure/axes so labels can be set on the axes that pandas draws on
fig, ax = plt.subplots(figsize=(8, 5))
species_means.plot(kind="bar", ax=ax, title="Mean Features by Species")
ax.set_xlabel("Species")
ax.set_ylabel("Mean value (cm)")
plt.show()
