In [1]:
# Step 1: Set Up the Environment
# Objective: Ensure you have the right tools and packages installed.

# 1. Install the required libraries.
# 2. Verify the installation by importing the libraries in a Python script or Jupyter notebook





# Step 2: Load & Explore the Dataset
# Objective: Load data into a pandas DataFrame and obtain a basic understanding of its structure.

# 3. Load a CSV file into a DataFrame.
# 4. Display the first few records to understand the structure.
# 5. Get a summary of the dataset.






# Step 3: Perform NumPy Operations
# Objective: Utilize NumPy for basic numerical operations and array manipulations.

# 6. Convert a DataFrame column to a NumPy array and perform array operations like mean and sum.
# 7. Create a NumPy array and calculate the variance and standard deviation.
# 8. Use NumPy to filter based on conditions.








# Step 4: Data Manipulation with Pandas
# Objective: Use Pandas to clean and manipulate dataset for analysis.

# 9. Handle missing data by filling or dropping.
# 10. Create new columns or modify existing ones.
# 11. Use groupby to aggregate data.







# Step 5: Data Visualization with Matplotlib & Seaborn
# Objective: Visualize the data to identify patterns, trends, and insights.

# 12. Use Matplotlib to create a basic plot.
# 13. Create a histogram using Seaborn.
# 14. Plot a box plot for a clear view of data distribution.

# Step 1: Set Up the Environment
# Install required packages (run this in terminal/command prompt first)
# pip install numpy pandas matplotlib seaborn scikit-learn

# Verify installations by importing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
print("All packages imported successfully!")

# Step 2: Load & Explore the Dataset
# Load sample dataset (replace with your CSV file)
df = sns.load_dataset('iris')

# Display first records and summary
print("\nFirst 5 records:")
print(df.head())

print("\nDataset info:")
print(df.info())

print("\nStatistical summary:")
print(df.describe())

# Step 3: Perform NumPy Operations
# Convert column to NumPy array
sepal_length = df['sepal_length'].to_numpy()

print("\nNumPy operations:")
print("Mean:", np.mean(sepal_length))
print("Sum:", np.sum(sepal_length))
print("Variance:", np.var(sepal_length))
print("Standard deviation:", np.std(sepal_length))

# Filtering with NumPy
filtered = sepal_length[sepal_length > 5.0]
print("\nValues > 5.0:", filtered)

# Step 4: Data Manipulation with Pandas
# Handle missing data (though iris has none)
print("\nMissing values before:", df.isnull().sum())
df_filled = df.fillna(df.mean())  # Would fill if there were missing values

# Create new column
df['sepal_area'] = df['sepal_length'] * df['sepal_width']

# Groupby operations
grouped = df.groupby('species').mean()
print("\nGrouped averages:")
print(grouped)

# Step 5: Data Visualization
# Matplotlib basic plot
plt.figure(figsize=(10, 6))
plt.plot(df['sepal_length'][:50], 'b-', label='Sepal Length')
plt.title('First 50 Sepal Length Measurements')
plt.xlabel('Index')
plt.ylabel('Length (cm)')
plt.legend()
plt.show()

# Seaborn histogram
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='sepal_length', hue='species', element='step')
plt.title('Sepal Length Distribution by Species')
plt.show()

# Box plot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='species', y='petal_length')
plt.title('Petal Length Distribution by Species')
plt.show()






All packages imported successfully!

First 5 records:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Statistical summary:
       sepal_length  sepal_width  petal_length  petal_width
cou

TypeError: Could not convert ['setosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosasetosaversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorversicolorvirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginicavirginica'] to numeric