In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt



In [2]:
dataset_path = "ageIncome.csv"
age_income_df = pd.read_csv(dataset_path)

# Step 3: Load the Dataset into pandas dataframe
print("First 5 rows of the dataset:")
print(age_income_df.head())

First 5 rows of the dataset:
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40


In [3]:
age_income_missing = age_income_df.isnull().sum()
print("\nMissing Values in Each Column:")
print(age_income_missing[age_income_missing > 0])


Missing Values in Each Column:
Series([], dtype: int64)


In [4]:
age_income_description = age_income_df.describe()
print("\nStatistical Summary:")
print(age_income_description)


Statistical Summary:
       CustomerID         Age  Annual Income (k$)  Spending Score (1-100)
count  200.000000  200.000000          200.000000              200.000000
mean   100.500000   38.850000           60.560000               50.200000
std     57.879185   13.969007           26.264721               25.823522
min      1.000000   18.000000           15.000000                1.000000
25%     50.750000   28.750000           41.500000               34.750000
50%    100.500000   36.000000           61.500000               50.000000
75%    150.250000   49.000000           78.000000               73.000000
max    200.000000   70.000000          137.000000               99.000000


In [5]:
print("\nDataset Dimensions:", age_income_df.shape)


Dataset Dimensions: (200, 5)


In [6]:
print("\nData Types:")
print(age_income_df.dtypes)


Data Types:
CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object


In [7]:
if 'AgeGroup' in age_income_df.columns:
    age_income_df['AgeGroup'] = age_income_df['AgeGroup'].astype('category').cat.codes

print("\nUpdated Data Types:")
print(age_income_df.dtypes)


Updated Data Types:
CustomerID                 int64
Gender                    object
Age                        int64
Annual Income (k$)         int64
Spending Score (1-100)     int64
dtype: object


In [11]:
bins = [18, 25, 35, 45, 55, 65, 100]  # Define age bins
labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
age_income_df['AgeGroup'] = pd.cut(age_income_df['Age'], bins=bins, labels=labels)

print("\nUpdated Data Types:")
print(age_income_df.dtypes)

# Step 6: Descriptive Statistics - Measures of Central Tendency and Variability
# Provide summary statistics for income grouped by AgeGroup
if 'AgeGroup' in age_income_df.columns and 'Annual Income (k$)' in age_income_df.columns:
    summary_stats_age_income = age_income_df.groupby('AgeGroup')['Annual Income (k$)'].agg(['mean', 'median', 'min', 'max', 'std'])
    print("\nSummary Statistics for Annual Income Grouped by AgeGroup:")
    print(summary_stats_age_income)


Updated Data Types:
CustomerID                   int64
Gender                      object
Age                          int64
Annual Income (k$)           int64
Spending Score (1-100)       int64
AgeGroup                  category
dtype: object

Summary Statistics for Annual Income Grouped by AgeGroup:
               mean  median  min  max        std
AgeGroup                                        
18-25     45.029412    47.0   15   81  21.877739
26-35     68.150000    74.5   17  137  30.747096
36-45     72.055556    72.0   20  126  24.071257
46-55     57.729730    57.0   23  120  21.693123
56-65     55.294118    54.0   19   93  21.805288
65+       51.416667    51.5   19   63  12.616427


  summary_stats_age_income = age_income_df.groupby('AgeGroup')['Annual Income (k$)'].agg(['mean', 'median', 'min', 'max', 'std'])


In [10]:
print(age_income_df.columns)


Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
       'Spending Score (1-100)'],
      dtype='object')


In [12]:
iris_path = "iris.csv"
iris_df = pd.read_csv(iris_path)

In [13]:
iris_summary = iris_df.groupby('species').describe()
print("\nStatistical Summary for Iris Species:")
print(iris_summary)


Statistical Summary for Iris Species:
           sepal_length                                              \
                  count   mean       std  min    25%  50%  75%  max   
species                                                               
setosa             50.0  5.006  0.352490  4.3  4.800  5.0  5.2  5.8   
versicolor         50.0  5.936  0.516171  4.9  5.600  5.9  6.3  7.0   
virginica          50.0  6.588  0.635880  4.9  6.225  6.5  6.9  7.9   

           sepal_width         ... petal_length      petal_width         \
                 count   mean  ...          75%  max       count   mean   
species                        ...                                        
setosa            50.0  3.428  ...        1.575  1.9        50.0  0.246   
versicolor        50.0  2.770  ...        4.600  5.1        50.0  1.326   
virginica         50.0  2.974  ...        5.875  6.9        50.0  2.026   

                                               
                 std  min  25%  50%

In [14]:
iris_median = iris_df.groupby('species').median()
print("\nMedian values for Iris Species:")
print(iris_median)



Median values for Iris Species:
            sepal_length  sepal_width  petal_length  petal_width
species                                                         
setosa               5.0          3.4          1.50          0.2
versicolor           5.9          2.8          4.35          1.3
virginica            6.5          3.0          5.55          2.0


In [15]:
iris_percentiles = iris_df.groupby('species').quantile([0.25, 0.5, 0.75])  # 25th, 50th (median), and 75th percentile
print("\nPercentiles (25th, 50th, 75th) for Iris Species:")
print(iris_percentiles)


Percentiles (25th, 50th, 75th) for Iris Species:
                 sepal_length  sepal_width  petal_length  petal_width
species                                                              
setosa     0.25         4.800        3.200         1.400          0.2
           0.50         5.000        3.400         1.500          0.2
           0.75         5.200        3.675         1.575          0.3
versicolor 0.25         5.600        2.525         4.000          1.2
           0.50         5.900        2.800         4.350          1.3
           0.75         6.300        3.000         4.600          1.5
virginica  0.25         6.225        2.800         5.100          1.8
           0.50         6.500        3.000         5.550          2.0
           0.75         6.900        3.175         5.875          2.3


In [16]:
iris_summary = iris_df.groupby('species').describe()
print("\nStatistical Summary for Iris Species:")
print(iris_summary)



Statistical Summary for Iris Species:
           sepal_length                                              \
                  count   mean       std  min    25%  50%  75%  max   
species                                                               
setosa             50.0  5.006  0.352490  4.3  4.800  5.0  5.2  5.8   
versicolor         50.0  5.936  0.516171  4.9  5.600  5.9  6.3  7.0   
virginica          50.0  6.588  0.635880  4.9  6.225  6.5  6.9  7.9   

           sepal_width         ... petal_length      petal_width         \
                 count   mean  ...          75%  max       count   mean   
species                        ...                                        
setosa            50.0  3.428  ...        1.575  1.9        50.0  0.246   
versicolor        50.0  2.770  ...        4.600  5.1        50.0  1.326   
virginica         50.0  2.974  ...        5.875  6.9        50.0  2.026   

                                               
                 std  min  25%  50%