# Task 2
### Write Python code that performs a complete preprocessing pipeline on your own raw dataset

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.decomposition import PCA
import pandas as pd

### Step 1: Load the data

In [2]:
# Read the raw customer CSV (path is relative to the working directory) into a
# DataFrame. The space before '.csv' is part of the path literal as written.
data = pd.read_csv(
    filepath_or_buffer='Shopping Mall Customer Segmentation Data .csv',
)

In [3]:
# Quick look at the raw frame: first rows, per-column dtypes / non-null counts,
# and summary statistics of the numeric columns.
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()
print(data.describe())

                            Customer ID  Age  Gender  Annual Income  \
0  d410ea53-6661-42a9-ad3a-f554b05fd2a7   30    Male         151479   
1  1770b26f-493f-46b6-837f-4237fb5a314e   58  Female         185088   
2  e81aa8eb-1767-4b77-87ce-1620dc732c5e   62  Female          70912   
3  9795712a-ad19-47bf-8886-4f997d6046e3   23    Male          55460   
4  64139426-2226-4cd6-bf09-91bce4b4db5e   24    Male         153752   

   Spending Score  
0              89  
1              95  
2              76  
3              57  
4              76  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15079 entries, 0 to 15078
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Customer ID     15079 non-null  object
 1   Age             15079 non-null  int64 
 2   Gender          15079 non-null  object
 3   Annual Income   15079 non-null  int64 
 4   Spending Score  15079 non-null  int64 
dtypes: int64(3), object(2)
memory us

### Step 2: Identify features

In [4]:
# Columns routed to the numeric branch (imputation + scaling).
numeric_features = [
    'Age',
    'Annual Income',
    'Spending Score',
]

# Columns routed to the categorical branch (imputation + one-hot encoding).
categorical_features = [
    'Gender',
]

### Step 3: Preprocessing for numeric data

In [5]:
# Numeric branch, applied in order:
#   1. 'imputer' - fill missing entries with the column mean
#   2. 'scaler'  - standardize each column to zero mean and unit variance
numeric_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

### Step 4: Preprocessing for categorical data

In [6]:
# Categorical branch, applied in order:
#   1. 'imputer' - fill missing entries with the most frequent category
#   2. 'onehot'  - expand categories into one-hot columns; categories not seen
#                  during fit are ignored at transform time instead of raising
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

### Step 5: Combine numeric and categorical transformers

In [7]:
# Send each column group through its own branch and concatenate the results.
# Columns named in neither list (e.g. 'Customer ID') are dropped; 'drop' is the
# ColumnTransformer default and is only spelled out here for the reader.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)

### Step 6: Add dimensionality reduction

In [8]:
# Project the preprocessed features onto 2 principal components (for
# visualization or efficiency).
# random_state is pinned because PCA's 'randomized' and 'arpack' solvers are
# stochastic; which solver the default svd_solver='auto' picks depends on the
# data shape and the installed scikit-learn version, so fixing the seed keeps
# the projection reproducible across Restart & Run All regardless.
pca = PCA(n_components=2, random_state=42)

### Step 7: Create the complete pipeline

In [9]:
# End-to-end pipeline: column-wise preprocessing followed by the PCA projection,
# so a single fit/transform call runs both stages in order.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('pca', pca),
    ]
)


### Step 8: Fit and transform the data

In [10]:
# Fit every pipeline stage on the loaded frame and return its transformed
# (PCA-projected) representation in a single call.
preprocessed_data = pipeline.fit_transform(X=data)

In [11]:
# Wrap the two PCA output columns in a labelled DataFrame and preview the
# first rows.
preprocessed_df = pd.DataFrame(
    data=preprocessed_data,
    columns=['PCA1', 'PCA2'],
)
preprocessed_df.head()

Unnamed: 0,PCA1,PCA2
0,-1.592701,1.062683
1,-0.642579,1.97947
2,-0.429188,-0.187192
3,-1.407204,-1.10303
4,-1.496757,0.844584
