In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [None]:
filename = "Customers.csv"
dataset = pd.read_csv(filename)
dataset.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              2000 non-null   int64 
 1   Gender                  2000 non-null   object
 2   Age                     2000 non-null   int64 
 3   Annual Income ($)       2000 non-null   int64 
 4   Spending Score (1-100)  2000 non-null   int64 
 5   Profession              1965 non-null   object
 6   Work Experience         2000 non-null   int64 
 7   Family Size             2000 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 125.1+ KB


In [None]:
dataset.isnull().sum()

CustomerID                 0
Gender                     0
Age                        0
Annual Income ($)          0
Spending Score (1-100)     0
Profession                35
Work Experience            0
Family Size                0
dtype: int64

In [None]:
# Drop rows with missing values in 'Profession' column
dataset.dropna(how = 'any', subset = ['Profession'], inplace = True)

In [None]:
# Perform label encoding for 'Gender' column
dataset['Gender'] = dataset['Gender'].map({'Male': 0, 'Female': 1})

In [None]:
# Perform one-hot encoding for 'Profession' column
dataset = pd.get_dummies(dataset, columns=['Profession'], dummy_na=True)

In [None]:
print(dataset)

      CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)  \
0              1       0   19              15000                      39   
1              2       0   21              35000                      81   
2              3       1   20              86000                       6   
3              4       1   23              59000                      77   
4              5       1   31              38000                      40   
...          ...     ...  ...                ...                     ...   
1995        1996       1   71             184387                      40   
1996        1997       1   91              73158                      32   
1997        1998       0   87              90961                      14   
1998        1999       0   77             182109                       4   
1999        2000       0   90             110610                      52   

      Work Experience  Family Size  Profession_Artist  Profession_Doctor  \
0          

In [None]:
# Extract features (independent variables) and target (dependent variable)
x = dataset[['Age', 'Annual Income ($)', 'Spending Score (1-100)', 'Work Experience']]
y = dataset[['Profession_Artist' and 'Profession_Doctor' and 'Profession_Engineer' and 'Profession_Entertainment' and 'Profession_Executive' and 'Profession_Healthcare' and 'Profession_Homemaker' and 'Profession_Lawyer' and 'Profession_Marketing' and 'Profession_nan']]

In [None]:
print(x)

      Age  Annual Income ($)  Spending Score (1-100)  Work Experience
0      19              15000                      39                1
1      21              35000                      81                3
2      20              86000                       6                1
3      23              59000                      77                0
4      31              38000                      40                2
...   ...                ...                     ...              ...
1995   71             184387                      40                8
1996   91              73158                      32                7
1997   87              90961                      14                9
1998   77             182109                       4                7
1999   90             110610                      52                5

[1965 rows x 4 columns]


In [None]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, train_size=0.8, random_state=42)


In [None]:
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Reshape y_train to 1-dimensional array
y_train = y_train.values.ravel()

In [None]:
model = KNeighborsClassifier(n_neighbors = 10)
model.fit(x_train, y_train)

In [None]:
# Initialize and train the Random Forest classifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(x_train, y_train)

In [None]:
# Make predictions on test data
predictions = model.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print("KNN Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       393

    accuracy                           1.00       393
   macro avg       1.00      1.00      1.00       393
weighted avg       1.00      1.00      1.00       393

Confusion Matrix:
 [[393]]


In [None]:
# Make predictions on test data
predictions = rfc.predict(x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, predictions)
classification_rep = classification_report(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print("Random Forest Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)


Accuracy: 1.0
KNN Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       393

    accuracy                           1.00       393
   macro avg       1.00      1.00      1.00       393
weighted avg       1.00      1.00      1.00       393

Confusion Matrix:
 [[393]]


In [None]:
# Sample predictions
sam1 = [[22, 85000, 50, 4]]
print("Prediction for sample 1:", model.predict(sam1))

sam2 = [[31, 38000, 40, 5]]
print("Prediction for sample 2:", model.predict(sam2))


Prediction for sample 1: [0]
Prediction for sample 2: [0]
