# K-Nearest Neighbor(KNN) 

## Load and preprocess the dataset

In [10]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



# Load the dataset
file_path = 'salary_data.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset
print("Dataset Head:")
print(df.head())    


#ID: A unique identifier for each individual record
#Age: The age of the individual in years between 22 and 65. This helps show the correlation between age and salary.
#Education: The highest level of education the individual has achieved: High School , Associate Degree , Bachelor Degree ,Master Degree ,PhD .
#Years_Experience: The number of years  between 1 and 40 years.
#Job_Role: The current role or job title of the individual. It can be one of the following: Software Engineer , Data Scientist ,Product ,anager,Consultant ,Designer .
#Salary: The individual's annual salary between 50,000 and 200,000 USD.

#Location: The city where the individual is employed. It can be one of the following: New York , San Francisco ,Los Angeles ,Austin,Seattle
#Class_Label: This represents whether the salary is considered "Above Median" or "Below Median" based on the salary data. This label is binary and is used as the target class in classification tasks.

Dataset Head:
   ID  Age         Education  Years_Experience           Job_Role  Salary  \
0   1   60       High School                23  Software Engineer  160078   
1   2   50     Master Degree                24  Software Engineer  130623   
2   3   36  Associate Degree                37    Product Manager   72671   
3   4   64     Master Degree                35  Software Engineer  143384   
4   5   29  Associate Degree                22         Consultant  136202   

      Location   Class_Label  
0     New York  Below Median  
1       Austin  Above Median  
2  Los Angeles  Below Median  
3     New York  Above Median  
4       Austin  Above Median  


## Preprocessing (Encoding categorical variables and normalization)

In [13]:
# Encode categorical variables (Job_Role, Education, Location, Class_Label)
label_encoders = {}
for column in ['Job_Role', 'Education', 'Location', 'Class_Label']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split data into features (X) and target (y)
X = df.drop(['Class_Label', 'ID'], axis=1)  # Drop ID and target column
y = df['Class_Label']

# Normalize the feature set
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Split the data into training and test sets

In [26]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print(f"Training data size: {len(X_train)}")
print(f"Test data size: {len(X_test)}")

Training data size: 40
Test data size: 10


## Train KNN Algorithm

In [28]:
# Initialize and train the KNN model
knn = KNeighborsClassifier(n_neighbors=5)  # You can experiment with different values of 'k'
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)


## Evaluate the Model 

In [30]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


KNN Model Accuracy: 80.00%
Confusion Matrix:
[[6 1]
 [1 2]]


In [32]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Classification Report: includes Precision, Recall, and F-measure
classification_report_results = classification_report(y_test, y_pred, target_names=['Below Median', 'Above Median'])
print("Classification Report:")
print(classification_report_results)


Confusion Matrix:
[[6 1]
 [1 2]]
Classification Report:
              precision    recall  f1-score   support

Below Median       0.86      0.86      0.86         7
Above Median       0.67      0.67      0.67         3

    accuracy                           0.80        10
   macro avg       0.76      0.76      0.76        10
weighted avg       0.80      0.80      0.80        10

