In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.model_selection import KFold
import pyexpat
from pyexpat import model
from sklearn.ensemble import RandomForestClassifier  

TEHTÄVÄNANTO


List 2 interesting use cases for applying kNN algorithm for real world/ These may or may not be business cases

Use Case 1: Dividing customers for personalized marketing

Objective is to group customers based on their shopping behaviors to send them specific offers and recommendations.

How it works:
Retailers collect data about customers shopping habits - how often they shop, how much they spend, and what type of products they buy. Using this data, kNN can find customers who are similar to each other (targeted marketing).
It's useful because of personalized offers, companies increase the chances of that specific offer to be interesting for that specific person -> there will be less 'useless' offers, making customer experience better -> customers will more likely buy more products.


Use Case 2: Scam detection in financial transactions

Objective is to identify suspicious transactions that may be fake by comparing them to normal transaction patterns.

How it works:
Banks track customer transactions, for example the amount spent, where and when the purchase happened. By using kNN, banks can compare each new transaction to past transactions. If a transaction looks very different from usual behavior, it may be flagged as suspicious or potential fraud. 
As an example, if a person usually makes small purchases in one country but suddenly spends a lot of money overseas, kNN can detect this as an unusual pattern and flag it for review. It's useful for companies if they can detect unusual transaction early and stop it before it happens.



In [2]:
#Run kNN algorithm on that data 

penguin_df = pd.read_csv('penguins_size.csv')

penguin_df = penguin_df.dropna(axis=0)
penguin_df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


In [3]:
x = penguin_df.iloc[:,2 :-1]
y = penguin_df.iloc[:, [0, 1, -1]]
y

Unnamed: 0,species,island,sex
0,Adelie,Torgersen,MALE
1,Adelie,Torgersen,FEMALE
2,Adelie,Torgersen,FEMALE
4,Adelie,Torgersen,FEMALE
5,Adelie,Torgersen,MALE
...,...,...,...
338,Gentoo,Biscoe,FEMALE
340,Gentoo,Biscoe,FEMALE
341,Gentoo,Biscoe,MALE
342,Gentoo,Biscoe,FEMALE


In [4]:
x

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0
...,...,...,...,...
338,47.2,13.7,214.0,4925.0
340,46.8,14.3,215.0,4850.0
341,50.4,15.7,222.0,5750.0
342,45.2,14.8,212.0,5200.0


In [5]:
#Study the effect of different training/testing split portions on your data (like 80/20 training/testing split)

x_train, x_test, y_train, y_test= train_test_split(x, y, test_size = 0.2, shuffle = True, random_state = 0)

In [6]:
#Make experiments on your algorithms with different values of k

K=4
knn=KNeighborsClassifier(K)
knn.fit(x_train, y_train)
y_pred_sklearn= knn.predict(x_test)
print(y_pred_sklearn)

[['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Gentoo' 'Biscoe' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Adelie' 'Biscoe' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Gentoo' 'Biscoe' 'FEMALE']
 ['Adelie' 'Dream' 'MALE']
 ['Adelie' 'Torgersen' 'FEMALE']
 ['Adelie' 'Biscoe' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Gentoo' 'Biscoe' 'FEMALE']
 ['Chinstrap' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Torgersen' 'FEMALE']
 ['Adelie' 'Dream' 'MALE']
 ['Gentoo' 'Biscoe' 'FEMALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Gentoo' 'Biscoe' 'FEMALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Gentoo' 'Biscoe' 'MALE']
 ['Adelie' 'Biscoe' 'MALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Dream' 'MALE']
 ['Adelie' 'Dream' 'FEMALE']
 ['Adelie' 'Torgersen' 'FEMALE']
 ['Ge

In [7]:
print("Shape of y_test:", y_test.shape)
print("Shape of y_pred_sklearn:", y_pred_sklearn.shape)

Shape of y_test: (67, 3)
Shape of y_pred_sklearn: (67, 3)


In [8]:
# Drop rows with missing values in any column
penguin_df = penguin_df.dropna()

# Define x (features) and y (target)
# Select relevant features and target column based on dataset structure
x = penguin_df[['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']]  # Example features
y = penguin_df['species']  # Example target

# Convert categorical target to numeric if necessary
y = y.astype('category').cat.codes

# Inspect your features (x_train and x_test) to see if they contain any categorical data or string values
print(x_train.dtypes)
print(x_test.dtypes)

culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
dtype: object
culmen_length_mm     float64
culmen_depth_mm      float64
flipper_length_mm    float64
body_mass_g          float64
dtype: object


In [None]:
n_splits= 2 # Choose the number of splits
k=4
kf= KFold(n_splits= n_splits, shuffle=True) # Call the K Fold function. Note because of shuffle, the results may vary run by run
summa=0

for train_idx, test_idx in  kf.split(x): # Loop over the splits
    x_train = x.iloc[train_idx, :] # fetch the values
    x_test = x.iloc[test_idx, :]

    y_train = y.iloc[train_idx] # fetch the values
    y_test = y.iloc[test_idx]

    knn=KNeighborsClassifier(k) # model training 
    knn.fit(x_train, y_train)

    y_pred_test = knn.predict(x_test)

    summa += accuracy_score(y_pred_test, y_test) #cumulative sum 

print(summa/n_splits) #average accuracy

0.7155688622754491


In [10]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size = 0.3, shuffle = True, random_state = 42)

In [11]:
print(f"X_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_test shape: (101, 4)
y_test shape: (101,)


In [12]:
model = RandomForestClassifier()  # Create the model
model.fit(x_train, y_train)  # Fit the model on training data
y_pred_sklearn = model.predict(x_test)  # Generate predictions
print(len(y_pred_sklearn))  # Should print 101

101


In [13]:
accuracy_score(y_test, y_pred_sklearn)

0.9900990099009901