#### In Python, "SMOTE" stands for "Synthetic Minority Oversampling Technique," 
- which is a method used to balance out an imbalanced dataset by creating artificial data points for the minority class,
- essentially "copying" existing minority data points with slight variations to make the dataset more even between the majority and minority classes; 
- it's like adding more examples of the rare category to make your machine learning model learn better from it. 

In [2]:
import pandas as pd

In [3]:
from imblearn.over_sampling import SMOTE

In [4]:
data=pd.read_csv('car_evaluation.csv')
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [5]:
data.shape

(1728, 7)

In [6]:
data.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'outcome'], dtype='object')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   outcome   1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [16]:
data.outcome.value_counts()

outcome
unacc    1210
acc       384
good       69
vgood      65
Name: count, dtype: int64

In [24]:
X=data.iloc[:,:-1]
y=data.outcome

In [26]:
print(data.columns)

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'outcome'], dtype='object')


In [28]:
y

0       unacc
1       unacc
2       unacc
3       unacc
4       unacc
        ...  
1723     good
1724    vgood
1725    unacc
1726     good
1727    vgood
Name: outcome, Length: 1728, dtype: object

In [30]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [42]:
import warnings
warnings.filterwarnings('ignore')

In [56]:
X.loc[:, ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']] = \
X.loc[:, ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']].apply(le.fit_transform)


In [58]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
dtypes: object(6)
memory usage: 81.1+ KB


In [60]:
X.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,0,0,2,1
1,3,3,0,0,2,2
2,3,3,0,0,2,0
3,3,3,0,0,1,1
4,3,3,0,0,1,2


In [64]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.30,random_state=10)

In [68]:
from sklearn.neighbors import KNeighborsClassifier
model= KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)

In [70]:
model.fit(X_train,y_train)

In [72]:
y_pred = model.predict(X_test)

In [76]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,y_pred)
accuracy

0.9267822736030829

In [78]:
pd.crosstab(y_test,y_pred)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,82,1,19,0
good,6,14,1,0
unacc,2,0,369,0
vgood,3,1,5,16


In [94]:
# acc
(82/(82+1+19)) *100


80.3921568627451

In [88]:
(14/(6+14+1))*100

66.66666666666666

In [90]:
369/(2+369)

0.9946091644204852

In [92]:
16/25

0.64

In [96]:
smote=SMOTE()

In [102]:
X_train_smote,y_train_smote = smote.fit_resample(X_train.astype('float'),y_train)


In [104]:
from collections import Counter
print('Before Smote:',Counter(y_train))
print('After Smote:',Counter(y_train_smote))

Before Smote: Counter({'unacc': 839, 'acc': 282, 'good': 48, 'vgood': 40})
After Smote: Counter({'acc': 839, 'unacc': 839, 'vgood': 839, 'good': 839})


In [None]:
model_new= KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
model_new= fit(X_train)