In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
data = pd.read_excel('JPN Data.xlsx')

In [5]:
data

Unnamed: 0,ID,CURR_AGE,GENDER,ANN_INCOME,AGE_CAR,PURCHASE
0,00001Q15YJ,50,M,445344.000000,439,0
1,00003I71CQ,35,M,107634.000000,283,0
2,00003N47FS,59,F,502786.666667,390,1
3,00005H41DE,43,M,585664.000000,475,0
4,00007E17UM,39,F,705722.666667,497,1
...,...,...,...,...,...,...
39995,99988B18OG,49,M,478511.000000,464,1
39996,99990Q44VP,28,M,271419.000000,61,0
39997,99997Q09VJ,41,F,590704.000000,379,1
39998,99998J59EE,50,M,363160.000000,302,1


In [7]:
data['GENDER'] = data['GENDER'].map({'F':0, 'M':1})

In [15]:
x = data.iloc[:,1:-1].values
y = data.iloc[:,-1].values

In [17]:
# making of training and test dataset
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)

In [18]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [19]:
models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'SVM': svm.SVC(kernel='linear', random_state=0),
    'KNN': KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(criterion='entropy', random_state=0)
}

In [22]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'p': [1, 2]
}

In [25]:
grid_search = GridSearchCV(estimator=models['KNN'], param_grid=param_grid, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


In [26]:
best_knn = grid_search.best_estimator_
print(f"Best KNN parameters: {grid_search.best_params_}")

Best KNN parameters: {'metric': 'euclidean', 'n_neighbors': 11, 'p': 1}


In [27]:
models['KNN'] = best_knn

In [28]:
for name, model in models.items():
    model.fit(x_train, y_train)  
    y_pred = model.predict(x_test)
    print(name)
    print(confusion_matrix(y_test, y_pred))     
    print(accuracy_score(y_test, y_pred))  
    print(classification_report(y_test, y_pred)) 
    print("*************************")

Logistic Regression
[[2266 2002]
 [1243 4489]]
0.6755
              precision    recall  f1-score   support

           0       0.65      0.53      0.58      4268
           1       0.69      0.78      0.73      5732

    accuracy                           0.68     10000
   macro avg       0.67      0.66      0.66     10000
weighted avg       0.67      0.68      0.67     10000

*************************
SVM
[[2176 2092]
 [1209 4523]]
0.6699
              precision    recall  f1-score   support

           0       0.64      0.51      0.57      4268
           1       0.68      0.79      0.73      5732

    accuracy                           0.67     10000
   macro avg       0.66      0.65      0.65     10000
weighted avg       0.67      0.67      0.66     10000

*************************
KNN
[[2496 1772]
 [1403 4329]]
0.6825
              precision    recall  f1-score   support

           0       0.64      0.58      0.61      4268
           1       0.71      0.76      0.73      5732



In [29]:
data_2  = pd.read_excel('INDIAN_DATASET.xlsx')
data_2

Unnamed: 0,ID,CURR_AGE,GENDER,ANN_INCOME,DT_MAINT,DAYS,AGE_CAR
0,20710B05XL,54,M,1425390,2018-04-20,2019-07-01,437
1,89602T51HX,47,M,1678954,2018-06-08,2019-07-01,388
2,70190Z52IP,60,M,931624,2017-07-31,2019-07-01,700
3,25623V15MU,55,F,1106320,2017-07-31,2019-07-01,700
4,36230I68CE,32,F,748465,2019-01-27,2019-07-01,155
...,...,...,...,...,...,...,...
69995,35280V26PS,49,F,861770,2018-04-08,2019-07-01,449
69996,79863N01VR,59,M,888976,2018-05-17,2019-07-01,410
69997,94236O99QE,33,M,699676,2019-04-21,2019-07-01,71
69998,23696V12DP,41,M,1881922,2018-07-14,2019-07-01,352


In [30]:
data_2['GENDER'] = data_2['GENDER'].map({'M': 0, 'F': 1})

In [31]:
data_2.columns

Index(['ID', 'CURR_AGE', 'GENDER', 'ANN_INCOME', 'DT_MAINT', 'DAYS',
       'AGE_CAR'],
      dtype='object')

In [33]:
z = data_2.iloc[:,[1,2,3,6]].values

In [34]:
z = sc.transform(z)

In [35]:
# applying KNN classifier on z (model with highest accuracy)
y_pred_z = models['KNN'].predict(z)
y_pred_z

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [38]:
 # Assuming 1 indicates purchase
count = sum(y_pred_z == 1)

In [39]:
print(f"Predicted number of buyers in Indain Market : {count}")
if count >= 10000:
    print("Market entry is favorable.")
else:
    print("Market entry is not favorable.")

Predicted number of buyers in Indain Market : 56015
Market entry is favorable.
