In [1]:
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score # For Checking Accuracy
from sklearn.model_selection import GridSearchCV # For Hyperparameter Search
from sklearn.model_selection import cross_val_score # For Cross Validation
from sklearn.model_selection import train_test_split # Splitting Data For Train Test
from sklearn.naive_bayes import BernoulliNB 
from sklearn.naive_bayes import GaussianNB # For Gaussian Naive Bayes Model
from sklearn.naive_bayes import MultinomialNB # For Multinomial Naive Bayes Model
from sklearn.neighbors import KNeighborsClassifier # ML Algo KNN
# Suppress every warning for the rest of the session. This is a global filter,
# so diagnostics emitted by pandas and scikit-learn are hidden as well.
warnings.filterwarnings('ignore')

In [2]:
# Load the training and test sets. The directory is named once so the notebook
# can be pointed at another location by editing a single line.
DATA_DIR = "D:/csv"
train = pd.read_csv(DATA_DIR + "/train.csv")
test = pd.read_csv(DATA_DIR + "/test.csv")

# Only a cell's final expression is rendered, so mid-cell .head() previews
# would be discarded; report the dimensions explicitly instead.
print(train.shape)
print(test.shape)

(900000, 33)
(700000, 32)


In [3]:
# Drop the columns that are not used as model features. Rebinding to the result
# of drop(..., errors='ignore') instead of deleting in place keeps this cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
train = train.drop(columns=['id', 'f_27'], errors='ignore')

# 'id' is deliberately kept on `test`; it is used later to label the
# submission rows.
test = test.drop(columns=['f_27'], errors='ignore')

In [4]:
# Pull the label column out of the training frame; everything else is a feature
y = train['target']
X = train.drop('target', axis=1)


In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Report the dimensions of each piece of the split, one per line
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(630000, 30)
(270000, 30)
(630000,)
(270000,)


In [7]:
# Baseline: Bernoulli Naive Bayes trained on the training split and scored on
# the held-out split.
bnbTesting = BernoulliNB().fit(X_train, y_train)  # fit() returns the estimator itself
bnbTestingPred = bnbTesting.predict(X_test)
bnbAcc = accuracy_score(y_test, bnbTestingPred)
print ("Naive Bayes Accuracy: ", bnbAcc)

Naive Bayes Accuracy:  0.5810888888888889


In [8]:
# ======== NAIVE BAYES ======== 
# 6-fold cross-validation of a fresh Bernoulli NB on the training split
nav_clf = BernoulliNB()
nav_scores = cross_val_score(estimator=nav_clf, X=X_train, y=y_train, cv=6)
nav_mean = nav_scores.mean()
print('Naive Bayes Scores: ',nav_scores)
print('Naive Bayes Mean Score: ',nav_mean)

Naive Bayes Scores:  [0.58599048 0.58252381 0.58387619 0.58149524 0.5831619  0.58197143]
Naive Bayes Mean Score:  0.5831698412698412


In [9]:
# Show the current dimensions of both frames, one per line
for frame in (train, test):
    print(frame.shape)

(900000, 31)
(700000, 31)


In [10]:
# Start the submission frame from the test ids. The explicit .copy() makes it
# an independent DataFrame, so adding a column to it later is not an assignment
# on a selection of `test` (which pandas reports as SettingWithCopyWarning).
RafayCSVtoTest = test[['id']].copy()
RafayCSVtoTest

Unnamed: 0,id
0,900000
1,900001
2,900002
3,900003
4,900004
...,...
699995,1599995
699996,1599996
699997,1599997
699998,1599998


In [11]:
# Feature matrix for the test set: every column except the row identifier
predT = test.drop('id', axis=1)
predT.head(2)

Unnamed: 0,f_00,f_01,f_02,f_03,f_04,f_05,f_06,f_07,f_08,f_09,...,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_28,f_29,f_30
0,0.442517,0.17438,-0.999816,0.762741,0.186778,-1.074775,0.501888,6,6,0,...,2.749347,-1.0064,-1.193879,-2.435736,-2.42743,-1.966887,5.734205,99.478419,0,0
1,-0.605598,-0.305715,0.627667,-0.578898,-1.750931,1.35555,-0.190911,1,3,4,...,1.080762,2.382405,0.149442,1.883322,-2.848714,-0.725155,3.194219,-65.993825,1,0


In [12]:
# ======== GAUSSIAN NAIVE BAYES ========
# GaussianNB fits a per-class normal distribution to each feature. Its
# smoothing parameter is `var_smoothing` (Laplace smoothing applies to the
# Multinomial/Bernoulli variants, not this one); the default is used here.

classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred  =  classifier.predict(X_test)

gnbAcc = metrics.accuracy_score(y_test, y_pred) # Checking Accuracy Score
print ("Naive Bayes Accuracy: ", gnbAcc)

Naive Bayes Accuracy:  0.6253814814814814


In [13]:
# Label the test features with `classifier`, then show the labels and how many there are
predictionOnTest = classifier.predict(predT)
print(predictionOnTest, len(predictionOnTest), sep="\n")

[0 1 1 ... 0 0 0]
700000


In [14]:
# Attach the predictions as the `target` column. assign() returns a new frame
# instead of writing into the existing one, so no SettingWithCopyWarning is
# triggered when the frame was selected out of `test`; re-running the cell
# simply overwrites the column.
RafayCSVtoTest = RafayCSVtoTest.assign(target=predictionOnTest)
RafayCSVtoTest.head()

Unnamed: 0,id,target
0,900000,0
1,900001,1
2,900002,1
3,900003,0
4,900004,0


In [15]:
# Write the submission file to the working directory, without the index column.
# (A bare `.shape` expression ahead of this call would never be rendered, since
# only a cell's last expression is displayed, so none is kept here.)
RafayCSVtoTest.to_csv('RafayCSVtoTest.csv', index=False)

In [16]:
# Hyperparameter search for Gaussian NB over `var_smoothing`:
# 1000 log-spaced candidates from 1e0 down to 1e-9, each scored with 5-fold
# cross-validation (5000 fits in total when the search is run).
param_grid = {
    'var_smoothing': np.logspace(0,-9, num=1000)
}

# A fresh estimator is passed inline so the already-fitted `classifier` used
# for the test-set predictions is not replaced by an unfitted model.
grid = GridSearchCV(estimator=GaussianNB(), param_grid=param_grid, cv=5)


In [17]:
# Run the search on the training split; the fitted search object is the cell's output
grid.fit(X=X_train, y=y_train)


GridSearchCV(cv=5, estimator=GaussianNB(),
             param_grid={'var_smoothing': array([1.00000000e+00, 9.79469667e-01, 9.59360829e-01, 9.39664831e-01,
       9.20373200e-01, 9.01477631e-01, 8.82969996e-01, 8.64842328e-01,
       8.47086827e-01, 8.29695852e-01, 8.12661920e-01, 7.95977700e-01,
       7.79636013e-01, 7.63629826e-01, 7.47952252e-01, 7.32596543e-01,
       7.17556092e-01, 7.02824426e-01, 6.88...
       1.61141428e-09, 1.57833141e-09, 1.54592774e-09, 1.51418933e-09,
       1.48310251e-09, 1.45265393e-09, 1.42283046e-09, 1.39361927e-09,
       1.36500781e-09, 1.33698374e-09, 1.30953502e-09, 1.28264983e-09,
       1.25631660e-09, 1.23052400e-09, 1.20526094e-09, 1.18051653e-09,
       1.15628013e-09, 1.13254132e-09, 1.10928986e-09, 1.08651577e-09,
       1.06420924e-09, 1.04236067e-09, 1.02096066e-09, 1.00000000e-09])})

In [18]:
# Summarise the winning candidate together with its mean cross-validated score
best_summary = (grid.best_params_, grid.best_score_)
print("The best parameters are %s with a score of %0.2f" % best_summary)

The best parameters are {'var_smoothing': 2.024446509976806e-06} with a score of 0.63


In [19]:
# Tabulate every candidate next to its mean cross-validated accuracy.
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
grid_results = pd.DataFrame(grid.cv_results_["params"]).assign(
    Accuracy=grid.cv_results_["mean_test_score"]
)
grid_results.head()

Unnamed: 0,var_smoothing,Accuracy
0,1.0,0.532541
1,0.97947,0.532857
2,0.959361,0.532998
3,0.939665,0.53323
4,0.920373,0.533468
