In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import metrics, svm
from sklearn.metrics import precision_score, recall_score, roc_curve, confusion_matrix, jaccard_score, f1_score

### Reading in simulated data

In [2]:
df = pd.read_json('sim_data.json')

In [3]:
df.describe()

Unnamed: 0,prior_exp,usage_hours,publishes,ai,at3,at4,at5,utility,conversion
count,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
mean,1.89044,250.175218,9.599912,14.051599,1.197865,3.99886,5.004053,27.500887,0.41233
std,0.396343,144.626707,2.198206,3.433038,1.076411,1.000608,2.887509,4.202783,0.492255
min,0.0,0.001127,0.0,0.0,0.0,-0.446405,0.00014,1.257861,0.0
25%,2.0,124.657227,9.229995,14.102369,0.0,3.325622,2.498452,27.03646,0.0
50%,2.0,250.366247,9.949248,14.889738,1.0,3.996904,5.004031,28.553327,0.0
75%,2.0,375.300415,10.643674,15.606132,2.0,4.673544,7.505106,29.861572,1.0
max,2.0,499.998308,14.683496,19.357789,3.0,8.738615,9.99999,37.191022,1.0


In [4]:
drop_cols = ['prior_exp','usage_hours','utility']
df = df.drop(columns=drop_cols)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 0 to 199999
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   publishes   200000 non-null  float64
 1   ai          200000 non-null  float64
 2   at3         200000 non-null  int64  
 3   at4         200000 non-null  float64
 4   at5         200000 non-null  float64
 5   conversion  200000 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 10.7 MB


### Train/Test Split

Splitting the data into a training dataset and test dataset.  We will train on the training set and evalute predictive power on the test data set

In [5]:
y = df['conversion']
x = df.drop(columns='conversion')
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=9)

### Splitting the training data into 3 seperate data frames. 

In addition to just overal predictive power as a benchmark, I am also thinking we can evalute the different methods on how accruate they are with more limited data sizes.  I create 3 databases, one with 10k records, another with 25k records, and a final with 50k records.  I will then train the model on all.  This will allow us to evalute how much better different models get with larger training data.

In [6]:
x_train_10k = x_train[0:10000]
x_train_25k = x_train[0:25000]
x_train_50k = x_train[0:50000]

y_train_10k = y_train[0:10000]
y_train_25k = y_train[0:25000]
y_train_50k = y_train[0:50000]

### Preditive Power Benchmarks

In [7]:
def class_metrics(prediction):
    a = print("Accuracy:",metrics.accuracy_score(y_test, prediction))
    p = print("Precision:",metrics.precision_score(y_test, prediction))
    r = print("Recall:",metrics.recall_score(y_test, prediction))
    f = print("F-1 Score:",metrics.f1_score(y_test, prediction))
    return a, p, r, f;

### Method 1 - Naive Bayes

In [8]:
#running first on x_train_10k
gn1 = GaussianNB()
gnm1 = gn1.fit(x_train_10k, y_train_10k)
gnr1 = gnm1.predict(x_test)
print(class_metrics(gnr1))
print(gnm1.get_params())

Accuracy: 0.70561
Precision: 0.5895155698911766
Recall: 0.9466321368928045
F-1 Score: 0.7265634433370797
(None, None, None, None)
{'priors': None, 'var_smoothing': 1e-09}


In [9]:
#running second time on x_train_20k
gn2 = GaussianNB()
gnm2 = gn2.fit(x_train_25k, y_train_25k)
gnr2 = gnm2.predict(x_test)
print(class_metrics(gnr2))
print(gnm2.get_params())

Accuracy: 0.70924
Precision: 0.5930577306107555
Recall: 0.9440666069656558
F-1 Score: 0.7284849843119677
(None, None, None, None)
{'priors': None, 'var_smoothing': 1e-09}


In [10]:
#running second time on x_train_20k
gn3 = GaussianNB()
gnm3 = gn3.fit(x_train_50k, y_train_50k)
gnr3 = gnm3.predict(x_test)
print(class_metrics(gnr3))
print(gnm3.get_params())

Accuracy: 0.70537
Precision: 0.5893050867888139
Recall: 0.9466079337802842
F-1 Score: 0.7263964340437387
(None, None, None, None)
{'priors': None, 'var_smoothing': 1e-09}
