In [None]:
import pandas as pd
import numpy as np
import math
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
# Load the raw Adult CSV once. `mnist` and `data` are additional names bound to
# the very same DataFrame object (no copy is made), so a mutation through any
# of the three names is visible through the others.
df = mnist = data = pd.read_csv('/content/adult.csv')

In [None]:
# Preview the first rows of the loaded frame to sanity-check the parsed columns.
df.head()

In [None]:
# Print column names, dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Treat every column stored with the generic object dtype ('O') as categorical.
categorical = [name for name, dtype in df.dtypes.items() if dtype == 'O']
print(f'There are {len(categorical)} categorical variables\n')
print('The categorical variables are :\n\n', categorical)

In [None]:
# check missing values in categorical variables
# isnull() only counts real NaN entries; '?' placeholder strings are not
# counted here (they are converted to NaN in the cells that follow).
df[categorical].isnull().sum()

In [None]:
# Frequency of each distinct value in the workclass column.
df.workclass.value_counts()

In [None]:
# replace '?' values in workclass variable with `NaN`
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: the in-place form acts on
# an intermediate object and does not update `df` under pandas Copy-on-Write.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
df['workclass'] = df['workclass'].replace('?', np.nan)

In [None]:
# replace '?' values in occupation variable with `NaN`
# Assigned back (rather than inplace=True on the column selection) so the
# change reliably reaches `df`; `np.nan` replaces the removed `np.NaN` alias.
df['occupation'] = df['occupation'].replace('?', np.nan)

In [None]:
# replace '?' values in native-country variable with `NaN`
# Assigned back (rather than inplace=True on the column selection) so the
# change reliably reaches `df`; `np.nan` replaces the removed `np.NaN` alias.
df['native-country'] = df['native-country'].replace('?', np.nan)

In [None]:
# Every column whose dtype is not the generic object dtype counts as numerical.
numerical = [name for name, dtype in df.dtypes.items() if dtype != 'O']
print(f'There are {len(numerical)} numerical variables\n')
print('The numerical variables are :', numerical)

In [None]:
# check missing values in numerical variables
# Shows the number of NaN entries per numerical column.
df[numerical].isnull().sum()

In [None]:
# Re-count NaN entries per categorical column now that the '?' placeholders
# have been replaced in the cells above.
df[categorical].isnull().sum()

In [None]:
# Predictors are every column except the target; the target is the 'class' column.
Y = df['class']
X = df.drop(columns=['class'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# (rows, columns) of the training and test feature frames.
X_train.shape, X_test.shape

In [None]:
# Impute missing categorical values with the most frequent value seen in the
# TRAINING frame, for both the training and the test frame. The modes are
# computed once, before any filling happens.
impute_modes = {
    column: X_train[column].mode()[0]
    for column in ['workclass', 'occupation', 'native-country']
}
for df2 in [X_train, X_test]:
    for column, most_frequent in impute_modes.items():
        # Assign the filled column back to the frame: calling
        # fillna(..., inplace=True) on a column selection is chained assignment
        # and does not update the frame under pandas Copy-on-Write.
        df2[column] = df2[column].fillna(most_frequent)

In [None]:
# Verify the imputation on BOTH frames. Relying on the leaked loop variable
# `df2` would only inspect the last frame the imputation loop visited.
imputed_columns = ['workclass', 'occupation', 'native-country']
X_train[imputed_columns].isnull().sum(), X_test[imputed_columns].isnull().sum()

In [None]:
# Load the pre-split train/test CSVs and separate the 'class' target column
# from the feature columns in each.
train = pd.read_csv('/content/adult_train.csv')
test = pd.read_csv('/content/adult_test.csv')
x_train, y_train = train.drop(columns=['class']), train['class']
x_test, y_test = test.drop(columns=['class']), test['class']

In [None]:
# Display the training feature frame.
x_train

In [None]:
# Training features as a NumPy array.
# NOTE(review): no later cell visible in this notebook reads `x` — confirm it is still needed.
x=np.array(x_train)

In [None]:
# Display the training labels.
y_train

In [None]:
# The two target labels. Each literal starts with a space; presumably this
# matches the raw values in the 'class' column of the CSVs — verify.
classes = [' <=50K', ' >50K']

In [None]:
# Object-dtype columns of the training features are the categorical features.
categorical = [name for name, dtype in x_train.dtypes.items() if dtype == 'O']
print(categorical)

In [None]:
# All remaining (non-object dtype) columns are the numerical features.
numerical = [name for name, dtype in x_train.dtypes.items() if dtype != 'O']
print(numerical)

In [None]:
# Number of distinct values in each training feature column.
counts = x_train.nunique()
print(counts)

In [None]:
# Global lookup tables filled in by classPrior / condProbCategorical /
# condProbNumerical.
class_priors = {}

# likelihoodsCat[feature]['<value>_<class>'] starts at 0 for every value that
# occurs in the training column, paired with every class.
likelihoodsCat = {
    category: {
        feat_val + '_' + clas: 0
        for feat_val in np.unique(x_train[category])
        for clas in classes
    }
    for category in categorical
}

# likelihoodsNum[feature][class] is an (initially empty) dict of statistics.
likelihoodsNum = {
    number: {clas: {} for clas in classes}
    for number in numerical
}

In [None]:
# Show the zero-initialised categorical likelihood table.
print(likelihoodsCat)

In [None]:
def classPrior():
    """Fill the global ``class_priors`` with each class's share of ``y_train``.

    Every label in ``classes`` is first reset to 0 and then set to
    (number of training labels equal to the class) / (number of training labels).

    Returns:
        The updated global ``class_priors`` dict.
    """
    class_priors.update(dict.fromkeys(classes, 0))
    for clas in classes:
        matches = sum(y_train == clas)
        class_priors[clas] = matches / len(y_train)
    return class_priors

In [None]:
# Compute the class priors (this fills the global table) and print them.
print(classPrior())

In [None]:
# Column labels of the training feature frame, as a plain list.
features = x_train.columns.tolist()
print(features)

In [None]:
# Print, for each categorical feature, a {value: count} table over the training rows.
for category in categorical:
    value_counts = x_train[category].value_counts()
    feature_vals = value_counts.to_dict()
    print(feature_vals)

In [None]:
def condProbCategorical():
    """Fill the global ``likelihoodsCat`` with per-class relative frequencies.

    For every categorical feature and class, the entry
    ``likelihoodsCat[feature]['<value>_<class>']`` is set to
    (count of the value among training rows of that class) / (rows of that class).
    Only values that actually occur within the class are written; all other
    entries keep whatever value they already hold in the table.

    Returns:
        The updated global ``likelihoodsCat`` dict.
    """
    for category in categorical:
        table = likelihoodsCat[category]
        for clas in classes:
            outcome_count = sum(y_train == clas)
            # Index labels of the training rows that belong to this class.
            class_rows = y_train[y_train == clas].index.values.tolist()
            observed = x_train[category][class_rows].value_counts().to_dict()
            for feat_val, count in observed.items():
                table[feat_val + '_' + clas] = count/outcome_count
    return likelihoodsCat

In [None]:
# Fit the categorical likelihood table (updates the global) and print it.
print(condProbCategorical())

In [None]:
def condProbNumerical():
    """Fill the global ``likelihoodsNum`` with per-class mean and variance.

    For every numerical feature and class, the training rows of that class are
    selected and their ``mean()`` and ``var()`` are stored under the keys
    ``'mean'`` and ``'variance'``.

    Returns:
        The updated global ``likelihoodsNum`` dict.
    """
    for number in numerical:
        for clas in classes:
            # Index labels of the training rows that belong to this class.
            class_rows = y_train[y_train == clas].index.values.tolist()
            stats = likelihoodsNum[number][clas]
            stats['mean'] = x_train[number][class_rows].mean()
            stats['variance'] = x_train[number][class_rows].var()
    return likelihoodsNum

In [None]:
# Fit the numerical (mean/variance) table (updates the global) and print it.
print(condProbNumerical())

In [None]:
def prediction(df):
    """Predict a class label for every row of ``df`` with the fitted tables.

    For each row and each label in the global ``classes`` the score is

        (product of categorical likelihoods)
        * (product of Gaussian densities of the numerical features)
        * class prior

    using the global ``likelihoodsCat``, ``likelihoodsNum`` and
    ``class_priors`` tables. The label with the highest score wins; on a tie
    the label that comes first in ``classes`` is returned.

    Args:
        df: DataFrame containing at least the categorical and numerical
            feature columns listed at the top of the function body.

    Returns:
        numpy array with one predicted label per row of ``df``.

    Raises:
        KeyError: if a row holds a categorical value that has no entry in
            ``likelihoodsCat`` (e.g. a value never seen in training).

    Unlike earlier revisions this function no longer prints the full list of
    predictions, which flooded the notebook output on every call.
    """
    # The feature columns are selected by name, and the SAME lists are zipped
    # with the row values below, so every value is always scored against the
    # statistics of its own column.
    cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'native-country']
    num_cols = ['age', 'education-num', 'capital-gain', 'capital-loss',
                'hours-per-week']
    cat_rows = np.array(df[cat_cols])
    num_rows = np.array(df[num_cols])

    results = []
    for cat_vals, num_vals in zip(cat_rows, num_rows):
        probs_outcome = {}
        for clas in classes:
            cat_likelihood = 1
            for feat, feat_val in zip(cat_cols, cat_vals):
                cat_likelihood *= likelihoodsCat[feat][feat_val + '_' + clas]

            num_likelihood = 1
            for feat, feat_val in zip(num_cols, num_vals):
                mean = likelihoodsNum[feat][clas]['mean']
                var = likelihoodsNum[feat][clas]['variance']
                # Gaussian density of the value under this class's mean/variance.
                num_likelihood *= (1/math.sqrt(2*math.pi*var)) * np.exp(-(feat_val - mean)**2 / (2*var))

            # Unnormalised posterior: likelihood times prior.
            probs_outcome[clas] = cat_likelihood * num_likelihood * class_priors[clas]

        results.append(max(probs_outcome, key=probs_outcome.get))

    return np.array(results)


In [None]:
def accuracy_score(y_true, y_pred):
    """Return the percentage of positions where ``y_pred`` equals ``y_true``.

    Both inputs are converted to NumPy arrays and compared position by
    position, so plain lists, arrays and pandas Series are all accepted.

    Args:
        y_true: sequence of reference labels.
        y_pred: sequence of predicted labels, same length as ``y_true``.

    Returns:
        float in [0, 100], rounded to two decimals
        (matches / number of labels * 100).

    Raises:
        ValueError: if the inputs are empty or differ in shape.
    """
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if true_arr.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    matches = np.sum(pred_arr == true_arr)
    return round(float(matches) / float(true_arr.size) * 100, 2)

In [None]:
# Run the classifier once per split and reuse the predictions for all four
# metrics, instead of re-running prediction() for every metric.
train_pred = prediction(x_train)
test_pred = prediction(x_test)

print("Train Accuracy: {}".format(accuracy_score(y_train, train_pred)))
print("Train Precision: %.3f" % precision_score(y_train, train_pred, pos_label = ' >50K'))
print("Train Recall: %.3f" % recall_score(y_train, train_pred, pos_label = ' >50K'))
print("Train F1-Score: %.3f" % f1_score(y_train, train_pred, pos_label = ' >50K'))

print("\n")

print("Test Accuracy: {}".format(accuracy_score(y_test, test_pred)))
print("Test Precision: %.3f" % precision_score(y_test, test_pred, pos_label = ' >50K'))
print("Test Recall: %.3f" % recall_score(y_test, test_pred, pos_label = ' >50K'))
print("Test F1-Score: %.3f" % f1_score(y_test, test_pred, pos_label = ' >50K'))

[' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', 

Smoothing Techniques

Laplace

In [None]:
def integerconverter(a):
    """Overwrite each element of the 1-D sequence ``a`` with ``float(element)``.

    The conversion happens in place and the same object is returned. Despite
    the name, the values are converted to float, not int.
    """
    for i in range(len(a)):
        a[i] = float(a[i])
    return a


def integerconverter1(a):
    """Overwrite each element of the 2-D container ``a`` with ``float(element)``.

    The conversion happens in place, row by row, and the same object is
    returned. Despite the name, the values are converted to float, not int.

    Each row is walked over its own length, so an empty container is returned
    unchanged and rows of unequal length are converted completely.

    Args:
        a: indexable container of mutable rows (e.g. a list of lists or a
            2-D numpy array).

    Returns:
        ``a`` itself, after conversion.
    """
    for row in a:
        for j in range(len(row)):
            row[j] = float(row[j])
    return a

In [None]:
# Load the second Adult CSV and encode the income label: '<=50K' -> 0, '>50K' -> 1.
data = pd.read_csv('/content/adult (4).csv')
data['income'] = data['income'].replace({'<=50K': 0, '>50K': 1})

In [None]:
# Integer code tables for the categorical columns.
workclass_encoding = {'Private': 0, 'Self-emp-not-inc': 1, 'Self-emp-inc': 2, 'Federal-gov': 3, 'Local-gov': 4, 'State-gov': 5, 'Without-pay': 6, 'Never-worked': 7}
education_encoding = {'Bachelors': 0, 'Some-college': 1, '11th': 2, 'HS-grad': 3, 'Prof-school': 4, 'Assoc-acdm': 5, 'Assoc-voc': 6, '9th': 7, '7th-8th': 8,'12th': 9, 'Masters': 10, '1st-4th': 11,'10th': 12, 'Doctorate': 13, '5th-6th': 14,'Preschool': 15}
marital_status_encoding = {'Married-civ-spouse': 0, 'Divorced': 1, 'Never-married': 2, 'Separated': 3, 'Widowed': 4, 'Married-spouse-absent': 5, 'Married-AF-spouse': 6}
occupation_encoding = {'Tech-support': 0, 'Craft-repair': 1, 'Other-service': 2, 'Sales': 3, 'Exec-managerial': 4, 'Prof-specialty': 5, 'Handlers-cleaners': 6, 'Machine-op-inspct': 7, 'Adm-clerical': 8,'Farming-fishing': 9, 'Transport-moving': 10, 'Priv-house-serv': 11,'Protective-serv': 12, 'Armed-Forces': 13}
relationship_encoding = {'Wife': 0, 'Own-child': 1, 'Husband': 2, 'Not-in-family': 3, 'Other-relative': 4, 'Unmarried': 5}
race_encoding = {'White': 0, 'Asian-Pac-Islander': 1, 'Amer-Indian-Eskimo': 2, 'Other': 3, 'Black': 4}
native_country_encoding = {'United-States': 0, 'Cambodia': 1, 'England': 2, 'Puerto-Rico': 3, 'Canada': 4, 'Germany': 5, 'Outlying-US(Guam-USVI-etc)': 6, 'India': 7, 'Japan': 8,'Greece': 9, 'South': 10, 'China': 11,'Cuba': 12, 'Iran': 13, 'Honduras': 14,'Philippines':15,'Italy': 16,'Poland':17,'Jamaica':18,'Vietnam':19,'Mexico':20,'Portugal':21,'Ireland':22,'France':23,'Dominican-Republic':24,'Laos':25,'Ecuador':26,'Taiwan':27,'Haiti':28,'Columbia':29,'Hungary':30,'Guatemala':31,'Nicaragua':32,'Scotland':33,'Thailand':34,'Yugoslavia':35,'El-Salvador':36,'Trinadad&Tobago':37,'Peru':38,'Hong':39,'Holand-Netherlands':40}
sex_encoding = {'Female': 0, 'Male': 1}

# Column -> code table. Series.map turns any value that is missing from its
# table into NaN.
column_encodings = {
    'workclass': workclass_encoding,
    'education': education_encoding,
    'marital-status': marital_status_encoding,
    'occupation': occupation_encoding,
    'relationship': relationship_encoding,
    'race': race_encoding,
    'native-country': native_country_encoding,
    'gender': sex_encoding,
}
for encoded_column, code_table in column_encodings.items():
    data[encoded_column] = data[encoded_column].map(code_table)

# Name of the target column.
col = "income"

In [None]:
# Preview only the first rows, with every column visible. Printing the whole
# frame with display.max_rows=None exceeds the notebook's IOPub data-rate
# limit. The options are scoped by the context manager, so no global reset
# (pd.reset_option('all'), which emits deprecation warnings) is needed.
with pd.option_context('display.max_columns', None,
                       'display.precision', 2):
    display(data.head(10))



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  pd.reset_option('all')
  pd.reset_option('all')
: boolean
    use_inf_as_null had been deprecated and will be removed in a future
    version. Use `use_inf_as_na` instead.

  pd.reset_option('all')


Unnamed: 0.1,Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,25,0.0,226802,2,7,2,7.0,1,4,1,0,0,40,0.0,0
1,1,38,0.0,89814,3,9,0,9.0,2,0,1,0,0,50,0.0,0
2,2,28,4.0,336951,5,12,0,12.0,2,0,1,0,0,40,0.0,1
3,3,44,0.0,160323,1,10,0,7.0,2,4,1,7688,0,40,0.0,1
4,4,18,,103497,1,10,2,,1,0,0,0,0,30,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,48837,27,0.0,257302,5,12,0,0.0,0,0,0,0,0,38,0.0,0
48838,48838,40,0.0,154374,3,9,0,7.0,2,0,1,0,0,40,0.0,1
48839,48839,58,0.0,151910,3,9,4,8.0,5,0,0,0,0,40,0.0,0
48840,48840,22,0.0,201490,3,9,2,8.0,1,0,1,0,0,20,0.0,0


In [None]:
# NaN count per column, followed by the grand total over all columns.
null_counts = data.isnull().sum()
print(null_counts)
print("total                     ", null_counts.sum())

Unnamed: 0            0
age                   0
workclass          2799
fnlwgt                0
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64
total                      6465


In [None]:
# `dataset_new` is another name for `data` (not a copy), so the imputation
# below is visible through both names.
dataset_new = data

# These columns hold nominal category codes, so missing entries are filled
# with the most frequent code (the mode). The mean of arbitrary integer codes
# does not correspond to a meaningful category.
for nominal_column in ['workclass', 'occupation', 'native-country']:
    most_frequent_code = dataset_new[nominal_column].mode()[0]
    dataset_new[nominal_column] = dataset_new[nominal_column].fillna(most_frequent_code)

In [None]:
# Total number of NaN entries left in the frame after imputation.
print(data.isnull().sum().sum())

0


In [None]:
# Separate the predictors from the target column (named by `col`).
df1 = data.drop(['income'], axis=1)
df2 = data.loc[:, col]

# Columns that pandas named 'Unnamed: ...' carry no header in the CSV; here
# they hold a running row counter, which is not a predictor and would distort
# every model trained below, so they are removed from the feature matrix.
index_artifacts = [name for name in df1.columns if str(name).startswith('Unnamed:')]
df1 = df1.drop(columns=index_artifacts)

# One explicit conversion replaces the element-wise helper loops.
X = df1.to_numpy(dtype=float)
Y = df2.to_numpy(dtype='int64')

# X1 / Y1 are kept as additional names for the same arrays.
X1, Y1 = X, Y

# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X1, Y1, test_size=0.33, random_state=123)

In [None]:
from collections import defaultdict

# np.bincount below needs non-negative integer codes, so every array is cast
# to int64 first.
X_train = X_train.astype('int64', casting='unsafe')
y_train = y_train.astype('int64', casting='unsafe')
X_test = X_test.astype('int64', casting='unsafe')
y_test = y_test.astype('int64', casting='unsafe')

class_counts = np.bincount(y_train)
class_probs = class_counts / len(y_train)

probabilities = defaultdict(dict)

# Additive smoothing strength.
# NOTE(review): this section is headed "Laplace", which conventionally means
# alpha = 1; alpha = 0.1 is the more general Lidstone form — confirm intent.
alpha = 0.1

# Values that do not depend on the row being scored are computed once here.
labels = np.unique(y_train)
values_per_feature = X_train.max(axis=0) + 1   # codes run from 0 to max
rows_per_label = {}

for label in labels:
    subset = X_train[y_train == label]
    rows_per_label[label] = len(subset)
    for feature in range(X_train.shape[1]):
        feature_counts = np.bincount(subset[:, feature], minlength=values_per_feature[feature])
        feature_probs = (feature_counts + alpha) / (len(subset) + alpha*values_per_feature[feature])
        for value, prob in enumerate(feature_probs):
            probabilities[label][str(feature) + '_' + str(value)] = prob

predictions = []
for row in X_test:
    label_log_probabilities = {}
    for label in labels:
        # Log-probabilities are summed instead of multiplying raw
        # probabilities, so a long product of small factors cannot underflow.
        label_log_probability = math.log(class_probs[label])
        for feature, feature_value in enumerate(row):
            prob = probabilities[label].get(str(feature) + '_' + str(feature_value))
            if prob is None:
                # Value never seen in training for this feature: it gets the
                # smoothed probability of a zero count.
                prob = alpha / (rows_per_label[label] + alpha*values_per_feature[feature])
            label_log_probability += math.log(prob)
        label_log_probabilities[label] = label_log_probability
    predicted_label = max(label_log_probabilities, key=label_log_probabilities.get)
    predictions.append(predicted_label)
predictions = np.array(predictions)

# NOTE(review): `accuracy_score` resolves to whichever definition was bound
# last. The helper defined in this notebook returns a percentage, while
# sklearn.metrics.accuracy_score returns a fraction.
accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label=1)
recall = recall_score(y_test, predictions, pos_label=1)
f1 = f1_score(y_test, predictions, pos_label=1)
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)

Lidstone Smoothing

In [None]:
import numpy as np

def train_naive_bayes_lidstone(X_train, y_train, alpha):
    """Estimate smoothed class priors and per-class feature shares.

    Classes are taken to be the integers ``0 .. K-1`` where ``K`` is the
    number of distinct values in ``y_train``.

    Args:
        X_train: 2-D array of shape (instances, features).
        y_train: 1-D array of class ids, one per instance.
        alpha: additive smoothing constant.

    Returns:
        (prior_prob, likelihood) where
        ``prior_prob[c] = (rows of class c + alpha) / (instances + alpha*K)`` and
        ``likelihood[c, f] = (sum of feature f over class c + alpha)
        / (sum of all features over class c + alpha*features)``.
    """
    n_rows, n_cols = X_train.shape
    n_classes = len(np.unique(y_train))

    rows_by_class = [X_train[y_train == c] for c in range(n_classes)]

    prior_prob = np.array(
        [(len(rows) + alpha) / (n_rows + alpha*n_classes) for rows in rows_by_class],
        dtype=float,
    )
    likelihood = np.array(
        [(np.sum(rows, axis=0) + alpha) / (np.sum(rows) + alpha*n_cols) for rows in rows_by_class],
        dtype=float,
    ).reshape(n_classes, n_cols)

    return prior_prob, likelihood

def predict_naive_bayes_lidstone(X_test, prior_prob, likelihood):
    """Predict class ids with the multinomial Naive Bayes decision rule.

    The score of class ``c`` for a row ``x`` is

        log(prior_prob[c]) + sum_f x[f] * log(likelihood[c, f])

    This matches parameters estimated as per-class feature shares (each row of
    ``likelihood`` sums to 1). The Bernoulli form, which adds
    ``(1 - x[f]) * log(1 - likelihood[c, f])``, is only valid for 0/1 features
    and is deliberately not used.

    Args:
        X_test: 2-D array-like of shape (instances, features).
        prior_prob: 1-D array of class priors, length K.
        likelihood: 2-D array of shape (K, features) with strictly positive entries.

    Returns:
        1-D integer numpy array holding, for every row, the index of the
        highest-scoring class (first index on ties).
    """
    features = np.asarray(X_test, dtype=float)
    log_likelihood = np.log(np.asarray(likelihood, dtype=float))
    log_prior = np.log(np.asarray(prior_prob, dtype=float))

    # Shape (instances, K): one joint log-score per row and class.
    joint_log_prob = features @ log_likelihood.T + log_prior
    return np.argmax(joint_log_prob, axis=1).astype(int)
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Smoothing constant passed to the trainer.
alpha = 1.0

# Fit on the training split, predict the test split.
prior_prob, likelihood = train_naive_bayes_lidstone(X_train, y_train, alpha)
y_pred = predict_naive_bayes_lidstone(X_test, prior_prob, likelihood)

# Score the test predictions; class 1 is the positive label.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, pos_label=1)
    for metric in (precision_score, recall_score, f1_score)
)

for metric_label, metric_value in (
    ('Accuracy:', accuracy),
    ('Precision:', precision),
    ('Recall:', recall),
    ('F1-score:', f1),
):
    print(metric_label, metric_value)

Comparison

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Fit on the training split and score on the held-out TEST split. Scoring the
# rows the model was fitted on overstates performance and is not comparable
# with the Naive Bayes results above, which are reported on the test split.
logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train)
predictions = logisticRegr.predict(X_test)
accuracy_score(y_test, predictions)



KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Fit on the training split and score on the held-out TEST split. A
# k-nearest-neighbours model evaluated on its own training rows finds each row
# among its own neighbours, which inflates the reported accuracy.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy_score(y_test, y_pred)