In [1]:
# Brenda Woodard
# Assignment 5

# Goal: The goal of this assignment is to give you the opportunity to build the Naïve Bayes Algorithm from scratch as well
# as using tools built into sklearn. 

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn
import nltk
import string
import re
import statsmodels.tools.tools as stattools
import html
from scipy import stats
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import math
# In sklearn, there are different types of Naive Bayes constructors for fitting Naïve Bayes models, dependent on the nature 
# of the data. 
# For example:
#     MultinomialNB() is used for text classification when data is represented as a feature vector 
#     ComplementNB() is an adaptation of the standard  MultinomialNB() for imbalanced data
#     GaussianNB() is used if the features are assumed numerical & are assumed to follow a Gaussian or normal distribution 
#     BernoulliNB() is used when each feature follows a Bernoulli distribution. The data or all features are binary with 
#     values 0 or 1
#     CategoricalNB() is used when each feature has its own categorical distribution
            
#     (https://scikit-learn.org/stable/modules/naive_bayes.html)

In [2]:
# Question 1:
    
# The features in this data include workclass, education, race & gender. 
# The output variable is income & contains two categorical values (<=50k or >50k) indicating whether the income of an 
# individual is less than/equal to $50,000 or greater than $50,000 respectively. 

# Print the unique values of each variable in this data. 
income_eval_cat = pd.read_csv('income_evaluation_cat.csv') 
print(income_eval_cat)

# Implement Naïve Bayes from scratch using Bayes’ rule - You don’t need to define a function, but you could if you want
# You can do your calculations in Python, but you can not use the sklearn package
lb_make = LabelEncoder()
income_eval_cat['income_bin'] = lb_make.fit_transform(income_eval_cat[' income'])
income_eval_cat = income_eval_cat.copy()
print(income_eval_cat.head(20))

# Compute means for each attribute by income

means = income_eval_cat.groupby(income_eval_cat['income_bin']).mean()
print(means)
# Compute standard deviations of each attribute by income status
sigmas = income_eval_cat.groupby(income_eval_cat['income_bin']).std()
print(sigmas)

# Extract means of attributes given y = 0, y = 1
means_0 = means.iloc[0].values
print(means_0)
means_1 = means.iloc[1].values
print(means_1)

# Extract standard deviations of attributes given y = 0, y = 1
sigmas_0 = sigmas.iloc[0].values
print(sigmas_0)
sigmas_1 = sigmas.iloc[1].values
print(sigmas_1)

# Suppose that all the data you uploaded is the training data, classify a 
# test instance into the class income<=50 or income>50k
X = ['Private', 'Bachelors', 'White', 'Female'] 
print(X)

# densities of attribute values given a class y0, y1
densities_y0 = stats.norm(means_0, sigmas_0).pdf(x)
print(densities_y0)
densities_y1 = stats.norm(means_1, sigmas_1).pdf(x)
print(densities_y1)

# counts for each class y in the dataset
y_counts = income_eval_cat.income.value_counts()
print(y_counts)
print(sum(y_counts.values))

# the prior probabilities of y
p_ys = y_counts/sum(y_counts.values)
print(p_ys)
p_y0 = p_ys[0]
print(p_y0)
p_y1 = p_ys[1]
print(p_y1)
 
# You need to compute the posterior probabilities P(income<=50/X) & P(income>50k/X)
# posterior probability of class y0 given the data x p(y0/x) = p(y0)*p(x/y0)
posterior_y0 = p_y0*np.prod(densities_y0)
print(posterior_y0)
# posterior probability of class y1 given the data x p(y0/x) = p(y0)*p(x/y0)
posterior_y1 = p_y1*np.prod(densities_y1)
print(posterior_y1)

# print the class with the greater posterior probability as the predicted class maximizing a function is the same
# as minimizing the negative of the function so include negative signs to the posteriors
if np.argmin([-posterior_y0, -posterior_y1]) == 0:
    print('predict class y0')
else:
    print('predict class_y1')
        
        
# Preprocess or transform the features in the data using an appropriate scaler in sklearn 
# You don’t need to transform the output variable; it should still work fine in a text format
# Normalize the posterior probabilities
posterior_y1 = p_y1*np.prod(densities_y1)/sum([p_y1*np.prod(densities_y1), p_y0*np.prod(densities_y0)])
print(posterior_y1)
posterior_y0 = p_y0*np.prod(densities_y0)/sum([p_y1*np.prod(densities_y1), p_y0*np.prod(densities_y0)])
print(posterior_y0)
# Check that the posteriors add up to 1
print(posterior_y1 + posterior_y0)

# Randomly split the transformed input & output data into X_train, y_train, X_test & y_test using sklearn, use numpy 
# arrays to store the data
X_train = income_eval_cat.iloc[:,0:3]
X_train = np.array(X_train)
y_train = income_eval_cat[' income'].values
X_test = np.array(x).reshape(1, -1)

# Use an appropriate Naïve Bayes constructor in sklearn to construct & fit a Naïve Bayes model on the training data
# use the model to compute the accuracy score of the training & test set
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print(y_pred)

               workclass    education    race   gender  income
0              State-gov    Bachelors   White     Male   <=50K
1       Self-emp-not-inc    Bachelors   White     Male   <=50K
2                Private      HS-grad   White     Male   <=50K
3                Private         11th   Black     Male   <=50K
4                Private    Bachelors   Black   Female   <=50K
...                  ...          ...     ...      ...     ...
32556            Private   Assoc-acdm   White   Female   <=50K
32557            Private      HS-grad   White     Male    >50K
32558            Private      HS-grad   White   Female   <=50K
32559            Private      HS-grad   White     Male   <=50K
32560       Self-emp-inc      HS-grad   White   Female    >50K

[32561 rows x 5 columns]
            workclass      education                 race   gender  income  \
0           State-gov      Bachelors                White     Male   <=50K   
1    Self-emp-not-inc      Bachelors                White     

DataError: No numeric types to aggregate

In [6]:
# Question 2 

# Upload the income_evaluation_continuous.csv data provided on canvas
inc_eval_cont = pd.read_csv('income_evaluation_continuous.csv')
print(inc_eval_cont.head())
# The features in this data include age, education_num, & hours_per_week
# The output variable is income & contains two categorical values (<=50k or >50k) indicating whether the income of an
# individual is less than/equal to $50,000 or greater than $50,000 respectively

# Implement Naïve Bayes from scratch using Bayes’ rule - You don’t need to define a function, but you could if you want
# You can do your calculations in Python, but you can not use the sklearn package
# Assume that all the features or input variables follow a normal distribution
lb_make = LabelEncoder()
inc_eval_cont['income_bin'] = lb_make.fit_transform(inc_eval_cont[' income'])
inc_eval_cont = inc_eval_cont.copy()
print(inc_eval_cont.head(20))

# Compute the mean & standard deviation of each input variable such that the results are presented on the same table 
# or data frame. You can call the .apply() function on the pandas DataFrame.  

# Compute means for each attribute by income
means = inc_eval_cont.groupby(inc_eval_cont['income_bin']).mean()
print(means)
# Compute standard deviations of each attribute by income status
sigmas = inc_eval_cont.groupby(inc_eval_cont['income_bin']).std()
print(sigmas)
# Extract means of attributes given y = 0, y = 1
means_0 = means.iloc[0].values
print(means_0)
means_1 = means.iloc[1].values
print(means_1)
# Extract standard deviations of attributes given y = 0, y = 1
sigmas_0 = sigmas.iloc[0].values
print(sigmas_0)
sigmas_1 = sigmas.iloc[1].values
print(sigmas_1)

# Given that all the income_evaluation_continuous.csv data you uploaded is the training data , classify a test instance,
# into the class income<=50 or income>50k. 
X = [30, 10, 45]

# densities of attribute values given a class y0, y1
densities_y0 = stats.norm(means_0, sigmas_0).pdf(X)
print(densities_y0)
densities_y1 = stats.norm(means_1, sigmas_1).pdf(X)
print(densities_y1)

# counts for each class y in the dataset
y_counts = inc_eval_cont.income_bin.value_counts()
print(y_counts)
print(sum(y_counts.values))

# the prior probabilities of y
p_ys = y_counts/sum(y_counts.values)
print(p_ys)
p_y0 = p_ys[0]
print(p_y0)
p_y1 = p_ys[1]
print(p_y1)
 
# Compute the posterior probabilities P(income<=50/X) & P(income>50k/X)
# posterior probability of class y0 given the data x p(y0/x) = p(y0)*p(x/y0)
posterior_y0 = p_y0*np.prod(densities_y0)
print(posterior_y0)
# posterior probability of class y1 given the data x p(y0/x) = p(y0)*p(x/y0)
posterior_y1 = p_y1*np.prod(densities_y1)
print(posterior_y1)

# Print the class with the greater posterior probability as the predicted class. 
if np.argmin([-posterior_y0, -posterior_y1]) == 0:
    print('predict class y0')
else:
    print('predict class_y1')
        
# Preprocess or transform the features in the income_evaluation_cont.csv data using an appropriate scaler in sklearn
# You don’t need to transform the output variable; it should still work fine in a text format 
# Normalize the posterior probabilities
posterior_y1 = p_y1*np.prod(densities_y1)/sum([p_y1*np.prod(densities_y1), p_y0*np.prod(densities_y0)])
print(posterior_y1)
posterior_y0 = p_y0*np.prod(densities_y0)/sum([p_y1*np.prod(densities_y1), p_y0*np.prod(densities_y0)])
print(posterior_y0)
# Check that the posteriors add up to 1
print(posterior_y1 + posterior_y0)

# Randomly split the input & output data into X_train, y_train, X_test and y_test using tools in sklearn 
X_train = inc_eval_cont.iloc[:,0:3]
X_train = np.array(X_train)
y_train = inc_eval_cont[' income'].values
X_test = np.array(X).reshape(1, -1)

# Use an appropriate Naïve Bayes constructor in sklearn to construct & fit a Naïve Bayes model on the training data
# Use the model to compute the accuracy score of the training and test set. 
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print(y_pred)

   age  education_num  hours_per_week  income
0   39             13              40   <=50K
1   50             13              13   <=50K
2   38              9              40   <=50K
3   53              7              40   <=50K
4   28             13              40   <=50K
    age  education_num  hours_per_week  income  income_bin
0    39             13              40   <=50K           0
1    50             13              13   <=50K           0
2    38              9              40   <=50K           0
3    53              7              40   <=50K           0
4    28             13              40   <=50K           0
5    37             14              40   <=50K           0
6    49              5              16   <=50K           0
7    52              9              45    >50K           1
8    31             14              50    >50K           1
9    42             13              40    >50K           1
10   37             10              80    >50K           1
11   30         

In [7]:
# Question 3:
# Implement a Naïve Bayes for text classification to detect fake or true news

# Read in given data
true = pd.read_csv('True.csv')
trueDF = pd.DataFrame(true)
# print(trueDF.head())

# Create a new data frame by selecting the “title” & “text” columns, then add a new column called “news_type” where all the 
# values on this new column are “True” - So, your new data frame should have three columns; “title”, “text” & “news_type” 
t_DF = trueDF[['title', 'text']].copy()
t_DF['news_type'] = 'True'
print(t_DF.head())

# Read in given data
fake = pd.read_csv('Fake.csv')
fakeDF = pd.DataFrame(fake)
# print(fakeDF.head())

# Create a new data frame by selecting the “title” & “text” columns, then add a new column called “news_type” where all the
# values on this new column are “Fake” - So, your new data frame should have three columns; “title”, “text” & “news_type”
f_DF = fakeDF[['title', 'text']].copy()
f_DF['news_type'] = 'False'
print(f_DF.head())

# Merge the data frame in a) & b) so that one of the data frames is stacked vertically on top of the other. 
m_DF = t_DF.append(f_DF)
print(m_DF.head(-1))

# Combine the text in the “title” & “text” columns of the merged data frame into another column called “news”
m_DF['news'] = m_DF['title'] + ' ' + m_DF['text']
print(m_DF.head(-1))

# Drop the “title” & “text” columns so that your final data frame is has only two columns, “news” & “news_type”
final = m_DF[['news_type', 'news']].copy()

# Print the first five rows of your final data frame
print(final.head(5))

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text news_type  
0  WASHINGTON (Reuters) - The head of a conservat...      True  
1  WASHINGTON (Reuters) - Transgender people will...      True  
2  WASHINGTON (Reuters) - The special counsel inv...      True  
3  WASHINGTON (Reuters) - Trump campaign adviser ...      True  
4  SEATTLE/WASHINGTON (Reuters) - President Donal...      True  
                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name

In [8]:
# Preprocess your data by cleaning the textual data in the “news” column & removing the stop words, special characters, 
# punctuations, etc especially at the beginning & end of each word

# initialize stopwords
sw = set(stopwords.words("english"))
list(sw)[0:10] # view the first 10 stopwords

# view puntuations and special characters that need to be removed
print(set(string.punctuation))

# function that cleans text and removes stop words
def clean(text, stopwords):
    # remove tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # split text on whitespace
    text_list = text.split()
    text_words = []
    
    punctuation = set(string.punctuation)
     # keep #tags and @mentions
     ## punctuation.remove("#")
     ## punctuation.remove("@")
    
    for word in text_list:
     # remove punctuation marks at the beginning
     # of each word
        while len(word) > 0 and word[0] in punctuation:
            word = word[1:]

        # remove punctuation marks at the end of each word
        while len(word) > 0 and word[-1] in punctuation:
            word = word[:-1]

        # a rule to eliminate most urls
        if len(word) > 0 and "/" not in word:
            # eliminate stopwords
            if word.lower() not in stopwords:
                # append the word to the text_words list
                text_words.append(word.lower())
        cleaner_text = " ".join(text_words)
    return cleaner_text

# display a few lines of messy news text before cleaning
print(final["news"][501:510])

# Apply the clean() function to the data & pass in the stopword argument
final["news"] = final["news"].apply(clean, stopwords=sw)
final.head() 

{']', '\\', '@', ',', '(', '.', '~', '=', '!', '$', ')', '[', '/', '?', ';', '}', '+', '|', '%', "'", '-', '&', '>', '#', '{', '*', '"', '^', '`', '<', '_', ':'}
501    Tillerson 'offended' by claims of State Depart...
502    Trump to make remarks at White House at 3 p.m....
503    U.S. budget chief Mulvaney says CFPB staff sho...
504    Russian envoy to U.S. to inspect San Francisco...
505    White House to Democratic leaders: 'stop the p...
506    Top Democrats in Congress say won't meet with ...
507    U.S. Senate liberals propose new steps for Pue...
508    Trump: 'I don't see a deal' with Democrats on ...
509    Trump-installed consumer agency head sets hiri...
Name: news, dtype: object


Unnamed: 0,news_type,news
0,True,u.s budget fight looms republicans flip fiscal...
1,True,u.s military accept transgender recruits monda...
2,True,senior u.s republican senator let mr mueller j...
3,True,fbi russia probe helped australian diplomat ti...
4,True,trump wants postal service charge much amazon ...


In [9]:
# display messy news text after cleaning
print(final["news"][501:510])

# check shape of the data
print(final.shape)

# Drop instances where the news text is less than 50 words for training 
final = final[final["news"].str.len() > 50]
print(final.shape)
print(final.head())

# Spit the feature vectors & the output variable into into X_train, y_train, X_test & y_test 
# Let the test set be 30% of the entire data
X_train, X_test, y_train, y_test = train_test_split(final["news"], final["news_type"], test_size= 0.3)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)


# Transform the input text data into feature vectors where the entries of the feature vectors are 
# term-frequency-inverse-document-frequency - Use the TfidfVectorizer() in sklearn
tfidf = TfidfVectorizer(ngram_range= (1,2), stop_words= "english", min_df= 10, max_features= None)
X_train_tfidf = tfidf.fit_transform(X_train)
X_train_tfidf.toarray()

X_train_tfidf.toarray().shape

501    tillerson offended claims state department's h...
502    trump make remarks white house 3 p.m est washi...
503    u.s budget chief mulvaney says cfpb staff disr...
504    russian envoy u.s inspect san francisco consul...
505    white house democratic leaders stop political ...
506    top democrats congress say meet trump planned ...
507    u.s senate liberals propose new steps puerto r...
508    trump see deal democrats keeping government op...
509    trump-installed consumer agency head sets hiri...
Name: news, dtype: object
(44898, 2)
(44828, 2)
  news_type                                               news
0      True  u.s budget fight looms republicans flip fiscal...
1      True  u.s military accept transgender recruits monda...
2      True  senior u.s republican senator let mr mueller j...
3      True  fbi russia probe helped australian diplomat ti...
4      True  trump wants postal service charge much amazon ...
(31379,) (31379,)
(13449,) (13449,)


(31379, 88027)

In [10]:
# Fit an appropriate Naïve Bayes model & compute the training & test accuracy of the model. 
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, y_train)

# make prediction on training set
print(clf.predict(X_train_tfidf))

# compute accuracy on training set
print(clf.score(X_train_tfidf, y_train))

X_test_tfidf = tfidf.transform(X_test)
X_test_tfidf.toarray()

print(clf.score(X_test_tfidf, y_test))

# Q: Is there overfitting?
# A: No, there does not seem to be because the training and test accuracy scores are similar. 

['False' 'True' 'False' ... 'True' 'False' 'False']
0.9577105707638867
0.953305078444494


In [11]:
# Fit a Naïve Bayes using cross validation & print the average cross validation score as well as the standard deviation 
# of the cross-validation scores

tfidf = TfidfVectorizer(ngram_range= (1,2), stop_words= "english", min_df= 10, max_features= None)
X_train_tfidf = tfidf.fit_transform(X_train)
scores = cross_val_score(estimator= MultinomialNB(), X= X_train_tfidf, y= y_train, cv= 5)
print("Average cross validation score: ", scores.mean())
print("Standard deviation of cross validation scores: ", scores.std())

Average cross validation score:  0.9493930308085694
Standard deviation of cross validation scores:  0.0025839205649281645


In [12]:
# Select some hypermeters of your choice & tune using the grid search cross validation. 
# Use some other hyperparameters than those used in class examples
tfidf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 10,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [16]:
# pipe = Pipeline([("tfidf",TfidfVectorizer(stop_words= "english")), ("nb", MultinomialNB())])
# param_grid = [{"tfidf__min_df":[5, 20], "tfidf__ngram_range":[(1, 1), (1, 2), (1, 3), (1, 5), (1, 7)]}]
# grid = GridSearchCV(estimator= pipe , param_grid= param_grid, cv= 5)
# grid.fit(X_train, y_train)

In [15]:
# # find best hyperparameter values
# grid.best_params_
# # training accuracy
# grid.score(X_train, y_train)
# # test accuracy
# grid.score(X_test, y_test)