In [None]:
'''
This code is to understand the working of Naive Bayes classifier with text/document type of data. The text data requires a 
special approach before it can be presented to the predictive model This code implements a concept called Bag of Words. In this
approach data frame rows containing text/document are converted into a list of distinct words that are transformed into a
feature in a new data frame. The frequency/count of these distinct words for a  row of text/document in the original data frame 
will become values in the newly transformed data frame. A new data frame is ready with distinct words as column and frequency
of those words as a row for further steps. This whole process of converting text data to distinct words and get a count of 
these words in a given sentence/document will be implemented by the scikit library.

In this code, we will implement the scikit library, CounterVectorizer.

The details steps are as below.

The text/document data is read using the pandas read_csv function and converted into a data frame. Each row of the data frame 
represents a text/document. The data is split to get input(X) and output(Y) data. The X data is train-test split into training
and validation data.

An object of CounterVectorizer is created.The training data is fit on this object. The fit method will learn the vocabulary
and get distinct list of words. After fit, the data is transformed into an array of the count of the distinct words for each
sentencs/document.The library also has an attribute to get the feature names ie the list of distinct words. The same process is
done for validation data. Validation data is transformed based on training data ie the count of words for every sentence/document
in the validation data is based on distinct words of the training data set. The end output of the transform step is an array of
the count of the word for each row of training/validation data set and feature names(list of distinct words from training data
set). 
The array is then converted into a data frame with columns based on distinct words of the training data set.

For probability, the probability of each unique class of Y data is calculated and updated in a dictionary object.
Thereafter a training model is built, for this conditional probability of training data is calculated with below formula

(For each of the distinct words of the training data the corresponding sum of the count of that word given unique class) + 1
__________________________________________________________________________________________________________________________
                         total count of that distinct word +len(new data frame.columns) 
                         
The Laplace smoothing is used in this formula to avoid 0 probability.

eg The| no sport or game|sport, etc.
This process is done for every unique value of the Y data. The dictionary object is populated as given in example below.
eg The{Thesport:probability,Thenosport:probabilty}
 
For prediction, validation data is iterated through rows, corresponding to each row, the respective column name is matched with
the model probability dictionary ie column | unique class (eg given above).
For each row, for all columns in that row, column|class probability is multiplied and cumulative probability is finally
multiplied with probability of that class and updated in the dictionary (class: total probability). This process is repeated
for all unique values. Thus every row will have as many unique values that many probabilities. All of class:probability are 
populated in a dictionary/.The class which has the highest probability will be predicted for that row. This way a list is
populated with the prediction for validation data.
 
Finally, the accuracy score is calculated with predicted data and actual Y data.


'''

In [1]:


import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer



In [2]:
def read_data():
    
    '''
    This function is to read data using Pandas read_csv function and convert into dataframe.
    
    Return: - 
    data :-Dataframe  holding the data.  
    
    '''
    
    data=pd.read_csv('D:\Dataset\Bag-Word-Data.csv',header=None,names=['sentences','category'])
    
    return data




def data_preprocess(df):
    '''
    This function is to split data into X and Y(input and output data)using the data frame.
    
    The X data is further train-test-Split to get training and testing data
    
    Argument :-
    df   :- Dataframe with text/document data.    
    
    Return :- 
    x_train:-training data
    x_test :- Validation data
    y_train :- Label data of training data
    y_test :- label data of validation data
    
    '''
    
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    #spliting the dataset into training and test.
    xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size=.3, random_state=0)
    
    return xtrain,xtest,ytrain,ytest



def count_vector(datatofit,datatotransform):
    '''
    Create an object of CounterVectorizer.Fit the training data on this object. The fit will learn the vocabulary and get a
    distinct list of words. After the fit, the data is transformed into an array of the count of the distinct words 
    in the sentences/document.The library also has an attribute to get the feature names ie the list of distinct words.
    
    
    Argument :- 
    datatofit       :-training data 
    datatotransform :-testing data
    
    Return :-
    datatotransform  :-tranformed training/validation data,t
    data_dist        :- List of distinct words of training data set.
    
    '''
    
    vectorizer = CountVectorizer()
    vectorizer.fit(datatofit.sentences)
    datatotransform=vectorizer.transform(datatotransform.sentences).toarray()
    data_dist=vectorizer.get_feature_names()
    
    
    
    return datatotransform,data_dist




def create_newdf(transformed_data,train_dist_word):
    
    '''
    Create a new dataframe ,wheer column =training distinct word and row=training/Test transformed data.
    
    Argument :- Transformed data
    
    Return : New dataframe with transformed data.
    
    
    '''
    
    
    newdf=pd.DataFrame(transformed_data,columns=train_dist_word)
    
    return newdf




def class_prob(new_df,y_train):
    
    '''
    The function is to calculate the probability of the unique value of Y data.
    
    Argument :- 
    newdf :- New data frame with distinct words as features
    
    y_train :-Ytrain data
    
    Return :- 
    outdict  :- Dictionary of unique class label:prob
    unq_out  :- List of unique values of Y data
    
    '''
    
    #Find unique value of Output column and rowcount of X_train data 
    unq_out=y_train.unique()
    rowcnt=y_train.shape[0]
    outdict={}
    
     #concatenate new_df and y_train data for looping through column/name pair of input and output colmun
    #so have same index locations
    concatdftrain=pd.concat([new_df,y_train],axis=1)
    
    # Getting probablity of ouput column|unqiue values
    for k in unq_out:
        prob=concatdftrain[concatdftrain[y_train.name]==k].shape[0]/rowcnt
        outdict.update({k:prob})

    return outdict,unq_out




def prob_inp_out(train_dist_word,y_train,new_df,unqout):
    
    '''
    This function is to calculate the probability of distinct words given a unique class of data. The same step is repeated for very
    unique value of the Y data eg distinct_word|class 1,distincet_word|class 2..etc.This way for a given distinct word probabilities
    with all unique classes are stored in a dictionary object.Training model is built.
    
    Argument :-
    train_dist_word  :- distinct words of training data
    y_train          :- Y data 
    new_df           :- new data frame with features as distince words
    unqout           :- List of unique values of Y data.
    
    Return :-
    finalans :- Dictionary holding probabilities of every distinct word with every unique value of Y data.
    
    
    '''
    
    # 
    finalans={}
    inpdict={}

    prob=0
    result=0
    val=''
    w=''
    k=0
 
    #concatenate new_df and y_train data for looping through column/name pair of input and output colmum
    #so have same index locations.
    concatdftrain=pd.concat([new_df,y_train],axis=1)

    for w in train_dist_word:
        w=w.lower()
        newdict={}
        
    
        for val in unqout:
            wordcount=0
        
            wordcount=concatdftrain[concatdftrain[y_train.name]==val][w].sum() +1
        
            prob=wordcount/(concatdftrain[concatdftrain[y_train.name]==val].iloc[:,:-1].sum().sum()+len(new_df.columns))
            
                   
            newdict.update({w+val:prob})
        
                
        finalans.update({w:newdict})
    return finalans





def finalypred(test_new_df,inp_out_prob,unqout,class_dict):
    
    
    '''
    
    The function is created to predict validation data. The validation data is iterated overs rows.For each row, its iterated over
    columns.For every column | unique class, the corresponding probability is obtained from the model dictionary of probabilities.
    This way probabilites of all columns of a given row are multiplied and finally the cumulative column probabilty
    is multiplied with probabilty of that class . 
    Similarly, this is followed for every unique value of Y data. Thus every row will have as many unique values that many probabilties.
    Whichever class has the maximum probability is the predication for that row.
    This way a list of predictions is obtained for validation data.
    
    Argument :-
    test_new_df  :- new validation data frame.
    inp_out      : model dictionary with distinct word | class probabilities.
    unqout   :- list of unique values of Y data.
    class_dict  :- Dictinary of probabilities if every unique value of Y data.
    
    new test df,input-output column probabilty,class unique label,class unique label probabilty
    
    Return :- 
    Ypred  :- Array of predicted values.

    
    '''
    p=0
    d={}
    finalprob={}
    testprob=1
    Ypred=[]
    

# Iterating every row in x_test data
    for index,row in test_new_df.iterrows():
        
  
        # Iterating through unqiue values of Output column
        for testval in unqout:
            testprob=1

            
            #Iterating through unqiue columns of input column.
            for testcol in test_new_df.columns:
                
                
                if row[testcol] >0:
                    d=inp_out_prob[testcol]
                    
                
                    p=d[testcol+testval] # column|class probabilty is obtained from model's dictionary of probabilites 
                                          # eg the|sport or the|nosport
                                       
                    testprob=testprob*p 
                    
                #eg RednY * SportnY *domesticnY
        
        
            testprob=testprob*class_dict[testval]
            
            finalprob.update({testval:testprob})
        
        max_v = max(zip(finalprob.values(), finalprob.keys()))  # find class having maximum probabilty
    
        Ypred.append(max_v[1]) # dictionary to populate prediction of a given row
    return Ypred


    
def accuracyfunc(ypred,y_test):
    
    '''
    The function is to calculate the accuracy score with predicted data and actual labels.
    
    Arguement :- 
    ypred :- Prediction of validation data
    y_test :- Actual class of the data set
    
    Return :- 
    accuracy :- Accuracy score
    
    '''
    
    ytest=y_test.tolist()

    ytestrow=len(ytest)


    cntrow=0
    accuracy=0
    
    for x,y in zip(ytest,ypred):
        
        if x==y:
            cntrow=cntrow+1
  
    accuracy=cntrow*100/ytestrow
    print(accuracy)



In [3]:
#Read data into data frame.
df=read_data()

In [4]:
#Split X data into training and testing data.
x_train,x_test,y_train,y_test = data_preprocess(df)

In [5]:
#Tranform training data sentence to count based value.
x_train_transform,train_dist_word=count_vector(x_train,x_train)


In [6]:
#Tranform training data sentence to count based value.
xtest_transform,train_dist_word=count_vector(x_train,x_test)

In [7]:
#Create new dataframe with distinct words of training data as columns.
new_df=create_newdf(x_train_transform,train_dist_word)

In [8]:
#Create new dataframe with distinct words of test data as columns.
testnewdf=create_newdf(xtest_transform,train_dist_word)

In [9]:
#Calculate probability of unique values of Y data.
class_dict,unqout=class_prob(new_df,y_train)

In [10]:
#Calculate the probability of distinct words | unique class of training data.
inp_out_prob=prob_inp_out(train_dist_word,y_train,new_df,unqout)

In [11]:
#Predict test data.
ypred=finalypred(testnewdf,inp_out_prob,unqout,class_dict)

In [12]:
#Calculate accuracy score of predicted data and actual test Y data.
accuracyfunc(ypred,y_test)

33.333333333333336
