In [None]:


'''

This code is to understand the working of Naive Bayes classifier with text/document type of data. The text data requires special 
approach before it can be presented to the predictive model. This code implements a concept called Bag of Word. In this approach
data frame rows containing text/document are converted into a list of distinct words.This distinct words then are 
transformed into a feature in a new data frame. The frequency/count of these distinct words in every row of text/document data
frame becomes values for corresponding rows in the newly transformed data frame. Once a new data frame is ready with distinct
words as column and frequency of those words as row, it is now ready for probability calculation. While calculating probability
Laplace smoothing is implemented to avoid 0 probability. Thereafter validation data is run over the training model to predicate
a label for each row. The accuracy score is calculated at the end.

The details steps are as below.

The text/document data is read using the pandas read_csv function and converted into a data frame. Each row of the data frame 
represents a text/document. The data is split to get input(X) and output(Y) data. The X data is train-test split into training
and validation data.

The rows of the training data frame are split to create a list of words. From this list, a set of distinct words is created. 
A new training data frame is created with distinct words as a feature. Each row in this new data frame will correspond to row
in the original data frame. To populate the row, the original data frame is iterated through rows. Each row is split into words,
an incremental count for that word is populated in the corresponding row, column(word) location.
This way training data is converted into words based feature data frame.


For validation data, a similar data frame is created with training data's distinct words as features. To populate this new data
frame, the validation data is iterated through rows. Each row is split into words, an incremental count for that word is populated
in the corresponding row, column(word) location. This way new data frame for validation data is ready.


For probability, the probability of each unique class of Y data is calculated and updated in a dictionary object.
Thereafter a training model is built, for this conditional probability of training data is calculated. For each of the distinct
words of the training, data set the corresponding sum of the count of matching distinct word given unique class  + 1/total count
of that distinct word +len(new data frame) is calculated for the new data frame.
eg The| no sport or game|sport, etc.
This process is done for every unique value of the class. The dictionary object is populated.
eg The{Thesport:probability,Thenosport:probabilty}
 
For prediction, validation data is iterated through rows, corresponding to each row, the respective column name is matched with
the model's probability dictionary ie column | unique class (eg given above). For each row, for all columns in that row,
column|class probability is multiplied and cumulative probability is finally multiplied with the probability of that class
and updated in the dictionary (class: total probability).
This process is repeated for all unique values. Thus every row will have as many unique values that many probabilities.
The class: probability is populated in a dictionary.
The class which has the highest probability will be predicted for that row. This way a list is
populated with the prediction for validation data.
 
Finally, the accuracy score is calculated with predicted data and actual Y data.



 '''

In [1]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import numpy as np



In [24]:
def read_data():
    
    '''
   This function is to read data using Pandas read_csv function and convert into dataframe.
    
    Return: - Dataframe     
    '''
    
    data=pd.read_csv('D:\Dataset\Bag-Word-Data.csv',header=None,names=['sentences','category'])
    
    return data



def data_preprocess(df):
    '''
    This function is to split data into X and Y(input and output data)using the data frame.
    
    The X data is further train-test-Split to get training and testing data
    
    Argument :-
    df   :- Dataframe with text/document data.    
    
    Return :- 
    x_train:-training data
    x_test :- Validation data
    y_train :- Label data of training data
    y_test :- label data of validation data
    
    '''
    
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    
    #spliting the dataset into training and test set
    xtrain, xtest, ytrain, ytest = train_test_split(x, y,test_size=.3, random_state=0)
    
    return xtrain,xtest,ytrain,ytest




 
def list_of_word(datatoget):
    
    '''
    This function makes a list of sentences for every row of the data frame. Then it splits the sentences to make a list of words.
    From this list of words, it creates a set of distinct words.
    This process is done for both the training and validation data set.
    
    
    
    Argument :-
    datatoget :- Text/document based data frame.
    
    Return :- 
    dist_word : list of distinct words
    
    listword :- List of words.
    
    '''
    listword=[]
    commonword=[]
    dist_word=[]
    
    listword=datatoget['sentences'].values.tolist()
    
    
    #Create a single list of words from the above snetences FOR X_train.Once the list is made,get distinct words from the list

    for i in range(len(listword)):
        words=[]
        words=listword[i].split()
        
        for j in range(len(words)):
               commonword.append(words[j].lower())
        words.clear()

    dist_word=list(set(commonword))

    
    return dist_word,listword




def train_distinct_word_df(train_list_word,x_train,train_dist_word):
    
    '''
    This function is to create a new data frame for training /validation data. A new data frame is created where distinct words
    of training, data will be features. The same features will also be used to create a new data frame for validation data.
    
    The row index will correspond to training/validation data. The list of sentences of training data is iterated. Each sentence is 
    split into words. The words are incrementally counted and populated in the new data frame at the location of the index 
    value of the row and column =word since column name = distinct words.
    
    Argument :-
    train_list_word :- List of sentences of training data.
    x_train         :- Training data set
    train_dist_word :- List of distinct words of training data set
    
    Return :-
    newdf :- new data frame created.
    
    
    
    '''
    
    #Creating a dimension for new dataframe ,rows = number of rows in training data,columns = distinct words on training data.

    zero_data = np.zeros(shape=(x_train.shape[0],len(train_dist_word)))
    
    #Creating data frame where column name = distinct words of the training dataset. Rows are populated with zero values.
    newdf=pd.DataFrame(zero_data,columns=list(train_dist_word))
  
   

    for i in range(len(train_list_word)):
     
        traincnt=[]
        traincnt=train_list_word[i].split()
        
        
        for j in traincnt:
            j=j.lower()
        
            newdf.at[i,j]=newdf.at[i,j]+1
               
        traincnt.clear()
    return newdf




    
def class_prob(new_df,y_train):
    
    '''
    The function is to calculate the probability of the unique value of Y data.
    
    Argument :- 
    newdf :- New data frame with distinct words as features
    
    y_train :-Ytrain data
    
    Return :- 
    outdict  :- Dictionary of unique class label:prob
    unq_out  :- List of unique values of Y data
        
    '''
    
    #Find unique value of Output column and rowcount of X_train data 
    unq_out=y_train.unique()
    rowcnt=y_train.shape[0]
    outdict={}
    
     #concatenate new_df and y_train data for looping through column/name pair of input and output colmun,
     #so have same index locations
    concatdftrain=pd.concat([new_df,y_train],axis=1)
    
    # Getting probablity of ouput column for unqiue values
    for k in unq_out:
        prob=concatdftrain[concatdftrain[y_train.name]==k].shape[0]/rowcnt
        outdict.update({k:prob})

    return outdict,unq_out




    
def prob_inp_out(train_dist_word,y_train,new_df,unqout):
    
    '''
    
  This function is to calculate the probability of distinct words given a unique class of data. The same step is repeated for very
    unique value of the Y data eg distinct_word|class 1,distincet_word|class 2..etc.This way for a given distinct word probabilities
    with all unique classes are stored in a dictionary object.Training model is built.
    
    Argument :-
    train_dist_word  :- distinct words of training data
    y_train          :- Y data 
    new_df           :- new data frame with features as distinct words
    unqout           :- List of unique values of Y data.
    
    Return :-
    finalans :- Dictionary holding probabilities of every distinct word with every unique value of Y data.
    
    '''
    prob=0
    result=0
    val=''
    w=''
    k=0
    finalans={}
    #concatenate new_df and y_train data for looping through column/name pair of input and output colmun ,
    #so as to have same index locations.
  
    concatdftrain=pd.concat([new_df,y_train],axis=1)

    for w in train_dist_word:
        w=w.lower()
        newdict={}
        
    
        for val in unqout:
            wordcount=0
        
            wordcount=concatdftrain[concatdftrain[y_train.name]==val][w].sum() +1
        
            prob=wordcount/(concatdftrain[concatdftrain[y_train.name]==val].iloc[:,:-1].sum().sum()+len(new_df.columns))
            
                   
            newdict.update({w+val:prob}) #thenosport:probabilty
        
                
        finalans.update({w:newdict})    #the:{thenosport:probabilty,thesport:probabilty}
    return finalans




def test_dist_word_df(x_test,train_dist_word,test_list_word):
    
    '''
    The function is to fit the validation data over distinct words of training data ie a new data frame is created with
    distinct words of training data as features. The list of validation sentences is iterated.Each sentence is split into 
    words.If the words match the distinct words of training data then the row of the new validation data frame is populated
    with an incremental count of the word. The word is added at the location of row index and column name=word.
    
    
    
    Arguement :- 
    x_test  :- validation dataset
    train_dist_word :- list of distinct words of training data set.
    test_list_word :- List of sentences of validation data.
    
    Return :- 
    testnewdf :- new validation data frame.
    
    '''
    
    #Creating a matrix for new dataframe for X-Test data row=size of validation data ,columns = list of distinct words of training dataset
    testzero_data = np.zeros(shape=(x_test.shape[0],len(train_dist_word)))
    
    #Creating dataframe with size=nos of sentence of X-data,columns=distinct words of X-train sentences
    testnewdf=pd.DataFrame(testzero_data,columns=list(train_dist_word))
       
    

    for tword in range(len(test_list_word)):
    
   
        testcnt=[]
        testcnt=test_list_word[tword].split()
    
        
        for tcnt in testcnt:
            tcnt=tcnt.lower()
        
            for trainw in train_dist_word:
            
                if tcnt==trainw:
        
                    testnewdf.at[tword,trainw]=testnewdf.at[tword,trainw]+1
                else:
                    pass
           
    
                
        testcnt.clear()
    
    return testnewdf




def finalypred(test_new_df,inp_out_prob,unqout,class_dict):
    
    
    '''
    The function is created to predict validation data. The validation data is iterated overs rows.For each row, its iterated over
    columns.For every column | unique class, the corresponding probability is obtained from the model dictionary of probabilities.
    This way probabilites of all columns of a given row are multiplied and finally the cumulative column probabilty
    is multiplied with probabilty of that class . 
    Similarly, this is followed for every unique value of Y data. Thus every row will have as many unique values that many probabilties.
    Whichever class has the maximum probability is the predication for that row.
    This way a list of predictions is obtained for validation data.
    
    Arguement :-
    test_new_df  :- new validation data frame.
    inp_out      : model dictionary with distinct word | class probabilities.
    unqout   :- list of unique values of Y data.
    class_dict  :- Dictinary of probabilities if every unique value of Y data.
    
    new test df,input-output column probabilty,class unique label,class unique label probabilty
    
    Return :- 
    Ypred  :- Array of predicted values.

    
    '''
    p=0
    d={}
    finalprob={}
    testprob=1
    Ypred=[]
    

# Iterating every row in x_test data
    for index,row in test_new_df.iterrows():
        
  
        # Iterating through unqiue values of Output column
        for testval in unqout:
            testprob=1

            
            #Iterating through columns of validation data.
            for testcol in test_new_df.columns:
                
                
                if row[testcol] >0:
                    d=inp_out_prob[testcol]
                    
                
                    p=d[testcol+testval]  # column|class probabilty is obtained from model's dictionary of probabilites 
                                          # eg the|sport or the|nosport
                    
                    
                testprob=testprob*p 
                    
                #eg RednY * SportnY *domesticnY
        
        
            testprob=testprob*class_dict[testval] #Multiple cumulative column|class probability with that class probability
            
            finalprob.update({testval:testprob})
        
        max_v = max(zip(finalprob.values(), finalprob.keys()))  # find class having maximum probabilty
    
        Ypred.append(max_v[1]) # dictionary to populate prediction of a given row
    return Ypred




def accuracyfunc(ypred,y_test):
    
    '''
    The function is to calculate the accuracy score with predicted data and actual labels.
    
    Arguement :- 
    ypred :- Prediction of validation data
    y_test :- Actual class of the data set
    
    Return :- 
    accuracy :- Accuracy score
    
    '''
    
    ytest=y_test.tolist()

    ytestrow=len(ytest)


    cntrow=0
    accuracy=0
    
    for x,y in zip(ytest,ypred):
        
        if x==y:
            cntrow=cntrow+1
  
    accuracy=cntrow*100/ytestrow
    print(accuracy)



In [25]:
#Read data in pandas data frame
df=read_data()

In [26]:
#train-test split X and Y data.
x_train,x_test,y_train,y_test = data_preprocess(df)

In [27]:
#Create list of distinct words and list of words of training data set
train_dist_word,train_list_word=list_of_word(x_train)

In [28]:
#Create list of words of test data.
test_dist_word,test_list_word=list_of_word(x_test)

In [29]:
#Create new data frame with distinct words of training data as features and incremental count of distinct words as row value.
new_df=train_distinct_word_df(train_list_word,x_train,train_dist_word)

In [30]:
#Create probability of every unique class of Y data.
class_dict,unqout=class_prob(new_df,y_train)

In [31]:
#Calculate prabability of new training data set
inp_out_prob=prob_inp_out(train_dist_word,y_train,new_df,unqout)

In [32]:
#Create new test dataframe with training distinct words as features and incremental count of test data words as row value.
test_new_df=test_dist_word_df(x_test,train_dist_word,test_list_word)

In [33]:
#Predict new test data over training model
ypred=finalypred(test_new_df,inp_out_prob,unqout,class_dict)

In [34]:
#Find accuracy score
accuracyfunc(ypred,y_test)

33.333333333333336
