In [None]:
'''
This notebook evaluates the accuracy score of text-based SMS data across multiple classifier algorithms: Naïve Bayes,
Decision Tree, and K-nearest Neighbors. Text data requires special preparation before it can be used for predictive
modeling: the words need to be converted to an integer format. This code implements a concept called Bag of Words. In this
approach, data frame rows containing text/documents are converted into a list of distinct words, each of which becomes a
feature (column) in a new data frame. The frequency/count of each distinct word in a row of text/document of the original data
frame becomes the value in the newly transformed data frame. The result is a new data frame with the distinct words as columns
and one row of word frequencies per original text/document, ready for further steps. This whole process of converting text data
to distinct words and counting these words in a given sentence/document is implemented by the scikit-learn library. The
scikit-learn CountVectorizer class provides an easy method to transform the text data.

The data is read using pandas read_table and converted to a data frame. The data is then split into X (input) and Y (output/class)
data. We implement k-fold cross-validation with 3 folds to split the X and Y data into training and testing data.

The SMS data is passed over the Naive Bayes, K-nearest Neighbors and Decision Tree classifiers to find the classifier
with the highest accuracy score. For each classifier, its object is created with default parameters where
applicable. For K-nearest Neighbors, the data is evaluated over a range of k (number of neighbors) values to find the best k.

This project implements k-fold cross-validation with n_splits=3. The X data is passed over a KFold object to create training
and test data with their respective Y data. At each fold, the training data is passed to a CountVectorizer object, which is fitted on it
and transforms it into the frequency/count of distinct words for every row (sentence/document). Similarly, the test data is transformed
into the frequency/count of distinct words for every row (the distinct words learned from the training data are used to transform
the test data). The transformed training data is passed to the fit method of the respective classifier object to build a model. The transformed
test data is run through this model to predict a class for every row. The predictions of the test data from every fold
are accumulated into a single collection of predictions.

Finally, the accuracy score is calculated with predicted data and actual Y data.

Paragraphs 4 and 5 are repeated for each of the classifiers. At the end, the classifier with the maximum accuracy score
is chosen as the classifier for the SMS data.

'''

In [2]:
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
import numpy as np

from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier


# Show every column when a data frame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)




In [3]:


def read_data(source='https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv'):
    
    '''
    The function is to read the tab-separated SMS data using pandas read_table and convert it into a data frame.
    The file has no header row, so the two columns are named explicitly.
    
    Argument :-
    source :- Path, URL or file-like object of the tab-separated data. Defaults to the hosted sms.tsv file,
              so calling read_data() without arguments behaves as before.
    
    Return :-
    data :- SMS data frame with the columns 'label' and 'msg'.
    '''
    
    # header=None: the first line of the file is data, not column names.
    data=pd.read_table(source,header=None,names=['label','msg'])
    
    return data


def split_data_X_Y(df):

    '''
    Separate a data frame into input (X) and output/class (Y) parts by column position.

    Argument :-
    df :- Data frame whose last column is the input and whose preceding column(s) are the class.

    Return :-
    X :- Last column of df, as a Series.
    Y :- Every column of df except the last one, as a data frame.

    '''

    # The input is taken from the final column only.
    features = df.iloc[:, -1]

    # The column slice keeps the result two-dimensional (a data frame), even for a single column.
    labels = df.iloc[:, :-1]

    return features, labels


def create_object_of_algo(random_state=0):
    
    '''
    The function is to create the classifier algorithm objects.
    
    Argument :-
    random_state :- Seed passed to the Decision Tree so that repeated runs build the same tree and
                    report the same accuracy score. Defaults to 0.
    
    Return :-
        
    multi_nom :- Multinomial Naive Bayes object (default parameters).
    dt        :- Decision Tree classifier object (entropy criterion, seeded with random_state).
    
    
    '''
    
    multi_nom = MultinomialNB()
    
    # Without a fixed seed the tree can differ between runs, which makes the reported score unrepeatable.
    dt=DecisionTreeClassifier(criterion="entropy",random_state=random_state)
    
    return multi_nom,dt


def kfold_split(split_val):

    '''
    Build the KFold splitter used for cross-validation.

    Argument :-
    split_val :- Number of folds, passed to KFold as n_splits.

    Return :-
    KFold object configured with n_splits=split_val.

    '''
    return KFold(n_splits=split_val)
  

def fit_transform_countvector(traindata_fit,testdata_fit):
    
    '''
    Convert training and validation text into word-count data using a CountVectorizer.
    
    A new CountVectorizer object is created and fitted on the training data only, so the vocabulary (the distinct
    list of words) comes solely from the training data. The training data is transformed into the count of those
    distinct words for every row of sentences/documents. Validation data is transformed over the training data
    set's distinct words.
    
    Argument :- 
    traindata_fit :- Training text data.
    testdata_fit :- Validation text data.
    
    Return :- 
    x_train_transform :- Transformed training data.
    x_test_transform :- Transformed validation data.
    
    
    '''

    vectorizer = CountVectorizer()
    
    # fit_transform learns the vocabulary and produces the counts in a single pass over the training text,
    # instead of processing the training text once in fit and again in transform.
    x_train_transform = vectorizer.fit_transform(traindata_fit)
    
    # The validation text is only transformed, never fitted, so it cannot influence the vocabulary.
    x_test_transform = vectorizer.transform(testdata_fit)
    
    return x_train_transform,x_test_transform



def fit_data(xtrain_transform,y_train,obj):

    '''
    Train a classifier object on the transformed training data.

    Argument :-
    xtrain_transform :- Transformed training data.
    y_train :- Class data matching the training rows.
    obj :- Classifier algorithm library object exposing a fit method.

    Return :-
    obj :- The same object that was passed in, after its fit method has been called.
    '''

    # The object is trained in place; whatever fit returns is ignored.
    train = obj.fit
    train(xtrain_transform, y_train)

    return obj




def knn_algorithm(xdata,ydata,cv_obj):

    '''
    Cross-validate a K-nearest Neighbors classifier for several neighbor counts and report the best one.

    A KNeighborsClassifier is created for each n_neighbors value in 3, 5, 7, 9 (other parameters left at
    their defaults) and scored with cross_val. All scores and the best result are printed.

    Argument :-
    xdata :- Input data, passed unchanged to cross_val.
    ydata :- Label/class data, passed unchanged to cross_val.
    cv_obj :- Cross-validation splitter object, passed unchanged to cross_val.

    Return :-
    Accuracy score of the best neighbor count.
    Neighbor count with the highest accuracy score (the larger count wins when scores tie).

    '''
    # Score every candidate neighbor count, in increasing order.
    accur_score = {
        neighbors: cross_val(xdata, ydata, cv_obj, KNeighborsClassifier(n_neighbors=neighbors))
        for neighbors in range(3, 10, 2)
    }

    # Rank by (score, neighbor count) so that ties on the score go to the larger neighbor count.
    top_k = max(accur_score, key=lambda neighbors: (accur_score[neighbors], neighbors))
    top_score = accur_score[top_k]

    print(accur_score)
    print('Accuracy   :  ' ,top_score,'       ' ,'Top Knn with max accuracy      :   ',top_k)

    return top_score,top_k
    
  
    
    
def NB_algorithm(xdata,ydata,cv_obj,clf):

    '''
    Cross-validate the Naive Bayes classifier object and return its accuracy score.
    All the work is delegated to cross_val.

    Argument :-
    xdata :- Input data, passed unchanged to cross_val.
    ydata :- Label/class data, passed unchanged to cross_val.
    cv_obj :- Cross-validation splitter object, passed unchanged to cross_val.
    clf :- Naive Bayes (multinomial) library object.

    Return :-
    Accuracy score returned by cross_val.

    '''

    return cross_val(xdata, ydata, cv_obj, clf)
            
    
    
    

def dt_algorithm(xdata,ydata,cv_obj,dt_obj):

    '''
    Cross-validate the Decision Tree classifier object and return its accuracy score.
    All the work is delegated to cross_val.

    Argument :-
    xdata :- Input data, passed unchanged to cross_val.
    ydata :- Label/class data, passed unchanged to cross_val.
    cv_obj :- Cross-validation splitter object, passed unchanged to cross_val.
    dt_obj :- Decision Tree library object.

    Return :-
    Accuracy score returned by cross_val.

    '''

    return cross_val(xdata, ydata, cv_obj, dt_obj)
    
    


def pred_test(obj,xtest_transform):

    '''
    Predict the class of the validation data with an already trained classifier object.

    Argument :-
    obj :- Library object of the classifier algorithm, exposing a predict method.
    xtest_transform :- Transformed validation data.

    Return :-
    Whatever obj.predict returns for xtest_transform (the predicted classes).

    '''

    return obj.predict(xtest_transform)



def cross_val(xdata,ydata,kf,algo_obj):
    
    '''
    Run k-fold cross-validation of one classifier on the text data and return its accuracy score.
    
    For every fold, the training text is used to fit a CountVectorizer (through fit_transform_countvector) and
    both the training and the validation text are converted into integer word counts.
    The transformed training data is fit on the given classifier algorithm object to build a predictive model.
    The transformed validation data is predicted over this model. The predictions of all folds are put back
    into the original row order, and the accuracy score is calculated from them and the actual class data.

    Argument :- 
    xdata :- Text (input) data; must support positional selection with .iloc.
    ydata :- Label/class data aligned row by row with xdata; must support .iloc.
    kf :- Splitter object whose split(xdata) yields (train_index, test_index) arrays of row positions,
          e.g. a KFold object. Every row is expected to appear in exactly one test fold.
    algo_obj :- Classifier library object. It is fitted again on every fold.
    
    
    Return :-  
    scores :- Accuracy score over the predictions of all folds.
          
    '''
    
    # Row positions and predictions are collected per fold and assembled once at the end,
    # instead of growing a pandas Series one element at a time.
    fold_positions=[]
    fold_predictions=[]
    
    for train_index,test_index in kf.split(xdata):
        
        x_train,x_test=xdata.iloc[train_index],xdata.iloc[test_index]
        y_train=ydata.iloc[train_index]
        
        # Labels held in a one-column data frame are flattened to a Series so that the classifier
        # receives a one-dimensional target rather than a column vector.
        if getattr(y_train,'ndim',1)==2 and y_train.shape[1]==1:
            y_train=y_train.iloc[:,0]
          
        # Fit and transform the training and validation data using CountVectorizer. 
        xtrain_transform,xtest_transform=fit_transform_countvector(x_train,x_test)
        
        # Fit the data on the respective library object. 
        fit_data(xtrain_transform,y_train,algo_obj)
        
        # Predict the validation data of this fold.
        y_pred_class=pred_test(algo_obj,xtest_transform)
        
        fold_positions.append(np.asarray(test_index))
        fold_predictions.append(np.asarray(y_pred_class))
    
    # Put the predictions back into the original row order, whatever order the folds arrived in,
    # so that they line up position by position with ydata.
    all_positions=np.concatenate(fold_positions)
    all_predictions=np.concatenate(fold_predictions)
    y_pred_final=all_predictions[np.argsort(all_positions,kind='stable')]
               
    scores=pred_accuracy(y_pred_final,ydata)
        
    return scores
        


def pred_accuracy(y_pred_final,ydata):

    '''
    Compute the accuracy score of the predicted classes against the actual classes.

    Argument :-
    y_pred_final :- Predicted class data.
    ydata        :- Actual Y class data.

    Return :-
    Accuracy score as returned by metrics.accuracy_score(ydata, y_pred_final).

    '''

    # The actual classes go first and the predictions second.
    return metrics.accuracy_score(ydata, y_pred_final)





In [4]:
# Dictionary mapping each classifier's name to its accuracy score; filled in by the cells below.
best_accur_score={}

In [5]:
# Load the SMS data into a data frame (columns 'label' and 'msg').
df_sms=read_data()

In [6]:
# Split the SMS data into input (X: the last column) and output (Y: the remaining column) data.
xdata,ydata=split_data_X_Y(df_sms)

In [7]:
# Create the k-fold cross-validation object with n_splits=3; it is shared by every classifier run below.
cv_obj=kfold_split(3)

In [8]:
# Create the classifier objects: Multinomial Naive Bayes and Decision Tree.
multi_nom_obj,dt_obj=create_object_of_algo()

In [9]:
# Cross-validate the Decision Tree to get its accuracy score, then record the score in the dictionary.
dt_score=dt_algorithm(xdata,ydata,cv_obj,dt_obj)
best_accur_score.update({'Decision Tree':dt_score})

In [None]:
# Cross-validate K-nearest Neighbors over several neighbor counts and keep the best score.
# The dictionary key includes the winning neighbor count.
k_score,knear=knn_algorithm(xdata,ydata,cv_obj)
best_accur_score.update({'Knearest - '+ str(knear):k_score})

In [None]:
# Cross-validate Naive Bayes to get its accuracy score, then record the score in the dictionary.
nb_score=NB_algorithm(xdata,ydata,cv_obj,multi_nom_obj)
best_accur_score.update({'Naive Bayes':nb_score})

In [13]:
# Get the best algorithm, i.e. the one with the highest accuracy score for the SMS data.
# The (score, name) pairs are compared as tuples, so equal scores are ordered by name.
best_v=max(zip(best_accur_score.values(), best_accur_score.keys()))
print('Best algorithm for SMS data is '  , best_v[1] ,'with accuracy score of ' ,best_v[0])
    

Best algorithm for SMS data is  Naive Bayes with accuracy score of  0.9867193108399138
