In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
df=pd.read_csv('C:\\Users\\Siddharth\\Documents\\Study Material\\Informs\\Owner and grade.csv',encoding="latin-1")

In [103]:
df.head()

Unnamed: 0,OWNER NAME,FACILITY ZIP,FACILITY ADDRESS,GRADE
0,LUCY'S DRIVE THRU INC,90230,11204 WASHINGTON PL,A
1,PAUL V ANAND,91405,14102 SHERMAN WAY,A
2,"CHARR, INC",90220,15228 S AVALON BLVD,A
3,"MARISCAL, ESTELA",90255,2871 E GAGE AVE,A
4,"TAWA SUPERMARKET, INC",91007,1300 S GOLDEN WEST AVE,A


In [11]:
X=df[['OWNER NAME']]
y=df[['GRADE']]

In [13]:
#Splitting train and test datasets 

from sklearn.model_selection import train_test_split
train_data, test_data, train_labels, test_labels = train_test_split(X, y, random_state = 100)

print('Shape of train {}, shape of test {}'.format(X_train.shape, X_test.shape))

Shape of train (32220, 1), shape of test (10740, 1)


In [15]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re

In [16]:
def preprocess_string(str_arg):
    
    """"
        Parameters:
        ----------
        str_arg: example string to be preprocessed
        
        What the function does?
        -----------------------
        Preprocess the string argument - str_arg - such that :
        1. everything apart from letters is excluded
        2. multiple spaces are replaced by single space
        3. str_arg is converted to lower case 
        
        Example:
        --------
        Input :  Menu is absolutely perfect,loved it!
        Output:  ['menu', 'is', 'absolutely', 'perfect', 'loved', 'it']
        
        Returns:
        ---------
        Preprocessed string 
        
    """
    
    cleaned_str=re.sub('[^a-z\s]+',' ',str_arg,flags=re.IGNORECASE) #every char except alphabets is replaced
    cleaned_str=re.sub('(\s+)',' ',cleaned_str) #multiple spaces are replaced by single space
    cleaned_str=cleaned_str.lower() #converting the cleaned string to lower case
    
    return cleaned_str # returning the preprocessed string 

## 1. Simple Naive Bayes

In [17]:
class NaiveBayes:
    
    def __init__(self,unique_classes):
        
        self.classes=unique_classes # Constructor is sinply passed with unique number of classes of the training set
        

    def addToBow(self,example,dict_index):
        
        '''
            Parameters:
            1. example 
            2. dict_index - implies to which BoW category this example belongs to
            What the function does?
            -----------------------
            It simply splits the example on the basis of space as a tokenizer and adds every tokenized word to
            its corresponding dictionary/BoW
            Returns:
            ---------
            Nothing
        
       '''
        
        if isinstance(example,np.ndarray): example=example[0]
     
        for token_word in example.split(): #for every word in preprocessed example
          
            self.bow_dicts[dict_index][token_word]+=1 #increment in its count
            
    def train(self,dataset,labels):
        
        '''
            Parameters:
            1. dataset - shape = (m X d)
            2. labels - shape = (m,)
            What the function does?
            -----------------------
            This is the training function which will train the Naive Bayes Model i.e compute a BoW for each
            category/class. 
            Returns:
            ---------
            Nothing
        
        '''
    
        self.examples=dataset
        self.labels=labels
        self.bow_dicts=np.array([defaultdict(lambda:0) for index in range(self.classes.shape[0])])
        
        #only convert to numpy arrays if initially not passed as numpy arrays - else its a useless recomputation
        
        if not isinstance(self.examples,np.ndarray): self.examples=np.array(self.examples)
        if not isinstance(self.labels,np.ndarray): self.labels=np.array(self.labels)
            
        #constructing BoW for each category
        for cat_index,cat in enumerate(self.classes):
          
            all_cat_examples=self.examples[self.labels==cat] #filter all examples of category == cat
            
            #get examples preprocessed
            
            cleaned_examples=[preprocess_string(cat_example) for cat_example in all_cat_examples]
            
            cleaned_examples=pd.DataFrame(data=cleaned_examples)
            
            #now costruct BoW of this particular category
            np.apply_along_axis(self.addToBow,1,cleaned_examples,cat_index)
            
                
        ###################################################################################################
        
        '''
            Although we are done with the training of Naive Bayes Model BUT!!!!!!
            ------------------------------------------------------------------------------------
            Remember The Test Time Forumla ? : {for each word w [ count(w|c)+1 ] / [ count(c) + |V| + 1 ] } * p(c)
            ------------------------------------------------------------------------------------
            
            We are done with constructing of BoW for each category. But we need to precompute a few 
            other calculations at training time too:
            1. prior probability of each class - p(c)
            2. vocabulary |V| 
            3. denominator value of each class - [ count(c) + |V| + 1 ] 
            
            Reason for doing this precomputing calculations stuff ???
            ---------------------
            We can do all these 3 calculations at test time too BUT doing so means to re-compute these 
            again and again every time the test function will be called - this would significantly
            increase the computation time especially when we have a lot of test examples to classify!!!).  
            And moreover, it doensot make sense to repeatedly compute the same thing - 
            why do extra computations ???
            So we will precompute all of them & use them during test time to speed up predictions.
            
        '''
        
        ###################################################################################################
      
        prob_classes=np.empty(self.classes.shape[0])
        all_words=[]
        cat_word_counts=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
           
            #Calculating prior probability p(c) for each class
            prob_classes[cat_index]=np.sum(self.labels==cat)/float(self.labels.shape[0]) 
            
            #Calculating total counts of all the words of each class 
            count=list(self.bow_dicts[cat_index].values())
            cat_word_counts[cat_index]=np.sum(np.array(list(self.bow_dicts[cat_index].values())))+1 # |v| is remaining to be added
            
            #get all words of this category                                
            all_words+=self.bow_dicts[cat_index].keys()
                                                     
        
        #combine all words of every category & make them unique to get vocabulary -V- of entire training set
        
        self.vocab=np.unique(np.array(all_words))
        self.vocab_length=self.vocab.shape[0]
                                  
        #computing denominator value                                      
        denoms=np.array([cat_word_counts[cat_index]+self.vocab_length+1 for cat_index,cat in enumerate(self.classes)])                                                                          
      
        '''
            Now that we have everything precomputed as well, its better to organize everything in a tuple 
            rather than to have a separate list for every thing.
            
            Every element of self.cats_info has a tuple of values
            Each tuple has a dict at index 0, prior probability at index 1, denominator value at index 2
        '''
        
        self.cats_info=[(self.bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]) for cat_index,cat in enumerate(self.classes)]                               
        self.cats_info=np.array(self.cats_info)                                 
                                              
                                              
    def getExampleProb(self,test_example):                                
        
        '''
            Parameters:
            -----------
            1. a single test example 
            What the function does?
            -----------------------
            Function that estimates posterior probability of the given test example
            Returns:
            ---------
            probability of test example in ALL CLASSES
        '''                                      
                                              
        likelihood_prob=np.zeros(self.classes.shape[0]) #to store probability w.r.t each class
        
        #finding probability w.r.t each class of the given test example
        for cat_index,cat in enumerate(self.classes): 
                             
            for test_token in test_example.split(): #split the test example and get p of each test word
                
                ####################################################################################
                                              
                #This loop computes : for each word w [ count(w|c)+1 ] / [ count(c) + |V| + 1 ]                               
                                              
                ####################################################################################                              
                
                #get total count of this test token from it's respective training dict to get numerator value                           
                test_token_counts=self.cats_info[cat_index][0].get(test_token,0)+1
                
                #now get likelihood of this test_token word                              
                test_token_prob=test_token_counts/float(self.cats_info[cat_index][2])                              
                
                #remember why taking log? To prevent underflow!
                likelihood_prob[cat_index]+=np.log(test_token_prob)
                                              
        # we have likelihood estimate of the given example against every class but we need posterior probility
        post_prob=np.empty(self.classes.shape[0])
        for cat_index,cat in enumerate(self.classes):
            post_prob[cat_index]=likelihood_prob[cat_index]+np.log(self.cats_info[cat_index][1])                                  
      
        return post_prob
    
   
    def test(self,test_set):
      
        '''
            Parameters:
            -----------
            1. A complete test set of shape (m,)
            
            What the function does?
            -----------------------
            Determines probability of each test example against all classes and predicts the label
            against which the class probability is maximum
            Returns:
            ---------
            Predictions of test examples - A single prediction against every test example
        '''       
       
        predictions=[] #to store prediction of each test example
        for example in test_set: 
                                              
            #preprocess the test example the same way we did for training set exampels                                  
            cleaned_example=preprocess_string(example) 
             
            #simply get the posterior probability of every example                                  
            post_prob=self.getExampleProb(cleaned_example) #get prob of this example for both classes
            
            #simply pick the max value and map against self.classes!
            predictions.append(self.classes[np.argmax(post_prob)])
                
        return np.array(predictions) 

In [18]:
nb=NaiveBayes(np.unique(train_labels)) #instantiate a NB class object
print ("---------------- Training In Progress --------------------")
 
nb.train(train_data,train_labels) #start tarining by calling the train function
print ('----------------- Training Completed ---------------------')

---------------- Training In Progress --------------------
----------------- Training Completed ---------------------


In [19]:
pclasses=nb.test(test_data) #get predcitions for test set

#check how many predcitions actually match original test labels
test_acc=np.sum(pclasses==test_labels)/float(test_labels.shape[0]) 

print ("Test Set Examples: ",test_labels.shape[0]) # Outputs : Test Set Examples:  1502
print ("Test Set Accuracy: ",test_acc*100,"%") # Outputs : Test Set Accuracy:  93.8748335553 %

Test Set Examples:  10740
Test Set Accuracy:  GRADE    82.886406
dtype: float64 %


## 2. Multinomial Naive Bayes (Only Owner Name Variable)

In [21]:
df['GRADE']=df['GRADE'].map({'A':1,'B':2,'C':3,'D':4})

In [22]:
X=df['OWNER NAME']
Y=df['GRADE']

In [23]:
cv=CountVectorizer()
X=cv.fit_transform(X)

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [26]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.83      0.94      0.88     11796
          2       0.04      0.01      0.02      2109
          3       0.00      0.00      0.00       252
          4       0.00      0.00      0.00        20

avg / total       0.69      0.79      0.74     14177



  'precision', 'predicted', average, warn_for)


## 3. Running it on a Smaller datasets (Under Sampling)
As the data is very imbalanced we decided to run the models on undersampled dataset

In [76]:
df['GRADE'].unique()

array([1, 2, 3], dtype=int64)

In [59]:
df['GRADE'].unique()
df['GRADE']=df['GRADE'].map({'A':1,'B':2,'C':3})

In [78]:
df.head()

Unnamed: 0,OWNER NAME,GRADE
0,LUCY'S DRIVE THRU INC,1
1,PAUL V ANAND,1
2,"CHARR, INC",1
3,"MARISCAL, ESTELA",1
4,"TAWA SUPERMARKET, INC",1


In [80]:
X=df['OWNER NAME']
Y=df['GRADE']

In [81]:
grade1_index=Y[Y.values==1].index
grade2_index=Y[Y.values==2].index
grade3_index=Y[Y.values==3].index

highest=grade1_index
higher=grade2_index
lower=grade3_index

In [82]:
# remember higher is a list of indexes, either of 0 or 1's in the response variable in training set
higher=np.random.choice(higher, size=2*len(lower))
highest=np.random.choice(highest, size=2*len(lower))

lower=np.asarray(lower)

new_indexes=np.concatenate((lower,higher,highest))

X=X.loc[new_indexes,]
Y=Y.loc[new_indexes]

In [85]:
cv=CountVectorizer()
X=cv.fit_transform(X)
X

<3965x4570 sparse matrix of type '<class 'numpy.int64'>'
	with 11561 stored elements in Compressed Sparse Row format>

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [87]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.44      0.57      0.50       497
          2       0.45      0.51      0.48       527
          3       0.17      0.04      0.07       285

avg / total       0.39      0.43      0.40      1309



In [98]:
X1=df['OWNER NAME']
##cv=CountVectorizer()
##X1=cv.fit_transform(X1)
X1
Y1=df['GRADE']

In [None]:
y_pred = clf.predict(X1)
print(classification_report(Y1, y_pred))

## 4. Adding Zip and Address to the previous model

In [297]:
df1=pd.read_csv('C:\\Users\\Siddharth\\Documents\\Study Material\\Informs\\Zip, Address.csv',encoding="latin-1")

In [298]:
df1.head()

Unnamed: 0,FACILITY ADDRESS,FACILITY ZIP,GRADE
0,11204 WASHINGTON PL,90230,A
1,14102 SHERMAN WAY,91405,A
2,15228 S AVALON BLVD,90220,A
3,2871 E GAGE AVE,90255,A
4,1300 S GOLDEN WEST AVE,91007,A


In [299]:
df1['GRADE'].unique()
df1['GRADE']=df1['GRADE'].map({'A':1,'B':2,'C':3})

In [300]:
df1.head()

Unnamed: 0,FACILITY ADDRESS,FACILITY ZIP,GRADE
0,11204 WASHINGTON PL,90230,1.0
1,14102 SHERMAN WAY,91405,1.0
2,15228 S AVALON BLVD,90220,1.0
3,2871 E GAGE AVE,90255,1.0
4,1300 S GOLDEN WEST AVE,91007,1.0


In [301]:
df1=df1.dropna()
df1.isnull().sum()

FACILITY ADDRESS    0
FACILITY ZIP        0
GRADE               0
dtype: int64

In [302]:
df1['FACILITY ZIP'] = df1['FACILITY ZIP'].astype(str)

In [303]:
df1.dtypes

FACILITY ADDRESS     object
FACILITY ZIP         object
GRADE               float64
dtype: object

In [314]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=df1['FACILITY ADDRESS']
Y=df1['GRADE']

In [313]:
X=pd.DataFrame(X)

In [306]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=40)

In [307]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        1.0       0.95      1.00      0.97     64039
        2.0       0.18      0.01      0.03      3270
        3.0       0.00      0.00      0.00       306

avg / total       0.91      0.94      0.92     67615



  'precision', 'predicted', average, warn_for)


In [315]:
grade1_index=Y[Y.values==1].index
grade2_index=Y[Y.values==2].index
grade3_index=Y[Y.values==3].index

highest=grade1_index
higher=grade2_index
lower=grade3_index

In [316]:
# remember higher is a list of indexes, either of 0 or 1's in the response variable in training set
higher=np.random.choice(higher, size=2*len(lower))
highest=np.random.choice(highest, size=2*len(lower))

lower=np.asarray(lower)

new_indexes=np.concatenate((lower,higher,highest))

X=X.loc[new_indexes,]
Y=Y.loc[new_indexes]

In [317]:
cv=CountVectorizer()
X=cv.fit_transform(X)
X

<4555x3601 sparse matrix of type '<class 'numpy.int64'>'
	with 15495 stored elements in Compressed Sparse Row format>

In [318]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=40)

In [319]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

        1.0       0.50      0.55      0.53       603
        2.0       0.47      0.60      0.53       607
        3.0       0.43      0.11      0.17       294

avg / total       0.48      0.49      0.46      1504



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

## 5. Trying different Model

In [362]:
df1=pd.read_csv('C:\\Users\\Siddharth\\Documents\\Study Material\\Informs\\Zip, Address.csv',encoding="latin-1")

In [363]:
df1.head()

Unnamed: 0,OWNER NAME,FACILITY ADDRESS,FACILITY ZIP,GRADE
0,LUCY'S DRIVE THRU INC,11204 WASHINGTON PL,90230,A
1,PAUL V ANAND,14102 SHERMAN WAY,91405,A
2,"CHARR, INC",15228 S AVALON BLVD,90220,A
3,"MARISCAL, ESTELA",2871 E GAGE AVE,90255,A
4,"TAWA SUPERMARKET, INC",1300 S GOLDEN WEST AVE,91007,A


In [364]:
df1=df1.dropna()
df1.isnull().sum()

OWNER NAME          0
FACILITY ADDRESS    0
FACILITY ZIP        0
GRADE               0
dtype: int64

In [330]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=df1[['FACILITY ADDRESS','FACILITY ZIP']]
Y=df1[['GRADE']]

In [None]:
X=pd.DataFrame(X)

In [272]:
grade1_index=Y[Y.values=='A'].index
grade2_index=Y[Y.values=='B'].index
grade3_index=Y[Y.values=='C'].index

highest=grade1_index
higher=grade2_index
lower=grade3_index

In [273]:
# remember higher is a list of indexes, either of 0 or 1's in the response variable in training set
higher=np.random.choice(higher, size=2*len(lower))
highest=np.random.choice(highest, size=2*len(lower))

lower=np.asarray(lower)

new_indexes=np.concatenate((lower,higher,highest))

X=X.loc[new_indexes,]
Y=Y.loc[new_indexes]

In [274]:
X.head()

Unnamed: 0,FACILITY ADDRESS
20,10905 MAGNOLIA BLVD
384,15321 ROSCOE BLVD
607,3803 W BURBANK BLVD
1258,1695 S AZUSA AVE
1565,3225 E PACIFIC COAST HWY #A


In [366]:
df1['FACILITY ZIP'] = df1['FACILITY ZIP'].astype(str)

In [367]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=df1['FACILITY ADDRESS']
Y=df1['GRADE']

In [368]:
cv=CountVectorizer()
X=cv.fit_transform(X)
X

<204892x15382 sparse matrix of type '<class 'numpy.int64'>'
	with 692897 stored elements in Compressed Sparse Row format>

In [369]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X1=df1['FACILITY ZIP']


In [370]:
cv=CountVectorizer()
X1=cv.fit_transform(X1)
X1

<204892x396 sparse matrix of type '<class 'numpy.int64'>'
	with 204892 stored elements in Compressed Sparse Row format>

In [399]:
from scipy.sparse import hstack
Combined=hstack((X, X1))

In [400]:
X_train, X_test, y_train, y_test = train_test_split(Combined, Y, test_size=0.33, random_state=40)

In [401]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          A       0.95      1.00      0.97     64039
          B       0.25      0.03      0.06      3270
          C       0.00      0.00      0.00       306

avg / total       0.91      0.94      0.92     67615



  'precision', 'predicted', average, warn_for)


## 6. Combining All three Variables

In [614]:
df1=pd.read_csv('C:\\Users\\Siddharth\\Documents\\Study Material\\Informs\\Zip, Address.csv',encoding="latin-1")

In [615]:
df1.head()

Unnamed: 0,OWNER NAME,FACILITY NAME,FACILITY ADDRESS,FACILITY ZIP,GRADE
0,LUCY'S DRIVE THRU INC,5 DE MAYO,11204 WASHINGTON PL,90650,A
1,PAUL V ANAND,7-ELEVEN STORE 18341D,14102 SHERMAN WAY,91744,A
2,"CHARR, INC",7-ELEVEN STORE 37021A,15228 S AVALON BLVD,90028,A
3,"MARISCAL, ESTELA",98 CENT DISCOUT STORE & MKT,2871 E GAGE AVE,90046,A
4,"TAWA SUPERMARKET, INC",99 RANCH MARKET #7,1300 S GOLDEN WEST AVE,91605,A


In [616]:
df1=df1.dropna()
df1.isnull().sum()

OWNER NAME          0
FACILITY NAME       0
FACILITY ADDRESS    0
FACILITY ZIP        0
GRADE               0
dtype: int64

In [617]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=df1[['FACILITY ADDRESS','FACILITY ZIP']]
Y=df1[['GRADE']]

In [618]:
df1['FACILITY ZIP'] = df1['FACILITY ZIP'].astype(str)

In [619]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=df1['FACILITY ADDRESS']
Y=df1['GRADE']

In [620]:
cv=CountVectorizer()
X=cv.fit_transform(X)
X

<204892x15382 sparse matrix of type '<class 'numpy.int64'>'
	with 692897 stored elements in Compressed Sparse Row format>

In [621]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X1=df1['FACILITY ZIP']

In [622]:
cv=CountVectorizer()
X1=cv.fit_transform(X1)
X1

<204892x2515 sparse matrix of type '<class 'numpy.int64'>'
	with 220284 stored elements in Compressed Sparse Row format>

In [392]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X2=df1['OWNER NAME']


In [393]:
cv=CountVectorizer()
X2=cv.fit_transform(X2)
X2

<204892x24677 sparse matrix of type '<class 'numpy.int64'>'
	with 611589 stored elements in Compressed Sparse Row format>

In [395]:
from scipy.sparse import hstack
Comb=hstack((X2, X))

In [396]:
Combined=hstack((Comb, X1))

In [397]:
X_train, X_test, y_train, y_test = train_test_split(Combined, Y, test_size=0.33, random_state=40)

In [398]:
#Naive Bayes Classifier
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          A       0.95      0.98      0.97     64039
          B       0.20      0.10      0.13      3270
          C       0.00      0.00      0.00       306

avg / total       0.91      0.93      0.92     67615



# 7. Running it on Smaller Dataset and all the 3 columns

In [3]:
df1=pd.read_csv('C:\\Users\\Siddharth\\Documents\\Study Material\\Informs\\Zip, Address.csv',encoding="latin-1")

In [4]:
df1['GRADE'].unique()

array(['A', 'B', 'C', nan], dtype=object)

In [5]:
df1=df1.dropna()
df1.isnull().sum()

OWNER NAME          0
FACILITY NAME       0
FACILITY ADDRESS    0
FACILITY ZIP        0
GRADE               0
dtype: int64

In [6]:
df1['GRADE']=df1['GRADE'].map({'A':1,'B':2,'C':3})

In [7]:
X=df1['OWNER NAME']
X1=df1['FACILITY ADDRESS']
X2=df1['FACILITY ZIP']
X3=df1['FACILITY NAME']
Y=df1['GRADE']

In [8]:
df1.head()

Unnamed: 0,OWNER NAME,FACILITY NAME,FACILITY ADDRESS,FACILITY ZIP,GRADE
0,LUCY'S DRIVE THRU INC,5 DE MAYO,11204 WASHINGTON PL,90650,1
1,PAUL V ANAND,7-ELEVEN STORE 18341D,14102 SHERMAN WAY,91744,1
2,"CHARR, INC",7-ELEVEN STORE 37021A,15228 S AVALON BLVD,90028,1
3,"MARISCAL, ESTELA",98 CENT DISCOUT STORE & MKT,2871 E GAGE AVE,90046,1
4,"TAWA SUPERMARKET, INC",99 RANCH MARKET #7,1300 S GOLDEN WEST AVE,91605,1


In [9]:
df1['GRADE'].unique()

array([1, 2, 3], dtype=int64)

In [10]:
grade1_index=Y[Y.values==1].index
grade2_index=Y[Y.values==2].index
grade3_index=Y[Y.values==3].index

highest=grade1_index
higher=grade2_index
lower=grade3_index

In [11]:
higher

Int64Index([     5,     21,     22,     37,     52,     59,     69,     96,
                97,    132,
            ...
            204513, 204522, 204534, 204568, 204632, 204679, 204806, 204845,
            204925, 204927],
           dtype='int64', length=9729)

In [12]:
# remember higher is a list of indexes, either of 0 or 1's in the response variable in training set
higher=np.random.choice(higher, size=5*len(lower))
highest=np.random.choice(highest, size=8*len(lower))

lower=np.asarray(lower)

new_indexes=np.concatenate((lower,higher,highest))

X=X.loc[new_indexes,]
X1=X1.loc[new_indexes,]
X2=X2.loc[new_indexes,]
X3=X3.loc[new_indexes,]
Y=Y.loc[new_indexes]

In [13]:
DataFrame123=pd.concat([X,X1,X2,X3,Y],axis=1)

In [14]:
DataFrame123.head()

Unnamed: 0,OWNER NAME,FACILITY ADDRESS,FACILITY ZIP,FACILITY NAME,GRADE
20,"VOSKANIAN, EDIK",10905 MAGNOLIA BLVD,90046,AMSTERDAM CAFE,3
384,"LEE, LEE CHAN",15321 ROSCOE BLVD,90712,USA DONUTS & CROISSANTS,3
607,"BLANCHETEAU, OLGA PATRICIA",3803 W BURBANK BLVD,90401,MI LATIN KITCHEN,3
1258,DEERBROOK ENTERPRISE CORP,1695 S AZUSA AVE,90066,SHANE HAILANDER PALACE,3
1565,CHANTIDA NHEUK,3225 E PACIFIC COAST HWY #A,91601,MANNA DONUTS,3


In [15]:
DataFrame123['GRADE'].unique()

array([3, 2, 1], dtype=int64)

In [16]:
DataFrame123['FACILITY ZIP'] = DataFrame123['FACILITY ZIP'].astype(str)

In [17]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X=DataFrame123['FACILITY ADDRESS']
Y=DataFrame123['GRADE']

In [18]:
X.tail()

148633    6800 RESEDA BLVD STE B
84015          4356 BEVERLY BLVD
49132           3161 N GAREY AVE
199248       2600 W VICTORY BLVD
39147      247 AVENIDA DEL NORTE
Name: FACILITY ADDRESS, dtype: object

In [19]:
cv=CountVectorizer()
X=cv.fit_transform(X)
X

<12754x6730 sparse matrix of type '<class 'numpy.int64'>'
	with 43178 stored elements in Compressed Sparse Row format>

In [24]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X1=DataFrame123['FACILITY ZIP']

In [25]:
cv=CountVectorizer()
X1=cv.fit_transform(X1)
X1

<12754x946 sparse matrix of type '<class 'numpy.int64'>'
	with 13706 stored elements in Compressed Sparse Row format>

In [29]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X2=DataFrame123['OWNER NAME']


In [30]:
cv=CountVectorizer()
X2=cv.fit_transform(X2)
X2

<12754x8553 sparse matrix of type '<class 'numpy.int64'>'
	with 37738 stored elements in Compressed Sparse Row format>

In [34]:
#X=df1[['FACILITY ZIP','FACILITY ADDRESS']]
X3=DataFrame123['FACILITY NAME']

In [35]:
cv=CountVectorizer()
X3=cv.fit_transform(X3)
X3

<12754x7097 sparse matrix of type '<class 'numpy.int64'>'
	with 34500 stored elements in Compressed Sparse Row format>

In [928]:
from scipy.sparse import hstack
Comb=hstack((X2, X3))
comb=hstack((Comb, X))

In [929]:
Combined=hstack((comb, X1))

In [930]:
Combined

<12754x23504 sparse matrix of type '<class 'numpy.int64'>'
	with 129247 stored elements in COOrdinate format>

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X3, Y, test_size=0.27, random_state=40)

In [37]:
#Naive Bayes Classifier
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X_train,y_train)
clf.score(X_test,y_test)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
a=classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          1       0.70      0.76      0.73      1996
          2       0.52      0.53      0.52      1220
          3       0.25      0.03      0.05       228

avg / total       0.60      0.63      0.61      3444



In [None]:
#Naive Bayes Classifier
clf = MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
clf.fit(X,y)
clf.score(X,y)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
a=classification_report(y_test, y_pred)

In [38]:
#X_train_prob=pd.DataFrame(clf.predict_proba(X_train))
#X_test_prob=pd.DataFrame(clf.predict_proba(X_test))
#X1_train_prob=pd.DataFrame(clf.predict_proba(X_train))
#X1_test_prob=pd.DataFrame(clf.predict_proba(X_test))
#X2_train_prob=pd.DataFrame(clf.predict_proba(X_train))
#X2_test_prob=pd.DataFrame(clf.predict_proba(X_test))
X3_train_prob=pd.DataFrame(clf.predict_proba(X_train))
X3_test_prob=pd.DataFrame(clf.predict_proba(X_test))

In [46]:
X_test_prob.shape

(3444, 3)

In [49]:
y_test.shape

(3444,)

In [47]:
X1_test_prob.shape

(3444, 3)

In [42]:
X2_train_prob.shape

(9310, 3)

In [43]:
X3_train_prob.shape

(9310, 3)

In [52]:
a_test=pd.concat([X_train_prob,X1_train_prob,X2_train_prob,X3_train_prob],axis=1)
b_test=pd.concat([X_test_prob,X1_test_prob,X2_test_prob,X3_test_prob],axis=1)

In [None]:
a_train=pd.concat([a_test,y_train])

In [None]:
a_test=pd.concat([X_train_prob,X1_train_prob,X2_train_prob,X3_train_prob,y_train],axis=1)
b_test=pd.concat([X_test_prob,X1_test_prob,X2_test_prob,X3_test_prob,y_test],axis=1)

In [944]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
lreg_clf = LogisticRegression()

param_grid = {'penalty':['l1', 'l2']}

grid_search = GridSearchCV(lreg_clf , param_grid, cv = 5 , return_train_score=True)
grid_search.fit(X_train_prob, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2']}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [945]:
grid_search.best_score_

0.8774436090225564

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, f1_score
lreg_clf = LogisticRegression(penalty = 'l2')
lreg_clf.fit(X_train_prob, y_train)
results = pd.DataFrame(index=None, columns=['model'])
y_lreg_clf = lreg_clf.predict(X_test_prob)
train_Rsquare = lreg_clf.score(X_train_prob, y_train)
test_Rsquare = lreg_clf.score(X_test_prob, y_test)
train_MSE = mean_squared_error(y_train, lreg_clf.predict(X_train_prob))
test_MSE = mean_squared_error(y_test, lreg_clf.predict(X_test_prob))
f1_score_train=f1_score(y_train, lreg_clf.predict(X_train_prob))
f1_score_test=f1_score(y_test, lreg_clf.predict(X_test_prob))
results = results.append(pd.Series({'model':'KNN Classifier','train_Rsquare':train_Rsquare, 'test_Rsquare':test_Rsquare, 'train_MSE':train_MSE,'test_MSE':test_MSE,
                                    'f1_score_train':f1_score_train,'f1_score_test':f1_score_test}),ignore_index=True )
results

In [952]:
lreg_clf.score(X_train_prob, y_train)

0.8730397422126746

In [953]:
lreg_clf.score(X_test_prob, y_test)

0.6454703832752613

In [954]:
mean_squared_error(y_train, lreg_clf.predict(X_train_prob))

0.1920515574650913

In [955]:
mean_squared_error(y_test, lreg_clf.predict(X_test_prob))

0.45557491289198604

In [958]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
param_grid = {'max_depth': [5, 10, 20, 50, 100]}

grid_search = GridSearchCV(dt_clf, param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train_prob, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 10, 20, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [960]:
grid_search.best_params_

{'max_depth': 5}

In [961]:
grid_search.best_score_

0.8785177228786252

In [963]:
results = pd.DataFrame(index=None, columns=['model'])
dt_clf = DecisionTreeClassifier(max_depth = 10)
dt_clf.fit(X_train_prob,y_train)
y_dt_clf = dt_clf.predict(X_test_prob)
train_Rsquare = dt_clf.score(X_train_prob, y_train)
test_Rsquare = dt_clf.score(X_test_prob, y_test)
train_MSE = mean_squared_error(y_train, dt_clf.predict(X_train_prob))
test_MSE = mean_squared_error(y_test, dt_clf.predict(X_test_prob))
results = results.append(pd.Series({'model':'Decision Tree Classifier','train_Rsquare':train_Rsquare, 'test_Rsquare':test_Rsquare, 'train_MSE':train_MSE,'test_MSE':test_MSE}),ignore_index=True )
results

Unnamed: 0,model,test_MSE,test_Rsquare,train_MSE,train_Rsquare
0,Decision Tree Classifier,0.623403,0.60482,0.10956,0.921375


In [965]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

#Tuning ridge on new dataset
param_grid = {"max_depth": [3, 5],
              "min_samples_split": sp_randint(2, 30),
              "min_samples_leaf": sp_randint(1, 20),
              "bootstrap": [True, False]}
random_search = RandomizedSearchCV(RandomForestClassifier(n_estimators=1000), param_distributions=param_grid,
                                   n_iter=20, random_state=0,n_jobs=-1, return_train_score=True)
random_search.fit(X_train_prob, y_train)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=20, n_jobs=-1,
          param_distributions={'max_depth': [3, 5], 'min_samples_split': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002558F5787F0>, 'min_samples_leaf': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002558F578DA0>, 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=0, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [966]:
random_search.best_params_

{'bootstrap': True,
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 5}

In [967]:
random_search.best_score_

0.882062298603652

In [969]:
rf_clf = RandomForestClassifier(bootstrap=True,max_depth=5,min_samples_leaf=1,min_samples_split=5)
rf_clf.fit(X_train_prob,y_train)
y_rf_clf = rf_clf.predict(X_test_prob)
train_Rsquare = rf_clf.score(X_train_prob, y_train)
test_Rsquare = rf_clf.score(X_test_prob, y_test)
train_MSE = mean_squared_error(y_train, rf_clf.predict(X_train_prob))
test_MSE = mean_squared_error(y_test, rf_clf.predict(X_test_prob))
results = results.append(pd.Series({'model':'Random Forest Classifier','train_Rsquare':train_Rsquare, 'test_Rsquare':test_Rsquare, 'train_MSE':train_MSE,'test_MSE':test_MSE}),ignore_index=True )
results

Unnamed: 0,model,test_MSE,test_Rsquare,train_MSE,train_Rsquare
0,Decision Tree Classifier,0.623403,0.60482,0.10956,0.921375
1,Random Forest Classifier,0.611789,0.608595,0.16391,0.887648


In [970]:
from  sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, random_state=0)
gb.fit(X_train_prob, y_train)
print(gb.score(X_train_prob, y_train))
print(gb.score(X_test_prob, y_test))

0.9553168635875403
0.6146922183507549
