# Libraries

In [2]:

# !pip install contractions
import sys  
!{sys.executable} -m pip install contractions
 
import re
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import PorterStemmer, WordNetLemmatizer
import contractions 
from collections import defaultdict	
import numpy as np
import pandas as pd
import math
from nltk.util import ngrams
from nltk.chunk import conlltags2tree, tree2conlltags
nltk.download('maxent_ne_chunker')
nltk.download('words')
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# from google.colab import drive
# drive.mount('/content/gdrive')





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\aadar\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

True

# Class Definition and Obj initialization

In [3]:
from sys import base_prefix


class Text_preprocessor:
    """
       A class to preprocess text for NLP Application.

       ...

       Attributes
       ----------
       spec_filtered : list
         list with special character removed
       rm_stopwrds : list
         list with stopwords removed
       clean : list
         list after all text preprocessing
       index_word : dict
         dict of indexed words
       bag_of_words : df
         list of words after preprocessing document


       Methods
       -------
       expand_contraction(text=""):
           returns the expanded text.

       remove_special_characters(text=""):
           returns text with removed emailaddress, special characters and numbers

       tokenize(text=")
           returns list of words from text

       removal_stop_words(token=[],language='english')
           stop words are derived from  nltk.corpus
           returns the list with removed stopwords for english language

       stem_or_lem(token=[],method="stemm")
           return the list after lemmitization or stemmization depending upon 
           method argument

       preprocessed_text(text="")
           returns list of words after performing
           contaraction, removal of special characters, tokenization, removal of
           stop word, stemmization and lemmitization

       bow(document)
           returns bag of words for a document(sentence)


       get_word_dict
           returns key value pair for words in document

       computeTF
           return dict of computed TF for documents

       computeTFIDF
           returns dict of IDFS for words in corpus

       computeTFIDF
           returns dict of computed TFIDF for words in documents

       dict_to_df
           returns pandas dataframe representing TFIDS of  list of documents of document
       """

    def __init__(self):
        self.spec_filtered = []
        self.rm_stopwrds = []
        self.clean = []
        self.index_word = {}
        self.bag_of_words = []
        self.corpus_words = []
        self.clean_document_list = []

    def expand_contraction(self, text: str) -> str:
        '''
        Expands the words in text with contractions module.

            Parameters
            ----------
            text : str,
                text to be expanded

            Returns
            -------
            text with expanded words 
        '''
        # create an empty list
        expanded_words = []
        for word in text.split():
            # using contractions.fix to expand the shotened words and removes extra spaces
            expanded_words.append(contractions.fix(word))
        expanded_text = ' '.join(expanded_words)
        return expanded_text.lower()

    def remove_special_characters(self, text: str) -> str:
        '''
      Removes the email, special character and numbers from text.

          Special character includes ! @ # $ & * () + - [].

          Parameters
          ----------
          text : str
              String containing special character

          Returns
          -------
          String without email, special character and numbers
      '''
        # remove email if any
        txt_email = re.compile(r'[A-Za-z0-9]*@[A-Za-z]*\.com')
        cln_txt = txt_email.sub('email', text)
        # remove special character and number if any
        self.spec_filtered = re.sub('[^A-Za-z]+', ' ', cln_txt)
        return self.spec_filtered

    def tokenize(self, text: str) -> list:
        '''
          Tokenize the text  to form list.
             Use nltk.word_tokenize.

             Parameters
             ----------
             text : str, 
             text to  tokenize

             Returns
             -------
             list of tokenized words
         '''

        nltk_tokens = nltk.word_tokenize(text)
        return nltk_tokens

    def removal_stop_words(self, tokens: list, language: str = 'english') -> list:
        '''
         Removes the stop words from list.

             Use stopwords from nltk.corpus.

             Parameters
             ----------
             token : list
                 words token
             language : str, optional 
             Language of the words (default is english) 

             Returns
             -------
             list of words without stop words
         '''
        stopword_list = nltk.corpus.stopwords.words(language)
        self.rm_stopwrds = [
            word for word in tokens if not word in stopword_list]
        return self.rm_stopwrds

    def stem_or_lem(self, tokens: list, method: str) -> list:
        '''
         Perform Stemming or lemmatization.
         If the argument method is 'stemm' then performs stemmization, performs
         lemmitization if 'lemm' and return tokens for mismatched strings
         PorterStemmer  from nltk for stemming
         WordNetLemmatizer from nltk for lemmatization

             Parameters
             ----------
             tokens : list
                list of tokenized words

             Returns
             -------
             return words after Stemming or Lemmatization
         '''
        # instance of PorterStemmer
        ps = PorterStemmer()
        stemmed = []
        lemmed = []
        if method == 'stemm':
            for w in tokens:
                rootWord = ps.stem(w)
                stemmed.append(rootWord)
            return stemmed
        elif method == 'lemm':
            wordnet_lemmatizer = WordNetLemmatizer()
            for w in tokens:
                lemm = wordnet_lemmatizer.lemmatize(w)
                lemmed.append(lemm)
            return lemmed
        else:
            return tokens

    def preprocessed_text(self, text: str) -> list:
        '''
        Perfoms all the operation of text preprocessing.


            Parameters
            ----------
            text : str, 
                string to be preprocessed
            Returns
            -------
            returns list of words after performing
            contaraction, removal of special characters, tokenization, removal of
            stop word, stemmization and lemmitization

        '''
        exp_text = self.expand_contraction(text)
        prune_special = self.remove_special_characters(exp_text)
        tokenize_words = self.tokenize(prune_special)
        remove_stopwords = self.removal_stop_words(tokenize_words, 'english')
        # stemmed =self.stem_or_lem(remove_stopwords,'stemm')
        self.clean = self.stem_or_lem(remove_stopwords, 'lemm')
        return self.clean

    def get_word_dict(self, dirty_text, document):
        '''
         Returns the dict of words after achieved after cleaning

           Parameter
           ---------
           dirty_text : list 
             list of documents before preprocessing

           document : list 
              list of string [raw_document]

           Returns
           -------
             (key ,value) 
             Key : word
             value : frequency of words in document
        '''

       #  all_words = self.get_corpus_words(text)
        flatten_list = [item for sublist in self.pre_prep(
            dirty_text) for item in sublist]
        all_words = list(set(flatten_list))
        clean_document = self.preprocessed_text(document)

       #  words dictionary with value 0
        wordDict = dict.fromkeys(all_words, 0)
        for word in clean_document:
            # count occurence of each words
            if word in wordDict:
                wordDict[word] += 1
        return wordDict

    def pre_prep(self, dirty_text: list) -> list:
        '''
          Returns the list  of documents of documents

            Parameter
            ---------
            dirty_text : list 
              list of documents before preprocessing


            Returns
            -------
              list of clean documents of clean document
         '''
        for document in dirty_text:
            self.clean_document_list.append(self.preprocessed_text((document)))
        return self.clean_document_list

    def bow(self, dirty_text: list) -> list:
        '''
         Returns the bag of words in pandas dataframe format

           Parameter
           ---------
           dirty_text : list 
             list of documents of raw_documents


           Returns
           -------
             bag of words in pandas Df
        '''

        flatten_list = [item for sublist in self.pre_prep(
            dirty_text) for item in sublist]
        unique_words = list(set(flatten_list))
        #  indexing the words from corpus
        # first parameter  into keys and second is value(index)
        indexed_words = dict(zip(unique_words, range(len(unique_words))))
        bow_qrr = []
        for document in self.clean_document_list:
            #  create numpy array of of length of corpus
            empty_arr = np.zeros(len(unique_words))
            #  count the occurence of the each words
            for word in document:
                empty_arr[indexed_words[word]] += 1
            bow_qrr.append(empty_arr)
            #  convert array to dataframe
        df = pd.DataFrame(bow_qrr, columns=unique_words)
        return df

    def computeTF(self, dirty_text: list, document) -> dict:
        '''
        Computes TF for each word in  document

        Parameter
        ---------
          dirty_text : list documents of document

            document: list
             List of strings(raw_document)

        Returns
        --------
          dict
          key : word
          value : term frequency of the word

        '''
        tfDict = {}
        len_sntn = len(self.preprocessed_text(document))
        word_dict = self.get_word_dict(dirty_text, document)
        for word, count in word_dict.items():
            # term frequency for words in a sentence
            tfDict[word] = count/float(len_sntn)
        return tfDict

    def computeIDF(self, dirty_text: list) -> dict:
        '''
          Computes IDFs for all word in doclist

          Parameter
          ---------
            dirty_text : list  
              list of pre-processed documents

          Returns
          --------
            dict
            key : word
            value : IDFS for each word

          '''
        doc_list = []
        for doc in dirty_text:
            word_dict = self.get_word_dict(dirty_text, doc)
            doc_list.append(word_dict)

        idfDict = {}
        N = len(doc_list)
        # dict with all words with value 0(template)
        idfDict = dict.fromkeys(doc_list[0].keys(), 0)
        for doc in doc_list:
            for word, val in doc.items():
                if val > 0:
                    # increase the value if the word exist in doc
                    idfDict[word] += 1
        for word, val in idfDict.items():
            idfDict[word] = math.log10(N / float(val))
        return idfDict

    def computeTFIDF(self, tf, idfs) -> dict:
        '''
        Computes TFIDFs for all word in documents

          Parameter
          ---------
            tfBow : dict  
              Key: word
              value: tf of word in document
            idfs: dict
              key : Word
              value: idf of word in document list 


          Returns
          --------
            dict
            key : word
            value : TfIDFS for each word in document

        '''
        tfidf = {}
        for word, val in tf.items():
            tfidf[word] = val*idfs[word]
        return tfidf

    def dict_to_df(self, text) -> list:
        '''
        Performs all above operations to 
        Generates pandas dafarame

           Parameter
           ---------
             text : list of documents (corpus)  


           Returns
           --------
             Pandas dataframe
         '''
        arr = []
        idfs = self.computeIDF(text)
        for doc in text:
            tf = self.computeTF(text, doc)
            tfidf = self.computeTFIDF(tf, idfs)
            arr.append(tfidf)
        df = pd.DataFrame(arr)
        return df

    def pos_identification(self, text) -> list:
        '''
        identifies Part of speech in given text 

           Parameter
           ---------
             text : string   


           Returns
           --------
             list of tuple with identification

        '''
        return nltk.pos_tag(nltk.word_tokenize(self.remove_special_characters(text)))

    def name_entity_identification(self, text: str, lib='spacy') -> list:
        '''
        identifies name or entity of text in given text 

           Parameter
           ---------
             text : string   


           Returns
           --------
             list of tuple with identification

        '''
        #  use nltk for identification
        if lib == 'nltk':
            print("Using Nltk lib")
            for chunk in nltk.ne_chunk(self.pos_identification(text)):
                if hasattr(chunk, 'label'):
                    idntfy = ([(chunk.label(), chunk) for c in chunk])

            # use spacy for identification
        doc = nlp(obj.remove_special_characters(text))
        idntfy = ([(X.text, X.label_) for X in doc.ents])
        return ([(X.text, X.label_) for X in doc.ents])

    def ngram_tokenization(self, text, n=2) -> list:
        '''
        identifies name or entity of text in given text 

           Parameter
           ---------
             text : string   


           Returns
           --------
             list of tuple with identification

        '''
        n_grams = ngrams(nltk.word_tokenize(
            obj.remove_special_characters(text)), n)
        return [' '.join(grams) for grams in n_grams]


text2 = ['This is a good movie.',
         'It is a good movie, but you know good is relative.',
         'Movie is fun to watch.',
         'I had a good relaxing time.',
         'The whole cinema experience was good.',
         'This is a good cinema.']

text3 = '    Jack and  jill have made a delicious,dish.Then they started to play some12 game! and jill has attahacd# [a] photo frame to the straight9 wall and swung on sea-saw. She was very happy. After the game, they both went to central London to enjoy some fast food.'


obj = Text_preprocessor()


# print("BOW:")
# print(obj.bow(text2))

# print("TF for a sentence :")
# print(obj.computeTF(text2, text2[0]))

# print("IDFS:")
# print(obj.computeIDF(text2))

# print("TFIDFS:")
# print(obj.dict_to_df(text2))

# print("Cleaning:")
# print(obj.remove_special_characters(text3))

# print("POS Identification:")
# print(obj.pos_identification(text3))

# print("Name Entity identification:")
# print(obj.name_entity_identification(text3))

print("N Gram tokenization:")
obj.ngram_tokenization(text3, 2)








N Gram tokenization:


['Jack and',
 'and jill',
 'jill have',
 'have made',
 'made a',
 'a delicious',
 'delicious dish',
 'dish Then',
 'Then they',
 'they started',
 'started to',
 'to play',
 'play some',
 'some game',
 'game and',
 'and jill',
 'jill has',
 'has attahacd',
 'attahacd a',
 'a photo',
 'photo frame',
 'frame to',
 'to the',
 'the straight',
 'straight wall',
 'wall and',
 'and swung',
 'swung on',
 'on sea',
 'sea saw',
 'saw She',
 'She was',
 'was very',
 'very happy',
 'happy After',
 'After the',
 'the game',
 'game they',
 'they both',
 'both went',
 'went to',
 'to central',
 'central London',
 'London to',
 'to enjoy',
 'enjoy some',
 'some fast',
 'fast food']

# Classification

In [4]:
# Read the data
a=[]
b=[]
with open('./SMSSpamCollection.txt','r') as f:
    l=f.readlines()
    for j in l:
        # append label in a (that is either spam or ham )
        a.append(j.split('\t')[0])
        # actual email in b
        b.append(j.split('\t')[1])

In [5]:
# convert to pandas  dataframe
d={'label':a,'document':b}
df=pd.DataFrame(d)
df.head()

Unnamed: 0,label,document
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...\n
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
from sklearn.model_selection import train_test_split
'''Controls the shuffling applied to the data before applying the split. 
Pass an int for reproducible output across multiple function calls'''
training_data, testing_data = train_test_split(df, test_size=0.2, random_state=25)


In [8]:
training_data.shape

(4459, 2)

In [9]:
training_data_clean = (
    # remove_special_character removes number email and special character 
    training_data.assign(clean_document = lambda x:[obj.remove_special_characters(text) for text in x.document])
    # Tokenize word
    .assign(word_token = lambda x:[obj.tokenize(text) for text in x.clean_document])
    # remove stop words
     .assign(stops= lambda x:[obj.removal_stop_words(text) for text in x.word_token])
    # lemmitization
     .assign(stemorlem= lambda x:[obj.stem_or_lem(text,'stemm') for text in x.stops])
     # # input to sklearn
    .assign(document_to_sklearn= lambda x: [" ".join(map(str,list_of_words)) for list_of_words in x.stemorlem ])
    )

In [10]:
training_data_clean

Unnamed: 0,label,document,clean_document,word_token,stops,stemorlem,document_to_sklearn
3784,ham,Can you do online transaction?\n,Can you do online transaction,"[Can, you, do, online, transaction]","[Can, online, transaction]","[can, onlin, transact]",can onlin transact
2009,ham,See the forwarding message for proof\n,See the forwarding message for proof,"[See, the, forwarding, message, for, proof]","[See, forwarding, message, proof]","[see, forward, messag, proof]",see forward messag proof
3887,ham,"Same, I'm at my great aunts anniversary party ...",Same I m at my great aunts anniversary party i...,"[Same, I, m, at, my, great, aunts, anniversary...","[Same, I, great, aunts, anniversary, party, ta...","[same, i, great, aunt, anniversari, parti, tar...",same i great aunt anniversari parti tarpon spring
2326,ham,Apps class varaya elaya.\n,Apps class varaya elaya,"[Apps, class, varaya, elaya]","[Apps, class, varaya, elaya]","[app, class, varaya, elaya]",app class varaya elaya
305,spam,SMS. ac Blind Date 4U!: Rodds1 is 21/m from Ab...,SMS ac Blind Date U Rodds is m from Aberdeen U...,"[SMS, ac, Blind, Date, U, Rodds, is, m, from, ...","[SMS, ac, Blind, Date, U, Rodds, Aberdeen, Uni...","[sm, ac, blind, date, u, rodd, aberdeen, unit,...",sm ac blind date u rodd aberdeen unit kingdom ...
...,...,...,...,...,...,...,...
255,ham,"I'm back, lemme know when you're ready\n",I m back lemme know when you re ready,"[I, m, back, lem, me, know, when, you, re, ready]","[I, back, lem, know, ready]","[i, back, lem, know, readi]",i back lem know readi
2934,ham,Yo do you know anyone &lt;#&gt; or otherwise...,Yo do you know anyone lt gt or otherwise able ...,"[Yo, do, you, know, anyone, lt, gt, or, otherw...","[Yo, know, anyone, lt, gt, otherwise, able, bu...","[yo, know, anyon, lt, gt, otherwis, abl, buy, ...",yo know anyon lt gt otherwis abl buy liquor ou...
2191,ham,"Ooh, 4got, i'm gonna start belly dancing in mo...",Ooh got i m gonna start belly dancing in mosel...,"[Ooh, got, i, m, gon, na, start, belly, dancin...","[Ooh, got, gon, na, start, belly, dancing, mos...","[ooh, got, gon, na, start, belli, danc, mosele...",ooh got gon na start belli danc moseley wed u ...
318,ham,"Not really dude, have no friends i'm afraid :(\n",Not really dude have no friends i m afraid,"[Not, really, dude, have, no, friends, i, m, a...","[Not, really, dude, friends, afraid]","[not, realli, dude, friend, afraid]",not realli dude friend afraid


In [11]:
# Feature Extraction
# import sklearn
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features = 50)
X = vectorizer.fit_transform([i for i in training_data_clean['document_to_sklearn']])
df_bow_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
df_bow_sklearn['label'] = training_data_clean['label']
df_bow_sklearn



Unnamed: 0,back,but,call,come,day,dont,free,get,go,good,...,today,txt,ur,want,we,week,what,work,you,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4454,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
4455,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,ham
4456,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,
4457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [15]:
# !pip install pycaret[full]

Collecting pycaret[full]
  Using cached pycaret-2.3.6-py3-none-any.whl (301 kB)
Collecting wordcloud
  Using cached wordcloud-1.8.1-cp37-cp37m-win_amd64.whl (154 kB)
Collecting mlflow
  Using cached mlflow-1.23.1-py3-none-any.whl (15.6 MB)
Collecting pyyaml<6.0.0
  Using cached PyYAML-5.4.1-cp37-cp37m-win_amd64.whl (210 kB)
Collecting pyod
  Using cached pyod-0.9.7-py3-none-any.whl


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'e:\\anaconda\\envs\\pycaret_env\\lib\\site-packages\\~umpy\\.libs\\libopenblas.WCDJNK7YVMPZQ2ME2ZZHJJRJ3JIKNDB7.gfortran-win_amd64.dll'
Consider using the `--user` option or check the permissions.



Collecting kmodes>=0.10.1
  Using cached kmodes-0.11.1-py2.py3-none-any.whl (19 kB)
Collecting textblob
  Using cached textblob-0.17.1-py2.py3-none-any.whl (636 kB)
Collecting Boruta
  Using cached Boruta-0.3-py3-none-any.whl (56 kB)
Collecting mlxtend>=0.17.0
  Using cached mlxtend-0.19.0-py2.py3-none-any.whl (1.3 MB)
Collecting pyLDAvis
  Using cached pyLDAvis-3.3.1.tar.gz (1.7 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
    Preparing wheel metadata: started
    Preparing wheel metadata: finished with status 'done'
Collecting plotly>=4.4.1
  Using cached plotly-5.6.0-py2.py3-none-any.whl (27.7 MB)
Collecting spacy<2.4.0
  Using cached spacy-2.3.7-cp37-cp37m-win_amd64.whl (9.6 MB)
Collecting cufflinks>=0.17.

Collecting numexpr
  Using cached numexpr-2.8.1-cp37-cp37m-win_amd64.whl (88 kB)
Collecting pluggy<2.0,>=0.12
  Downloading pluggy-1.0.0-py2.py3-none-any.whl (13 kB)
Collecting atomicwrites>=1.0
  Downloading atomicwrites-1.4.0-py2.py3-none-any.whl (6.8 kB)
Collecting tomli>=1.0.0
  Using cached tomli-2.0.1-py3-none-any.whl (12 kB)
Collecting iniconfig
  Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting py>=1.8.2
  Downloading py-1.11.0-py2.py3-none-any.whl (98 kB)
Collecting qtpy
  Downloading QtPy-2.0.1-py3-none-any.whl (65 kB)
Collecting pynndescent>=0.5
  Using cached pynndescent-0.5.6-py3-none-any.whl
Collecting h11>=0.8
  Downloading h11-0.13.0-py3-none-any.whl (58 kB)
Collecting asgiref>=3.4.0
  Downloading asgiref-3.5.0-py3-none-any.whl (22 kB)
Building wheels for collected packages: dash-core-components, dash-html-components, dash-table, lime, pyperclip, emoji, dtreeviz, dash-auth, retrying, ffmpy, pyLDAvis, python-multipart
  Building wheel for dash-core-co

In [38]:



# !pip freeze
# !pip install pycaret==2.3.5
!pip install pycaret
!pip uninstall scikit-learn -y
!pip install scikit-learn==0.23.2
# !pip install numpy --upgrade



Collecting numpy==1.19.5
  Using cached numpy-1.19.5-cp37-cp37m-win_amd64.whl (13.2 MB)


Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.21.5
    Uninstalling numpy-1.21.5:
      Successfully uninstalled numpy-1.21.5
Successfully installed numpy-1.19.5


Found existing installation: scikit-learn 0.23.2
Uninstalling scikit-learn-0.23.2:
  Successfully uninstalled scikit-learn-0.23.2
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp37-cp37m-win_amd64.whl (6.8 MB)
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.23.2


In [39]:
# Modeling
from pycaret.classification import *
s = setup(data = df_bow_sklearn, target='label',
          numeric_features=vectorizer.get_feature_names(),
          session_id=123,verbose=False,silent=True)

ImportError: cannot import name '_raise_dep_warning_if_not_pytest' from 'sklearn.utils.deprecation' (E:\Anaconda\envs\pycaret_env\lib\site-packages\sklearn\utils\deprecation.py)

In [12]:
m = create_model('nb')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.372,0.4058,0.4545,0.0974,0.1604,-0.0728,-0.1295
1,0.464,0.5532,0.697,0.1565,0.2556,0.0509,0.0863
2,0.352,0.382,0.5152,0.1043,0.1735,-0.059,-0.112
3,0.376,0.4506,0.5455,0.1132,0.1875,-0.0398,-0.0734
4,0.384,0.4899,0.6061,0.1242,0.2062,-0.0165,-0.0309
5,0.408,0.5699,0.7576,0.1515,0.2525,0.0417,0.0803
6,0.412,0.5284,0.7273,0.1481,0.2462,0.0344,0.0647
7,0.372,0.4703,0.5758,0.1173,0.1949,-0.0313,-0.059
8,0.452,0.5846,0.7273,0.1579,0.2595,0.0543,0.0953
9,0.436,0.5698,0.7576,0.1582,0.2618,0.0555,0.1015


In [None]:
# from sklearn.model_selection import RepeatedStratifiedKFold
# cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=1)
# model = create_model('knn', fold=cv)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8700,0.5441,0.0303,0.6667,0.0580,0.0470,0.1227
1,0.8560,0.4892,0.0455,0.2500,0.0769,0.0378,0.0547
2,0.8640,0.4206,0.0303,0.3333,0.0556,0.0343,0.0656
3,0.8560,0.4852,0.0000,0.0000,0.0000,-0.0225,-0.0430
4,0.8600,0.5198,0.0303,0.2500,0.0541,0.0263,0.0445
...,...,...,...,...,...,...,...
497,0.8460,0.5349,0.0152,0.0769,0.0253,-0.0189,-0.0266
498,0.8540,0.4588,0.0000,0.0000,0.0000,-0.0260,-0.0465
499,0.8640,0.4731,0.0455,0.3750,0.0811,0.0541,0.0915
Mean,0.8578,0.4901,0.0179,0.1748,0.0316,0.0057,0.0120


In [13]:
com  = compare_models(sort='F1') 


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.4028,0.5004,0.6364,0.1329,0.2198,0.0017,0.0023,0.023
dt,Decision Tree Classifier,0.828,0.5016,0.0545,0.1225,0.0753,-0.001,-0.0028,0.034
et,Extra Trees Classifier,0.8412,0.4786,0.0303,0.1116,0.0472,-0.0077,-0.0104,0.703
rf,Random Forest Classifier,0.852,0.4821,0.0273,0.1571,0.0458,0.0073,0.011,0.727
knn,K Neighbors Classifier,0.8592,0.4898,0.0182,0.1936,0.0318,0.0083,0.0176,0.143
lightgbm,Light Gradient Boosting Machine,0.8684,0.4847,0.0121,0.35,0.0234,0.0181,0.0536,0.126
ada,Ada Boost Classifier,0.8652,0.4956,0.0061,0.1,0.0114,0.0031,0.0051,0.181
lda,Linear Discriminant Analysis,0.8664,0.4939,0.003,0.05,0.0057,0.0011,-0.0001,0.041
lr,Logistic Regression,0.8672,0.4952,0.0,0.0,0.0,-0.0016,-0.0049,0.039
svm,SVM - Linear Kernel,0.8672,0.0,0.0,0.0,0.0,-0.0015,-0.0035,0.039
