# Important Libraries to Import

In [14]:
import numpy as np 
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
import pickle

from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE, SMOTENC, RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Machine Learning Model Class 

In [21]:
class Model:
    def __init__(self, datafile = "airline_sentiment_analysis.csv"):
        self.data = pd.read_csv(datafile)
        self.porter = PorterStemmer()
        self.tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
        self.kernel = 'rbf'
        self.degree=3
        self.pred_model=None
        self.stopword=False
        self.undersampling =0
# This function use to spliting train and test file as well as used to do oversampling by giving appropriate parameter
    def split(self, test_size, Oversampling=0, oversample_type='ros'):
        y = self.data['airline_sentiment']
        X = self.data['text']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)
        if Oversampling==1:
            if oversample_type == 'ros':
                ros = RandomOverSampler(sampling_strategy=1)
                X_over, y_over = ros.fit_resample(self.X_train.values.reshape(-1, 1), self.y_train)
                self.X_train=X_over.reshape( -1)
                self.y_train = y_over
        if self.undersampling ==1:
            rus = RandomUnderSampler(sampling_strategy=1)
            X_over, y_over = rus.fit_resample(X_train.values.reshape(-1, 1), y_train)
            self.X_train=X_over.reshape( -1)
            self.y_train = y_over

# Function for default tokenizer i.e spliting by spaces 
    def tokenizer(self, text):
        return text.split()
# Fuction for Tokenizer using potter stemmer
    def tokenizer_porter(self, text):
        return [self.porter.stem(word) for word in text.split()]
# Function including basic text preprocessing
    def preprocessor(self, text):
        # Remove HTML markup
        text = re.sub('<[^>]*>', '', text)
        # Remove Website  markup
        text = re.sub(r'https?:\/\/\S+', '' , text)
        text = re.sub(r'\w*\@*\w*\.(com)\w*', '', text)
        text = re.sub(r'^(emailmailto:)\w*\.*\w+\@*\w+\.com',' ',text)
        # remove @ mentions
        text = re.sub(r"@([A-Za-z]+)", "", text)
        # remove numbers
        text = re.sub("[0-9]+", "", text)
        #  stopwords
        if self.stopword==True:
            text = " ".join([word for word in str(text).split() if word not in self.STOPWORDS])
        # Save emoticons for later appending
        # emoticons helps to find the sentiment
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
        # Remove any non-word character and append the emoticons,
        # removing the noise character for standarization. Convert to lower case
        text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
        return text
    # I have created a pipeline were frist it will call functions for preprocessing and TfidfVectorizer
    # Then it will call the ML Algorithms.
    def logisticregression(self):
        param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [self.tokenizer, self.tokenizer_porter],
               'vect__preprocessor': [None, self.preprocessor],
               'clf__penalty': ['l1','l2'],
               'clf__C': [1.0]},
              ]

        lr_pipe = Pipeline([('vect', self.tfidf),
                     ('clf', LogisticRegression(random_state=0))])

        lr_clf = GridSearchCV(lr_pipe, param_grid,scoring='accuracy',cv=5,verbose=1,n_jobs=-1)
        return lr_clf
    
    def support_vector_machine(self, kernal, degree):
        param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [self.tokenizer, self.tokenizer_porter],
               'vect__preprocessor': [None, self.preprocessor]},
              ]

        cvm_pipe = Pipeline([('vect', self.tfidf),
                     ('clf',  svm.SVC(C=9.0,kernel=self.kernel, degree=self.degree,random_state=42))])

        svm_clf = GridSearchCV(cvm_pipe, param_grid,scoring='accuracy',cv=5)
        return svm_clf
    # Here I have change the TFIDF model I have used TfidfTransformer 
    # here I am getting some error while using TfidfVectorizer So I have changed pipeliine little bit 
    # add an vectorizer before TFIDF
    # Pipeline was vectorizer => TfidfTransformer => classifier to make things easier
    def Multinomial_Naive_Bayes(self):
        text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
        tuned_parameters = {
                'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
                'tfidf__use_idf': (True, False),
                'tfidf__norm': ('l1', 'l2'),
                'clf__alpha': [1, 1e-1, 1e-2]
            }
        clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
        return clf
    # this function is basically used for fitting All above mentioned Machine learning Model
    # Used for Calling function and saving best estimator.
    def fit(self,model):
        if model == 'svm':
            clf = self.support_vector_machine('linear',3)
            self.model = clf.fit(self.X_train, self.y_train)
        if model == 'lr':
            clf = self.logisticregression()
            self.model = clf.fit(self.X_train, self.y_train)
        if model == 'MNB':
            clf = self.Multinomial_Naive_Bayes()
            self.model =clf.fit(self.X_train, self.y_train)
        self.pred_model = self.model.best_estimator_
        return self.model.best_estimator_
    # Here I am Using Precision, Recall, F1 Score and Also return Accuracy but Used F1 Score as METRIC
    def accuracy(self,clf):
        print('Accuracy in test: %.3f' % clf.score(self.X_test, self.y_test))
        y_pred = clf.predict(self.X_test)
        tn, fp, fn, tp=confusion_matrix(self.y_test, y_pred).ravel()
        print('true Positive '% tp)
        print('true Negative '% tp)
        print('False Positive '% tp)
        print('False Negative '% tp)
        print('Macro Precision Recall and F1 Score in test:' )
        print(precision_recall_fscore_support(self.y_test, y_pred, average='macro'))
        
        
    # Inference function used to find sentiment anaslysis for given Input String
    def predict(self, text):
        text = self.preprocessor(text)
        return self.pred_model.predict([text])
    # Function used to Load the model
    def load(self, filename='test_data1.pkl'):
        with open(filename, 'rb') as f:
            self.pred_model = pickle.load(f)
    # Function used to Load the model
    def save(self, filename='test_data1.pkl'):
        with open(filename, 'wb') as f:
            pickle.dump(self.pred_model, f)


In [23]:
model = Model()
model.split(0.3,1)
clf = model.fit('MNB')
model.accuracy(clf)
print(model.predict("bad airway"))
# model.save()



Accuracy in test: 0.900
true positive 
Macro Precision Recall and F1 Score in test:
(0.8779789239461648, 0.7971823008051886, 0.8285923562290807, None)
['negative']


Multinomial_Naive_Bayes:
ROS : (0.8779789239461648, 0.7971823008051886, 0.8285923562290807, None) Accuracy in test: 0.900
Simple: (0.9049782953020512, 0.7948925168981034, 0.8345640249974828, None) Accuracy in test: 0.906\

SVM:\
ROS: (0.9137339444555155, 0.8508839047294203, 0.8774770551417379, None) Accuracy in test: 0.926\
Simple: (0.9098036549451257, 0.8519102359639985, 0.8767087962049738, None) Accuracy in test: 0.925

Logistic REGRESSION:\
ROS: (0.8459922550527823, 0.8674373369470025, 0.8559969305296486, None)\
Simple: (0.9149278679494364, 0.813249198754882, 0.851406139465841, None)

# Inference 
Steps used to Load and Predict on any string\
model1 = Model()\
model1.load()\
model1.predict("bar airline")