In [1]:
import numpy as np 
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix

In [7]:
class Model:
    """Binary tweet-sentiment classifier built on a TF-IDF + grid-search pipeline.

    Typical use::

        model = Model()
        model.split(0.3)
        best = model.fit('svm')      # or 'lr'
        model.accuracy(best)
        model.predict("some tweet")

    Attributes set by ``__init__``:
        data:       DataFrame read from ``datafile``; ``split`` reads its
                    ``'text'`` and ``'airline_sentiment'`` columns.
        porter:     Porter stemmer used by ``tokenizer_porter``.
        tfidf:      Vectorizer template placed in every pipeline.
        kernel:     Default SVM kernel used by ``fit('svm')``.
        degree:     Default SVM polynomial degree used by ``fit('svm')``.
        pred_model: Best fitted pipeline, or ``None`` before ``fit``.
    """

    def __init__(self, datafile = "airline_sentiment_analysis.csv"):
        """Load the labelled tweets from ``datafile`` and set up defaults."""
        self.data = pd.read_csv(datafile)
        self.porter = PorterStemmer()
        # Lower-casing and cleaning are disabled here so that the grid search
        # decides whether ``self.preprocessor`` is applied.
        self.tfidf = TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)
        self.kernel = 'rbf'
        self.degree=3
        self.pred_model=None

    def split(self, test_size):
        """Create a stratified train/test split and store it on the instance.

        Args:
            test_size: Passed straight to ``train_test_split``.
        """
        y = self.data['airline_sentiment']
        X = self.data['text']
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    def tokenizer(self, text):
        """Return the whitespace-separated tokens of ``text``."""
        return text.split()

    def tokenizer_porter(self, text):
        """Return the Porter stem of every whitespace-separated token."""
        return [self.porter.stem(word) for word in text.split()]

    def preprocessor(self, text):
        """Return a cleaned, lower-cased copy of ``text``.

        HTML tags, ``http(s)`` URLs and ``.com`` tokens are removed, every run
        of non-word characters collapses to one space, and any emoticons found
        in the text are appended at the end with the nose (``-``) removed.
        """
        # Remove HTML markup
        text = re.sub(r'<[^>]*>', '', text)

        text = re.sub(r'https?:\/\/\S+', '' , text)
        text = re.sub(r'\w*\@*\w*\.(com)\w*', '', text)
        text = re.sub(r'^(emailmailto:)\w*\.*\w+\@*\w+\.com',' ',text)
        # Save emoticons for later appending. They are captured before
        # lower-casing, so ':D' and ':P' keep their capital letter.
        emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
        # Remove any non-word character and append the emoticons,
        # removing the nose character for standardization. Convert to lower case
        text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
        return text

    def logisticregression(self):
        """Return an unfitted grid search over a TF-IDF + LogisticRegression pipeline.

        The grid tries both tokenizers with and without ``self.preprocessor``
        (4 candidates), scored by accuracy with 5-fold cross-validation.
        """
        param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [self.tokenizer, self.tokenizer_porter],
               'vect__preprocessor': [None, self.preprocessor],
               'clf__penalty': ['l2'],
               'clf__C': [1.0]},
              ]

        lr_pipe = Pipeline([('vect', self.tfidf),
                     ('clf', LogisticRegression(random_state=0))])

        lr_clf = GridSearchCV(lr_pipe, param_grid,scoring='accuracy',cv=5,verbose=1,n_jobs=-1)
        return lr_clf

    def support_vector_machine(self, kernal=None, degree=None):
        """Return an unfitted grid search over a TF-IDF + SVC pipeline.

        Args:
            kernal: SVC kernel name (parameter spelling kept for backward
                compatibility). ``None`` falls back to ``self.kernel``.
            degree: Polynomial degree for the SVC. ``None`` falls back to
                ``self.degree``.

        Previously both arguments were accepted but silently ignored in favour
        of the instance attributes; they are now honoured when given.
        """
        kernel = self.kernel if kernal is None else kernal
        degree = self.degree if degree is None else degree

        param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__tokenizer': [self.tokenizer, self.tokenizer_porter],
               'vect__preprocessor': [None, self.preprocessor]},
              ]

        cvm_pipe = Pipeline([('vect', self.tfidf),
                     ('clf',  svm.SVC(C=9.0,kernel=kernel, degree=degree,random_state=42))])

        svm_clf = GridSearchCV(cvm_pipe, param_grid,scoring='accuracy',cv=5)
        return svm_clf

    def fit(self,model):
        """Run the grid search for ``model`` on the training split.

        Args:
            model: ``'svm'`` or ``'lr'``.

        Returns:
            The best pipeline found; it is also stored in ``self.pred_model``.

        Raises:
            ValueError: If ``model`` is not one of the supported names.
        """
        if model == 'svm':
            # Pass the configured attributes explicitly. The old call passed
            # ('linear', 3), which was ignored, so the kernel actually trained
            # (self.kernel) is unchanged by this fix.
            search = self.support_vector_machine(self.kernel, self.degree)
        elif model == 'lr':
            search = self.logisticregression()
        else:
            raise ValueError("Unknown model %r; expected 'svm' or 'lr'" % (model,))
        self.model = search.fit(self.X_train, self.y_train)
        self.pred_model = self.model.best_estimator_
        return self.pred_model

    def accuracy(self,clf):
        """Print the accuracy of ``clf`` on the held-out test split."""
        print('Accuracy in test: %.3f' % clf.score(self.X_test, self.y_test))

    def predict(self, text):
        """Predict the sentiment label of one raw text.

        The raw text goes straight to the fitted pipeline. Its vectorizer
        already applies whichever preprocessor the grid search selected, so
        cleaning the text here as well would double-process it, or feed cleaned
        text to a model that was trained on uncleaned text.

        Raises:
            sklearn.exceptions.NotFittedError: If ``fit`` has not been called.
                It subclasses ``AttributeError``, the error raised before.
        """
        if self.pred_model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError("Model is not fitted yet; call fit() before predict().")
        return self.pred_model.predict([text])

In [5]:
# Driver: train the SVM variant on a 70/30 stratified split and print its test
# accuracy. `model` and `clf` stay bound as notebook globals; later cells use
# `model` for prediction and pickling.
if __name__ == '__main__':
    model = Model()
    model.split(0.3)
    clf = model.fit('svm')
    model.accuracy(clf)

In [6]:
x=model.predict("@bad #badfligth airline")

In [18]:
import pickle
# Persist the whole Model wrapper. The context manager closes the file even if
# pickling raises part-way through.
with open("classifier.pkl", "wb") as pickle_out:
    pickle.dump(model, pickle_out)

In [3]:
# Reload the pickled Model wrapper. The cell that writes classifier.pkl must
# have been run first, otherwise this raises FileNotFoundError.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# pickles that you created yourself.
with open("classifier.pkl", "rb") as pickle_in:
    model = pickle.load(pickle_in)

FileNotFoundError: [Errno 2] No such file or directory: 'classifier.pkl'

In [5]:
data = pd.read_csv("airline_sentiment_analysis.csv")

In [6]:
# Give the unnamed CSV column a meaningful name. Rebinding instead of mutating
# in place keeps the cell safe to re-run.
data = data.rename(columns={'Unnamed: 0': 'ID'})

In [7]:
data.head()

Unnamed: 0,ID,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [8]:
data.airline_sentiment.value_counts()

negative    9178
positive    2363
Name: airline_sentiment, dtype: int64

In [9]:
y = data['airline_sentiment']
X = data['text']

In [10]:
# Stratified 70/30 hold-out split. random_state is 0 here while Model.split
# uses 42, so the class-based run and this run are evaluated on different rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [11]:
len(X_train), len(X_test), len(y_train), len(y_test)

(8078, 3463, 8078, 3463)

In [12]:
import nltk

In [13]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with the global ``porter``."""
    stems = []
    for word in text.split():
        stems.append(porter.stem(word))
    return stems

In [14]:

def preprocessor(text):
    """Return a cleaned, lower-cased version of ``text``.

    HTML tags, ``http(s)`` URLs and ``.com`` tokens are removed, every run of
    non-word characters collapses to a single space, and any emoticons found
    in the text are appended at the end with the nose (``-``) removed.

    The function only transforms its argument. It used to lower-case the whole
    global ``data['text']`` column on every call, which was a costly side
    effect with no influence on the returned value. Stop words are not removed.
    """
    # Remove HTML markup
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'https?:\/\/\S+', '' , text)
    text = re.sub(r'\w*\@*\w*\.(com)\w*', '', text)
    text = re.sub(r'^(emailmailto:)\w*\.*\w+\@*\w+\.com',' ',text)
    # Save emoticons for later appending. They are captured before
    # lower-casing, so ':D' and ':P' keep their capital letter.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standardization. Convert to lower case
    text = (re.sub(r'[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))

    return text

# Logistic Regression

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer does no cleaning of its own; the grid below decides whether
# `preprocessor` is applied and which tokenizer is used.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# 2 tokenizers x 2 preprocessor settings = 4 candidates.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__preprocessor': [None, preprocessor],
        'clf__penalty': ['l2'],
        'clf__C': [1.0],
    },
]

gs_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [16]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [17]:
print('Best parameter set: ' + str(gs_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__preprocessor': <function preprocessor at 0x7fa7408cc8b0>, 'vect__tokenizer': <function tokenizer_porter at 0x7fa7408cc820>}
Best accuracy: 0.910


In [18]:
clf = gs_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % clf.score(X_test, y_test))

Accuracy in test: 0.912


In [20]:
clf.predict(["bad airline"])

array(['negative'], dtype=object)

In [21]:
import pickle
# Persist the best logistic-regression pipeline. The context manager closes
# the file even if pickling raises.
with open("lr_clf.pkl", "wb") as pickle_out:
    pickle.dump(clf, pickle_out)

# SVM

In [21]:
# Reuses the `tfidf` vectorizer defined for the logistic-regression search.
# Per the scikit-learn SVC docs, `degree` only affects the 'poly' kernel, so
# degree=5 has no effect with kernel='linear'.
cvm_pipe = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', svm.SVC(C=9.0, kernel='linear', degree=5, random_state=42)),
])

# 2 tokenizers x 2 preprocessor settings = 4 candidates.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__preprocessor': [None, preprocessor],
    },
]

svm_clf = GridSearchCV(
    estimator=cvm_pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

In [22]:
model = svm_clf.fit(X_train, y_train)

In [23]:
print('Best parameter set: ' + str(svm_clf.best_params_))
print('Best accuracy: %.3f' % svm_clf.best_score_)

Best parameter set: {'vect__ngram_range': (1, 1), 'vect__preprocessor': <function preprocessor at 0x7f87584c9550>, 'vect__tokenizer': <function tokenizer at 0x7f87584c9310>}
Best accuracy: 0.916


In [24]:
# NOTE: this rebinds `svm_clf` from the GridSearchCV object to its best
# pipeline. Re-running this cell (or the best_params_ cell above) afterwards
# fails, because a Pipeline has no best_estimator_ / best_params_ attribute.
svm_clf = svm_clf.best_estimator_
print('Accuracy in test: %.3f' % svm_clf.score(X_test, y_test))

Accuracy in test: 0.910


In [25]:
import pickle
# Persist the best SVM pipeline. The context manager closes the file even if
# pickling raises.
with open("svm_clf.pkl", "wb") as pickle_out:
    pickle.dump(svm_clf, pickle_out)

In [44]:
Y_pred = clf.predict(X_test)

In [19]:
# Count the positions where the predicted label equals the true label.
score = sum(1 for predicted, actual in zip(Y_pred, y_test) if predicted == actual)

In [20]:
accuracy= (score)/len(y_test)*100

In [21]:
accuracy

89.97978631244585

# Multinomial Naive Bayes

A pipeline (vectorizer => transformer => classifier) is used so that all three steps can be fitted, tuned and applied as a single estimator.

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# Search space: 3 n-gram ranges x 2 idf settings x 2 norms x 3 smoothing
# values = 36 candidates.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': [1, 1e-1, 1e-2],
}

# Raw counts -> TF-IDF weighting -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [56]:
from sklearn.metrics import classification_report
# 10-fold grid search over the naive-Bayes pipeline.
# NOTE: `clf` and `model` are reused names. `clf` previously held the best
# logistic-regression pipeline, so cells that rely on that object must run
# before this one.
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, scoring='accuracy')
model =clf.fit(X_train, y_train)

In [57]:
print('Best parameter set: ' + str(clf.best_params_))
print('Best accuracy: %.3f' % clf.best_score_)

Best parameter set: {'clf__alpha': 0.01, 'tfidf__norm': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}
Best accuracy: 0.907


In [60]:
# NOTE: this rebinds `clf` from the GridSearchCV object to its best pipeline,
# so this cell cannot be re-run without re-running the grid search first.
clf = clf.best_estimator_
print('Accuracy in test: %.3f' % clf.score(X_test, y_test))

Accuracy in test: 0.903


# Random Over-sampling

In [130]:
from imblearn.over_sampling import RandomOverSampler

In [131]:
# Balance the classes 1:1 by duplicating minority-class training rows.
# random_state is fixed so the resampled set, and every score derived from it,
# is reproducible across kernel restarts.
# NOTE: the grid search below cross-validates on this over-sampled set, so
# copies of the same tweet can land in both the training and validation folds.
# Treat its CV accuracy as optimistic and rely on the untouched test split.
ros = RandomOverSampler(sampling_strategy=1, random_state=42)
# The sampler expects a 2-D feature array, hence the reshape to one column.
X_over, y_over = ros.fit_resample(X_train.values.reshape(-1, 1), y_train)

In [132]:
len(X_over), len(y_over)

(12848, 12848)

In [133]:
y_over.value_counts()

negative    6424
positive    6424
Name: airline_sentiment, dtype: int64

In [134]:
new_X_train=X_over.reshape( -1)

In [136]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Same logistic-regression search as before, rebuilt so it can be fitted on the
# over-sampled training data.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# 2 tokenizers x 2 preprocessor settings = 4 candidates.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__preprocessor': [None, preprocessor],
        'clf__penalty': ['l2'],
        'clf__C': [1.0],
    },
]

ROS_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [138]:
ROS_lr_tfidf.fit(new_X_train, y_over)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [139]:
print('Best parameter set: ' + str(ROS_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % ROS_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__preprocessor': None, 'vect__tokenizer': <function tokenizer at 0x7f85b111e5e0>}
Best accuracy: 0.943


In [141]:
# NOTE: this rebinds `ROS_lr_tfidf` from the GridSearchCV object to its best
# pipeline, so this cell cannot be re-run without re-fitting the search first.
# The score below is computed on the original (not resampled) test split.
ROS_lr_tfidf = ROS_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % ROS_lr_tfidf.score(X_test, y_test))

Accuracy in test: 0.906


In [152]:
y_pred = ROS_lr_tfidf.predict(X_test)

In [153]:
y_test

2811     negative
7737     negative
7376     negative
2303     negative
11247    negative
           ...   
3736     positive
7594     negative
4967     negative
1280     positive
2057     positive
Name: airline_sentiment, Length: 3463, dtype: object

In [154]:
y_pred

array(['negative', 'negative', 'negative', ..., 'negative', 'positive',
       'positive'], dtype=object)

In [156]:
# confusion_matrix sorts labels, so for ('negative', 'positive') the flattened
# order is tn, fp, fn, tp with 'positive' as the positive class.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
# The format string previously had no placeholder, so the count was never shown.
print('true positive %d' % tp)
print('Macro Precision Recall and F1 Score in test:' )
print(precision_recall_fscore_support(y_test, y_pred, average='macro'))

true positive 
Macro Precision Recall and F1 Score in test:
(0.8555693741601624, 0.8566849296266592, 0.8561252933006334, None)


# Random Under-sampling

In [19]:
from imblearn.under_sampling import RandomUnderSampler

In [20]:
# Balance the classes 1:1 by dropping majority-class training rows.
# random_state is fixed so the retained subset, and every score derived from
# it, is reproducible across kernel restarts.
rus = RandomUnderSampler(sampling_strategy=1, random_state=42)
# The names X_over / y_over are kept because the following cells read them,
# but here they hold the UNDER-sampled data. The sampler expects a 2-D feature
# array, hence the reshape to one column.
X_over, y_over = rus.fit_resample(X_train.values.reshape(-1, 1), y_train)

In [21]:
len(X_over), len(y_over)

(3308, 3308)

In [22]:
y_over.value_counts()

negative    1654
positive    1654
Name: airline_sentiment, dtype: int64

In [23]:
new_X_train=X_over.reshape( -1)

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Same logistic-regression search, rebuilt for the under-sampled training data.
# The result is still stored under the name ROS_lr_tfidf because the following
# cells read it, even though this section uses random UNDER-sampling.
tfidf = TfidfVectorizer(
    strip_accents=None,
    lowercase=False,
    preprocessor=None,
)

lr_tfidf = Pipeline(steps=[
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0)),
])

# 2 tokenizers x 2 preprocessor settings = 4 candidates.
param_grid = [
    {
        'vect__ngram_range': [(1, 1)],
        'vect__tokenizer': [tokenizer, tokenizer_porter],
        'vect__preprocessor': [None, preprocessor],
        'clf__penalty': ['l2'],
        'clf__C': [1.0],
    },
]

ROS_lr_tfidf = GridSearchCV(
    estimator=lr_tfidf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
)

In [25]:
ROS_lr_tfidf.fit(new_X_train, y_over)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [26]:
print('Best parameter set: ' + str(ROS_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % ROS_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__preprocessor': <function preprocessor at 0x7fa7408cc8b0>, 'vect__tokenizer': <function tokenizer_porter at 0x7fa7408cc820>}
Best accuracy: 0.883


In [27]:
# NOTE: despite the ROS_ prefix this is the model trained on the UNDER-sampled
# data. The rebinding from GridSearchCV to its best pipeline also means this
# cell cannot be re-run without re-fitting the search first.
ROS_lr_tfidf = ROS_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % ROS_lr_tfidf.score(X_test, y_test))

Accuracy in test: 0.899
