In [2]:
import nltk 
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./airline_review_training_data.csv")
df

Unnamed: 0,text,airline_sentiment
0,@USAirways BF has been stuck in CLT all day. I...,0
1,@united DM sent. This lack if customer servic...,0
2,@JetBlue thank you for the information.,1
3,@AmericanAir I DMed you my AA &amp; phone #s &...,0
4,@united @44Stocker my wife Sarah stocker did a...,0
...,...,...
8073,@JetBlue Apparently the pilot had made some an...,0
8074,@united customer service sucks! They hang up ...,0
8075,@USAirways I paid for my seat. I expect to be...,0
8076,"@united so, not only were you Late Flight, you...",0


In [4]:
# Cleaning Training Data set
import re
training_data_clean = (
    # start with training data
    df
    # remove all the '\n' values with space
     .assign(clean_document= lambda x: [re.sub(r"&amp;","",text) for text in x.text])
    #Remove “n\”
    .assign(clean_document= lambda x: [re.sub(r"\n","",text) for text in x.clean_document] )
    #Remove mentions i.e. any alphanumeric starting with “@”
    .assign(clean_document= lambda x: [re.sub("@[A-Za-z0-9_]+","",text) for text in x.clean_document])
    #Remove all hashtags i.e. “#”
    .assign(clean_document= lambda x: [re.sub("#[A-Za-z0-9_]+","",text) for text in x.clean_document])
#     
#     .assign(clean_document= lambda x: [re.sub(r"#","",text) for text in x.clean_document])
#    Remove all links i.e. any alphanumeric starting with https or http
    .assign(clean_document= lambda x: [re.sub(r"http\S+","",text) for text in x.clean_document])   
)
training_data_clean


Unnamed: 0,text,airline_sentiment,clean_document
0,@USAirways BF has been stuck in CLT all day. I...,0,BF has been stuck in CLT all day. Is the loun...
1,@united DM sent. This lack if customer servic...,0,DM sent. This lack if customer service is ge...
2,@JetBlue thank you for the information.,1,thank you for the information.
3,@AmericanAir I DMed you my AA &amp; phone #s &...,0,I DMed you my AA phone you can't have some...
4,@united @44Stocker my wife Sarah stocker did a...,0,my wife Sarah stocker did also called but co...
...,...,...,...
8073,@JetBlue Apparently the pilot had made some an...,0,Apparently the pilot had made some announceme...
8074,@united customer service sucks! They hang up ...,0,customer service sucks! They hang up after w...
8075,@USAirways I paid for my seat. I expect to be...,0,I paid for my seat. I expect to be able to u...
8076,"@united so, not only were you Late Flight, you...",0,"so, not only were you Late Flight, you broke ..."


In [5]:
# Feature Extraction
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features = 60)
# fit_transform(raw_documents[, y])===>Learn vocabulary and idf, return document-term matrix.
X = vectorizer.fit_transform([i for i in training_data_clean['clean_document']])
df_tfid_sklearn = pd.DataFrame(X.toarray(),columns=vectorizer.get_feature_names())
Y_train = training_data_clean['airline_sentiment']
X_train =df_tfid_sklearn.iloc[:, :-1] # aLL ROWS EXCEPT LAST
Y_train


0       0
1       0
2       1
3       0
4       0
       ..
8073    0
8074    0
8075    0
8076    0
8077    1
Name: airline_sentiment, Length: 8078, dtype: int64

In [6]:
# Training the classifier & predicting on test data
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
# Learn using fit method
classifier.fit(X_train, Y_train)

MultinomialNB()

In [7]:
# Cleaning the test Data
test_data = pd.read_csv("./airline_review_test_data.csv")
test_data
test_data_clean = (
    # start with training data
    test_data
    # remove all the '\n' values with space
     .assign(clean_document= lambda x: [re.sub(r"&amp;","",text) for text in x.text])
    #Remove “n\”
    .assign(clean_document= lambda x: [re.sub(r"\n","",text) for text in x.clean_document] )
    #Remove mentions i.e. any alphanumeric starting with “@”
    .assign(clean_document= lambda x: [re.sub("@[A-Za-z0-9_]+","",text) for text in x.clean_document])
    #Remove all hashtags i.e. “#”
    .assign(clean_document= lambda x: [re.sub("#[A-Za-z0-9_]+","",text) for text in x.clean_document])
#     
#     .assign(clean_document= lambda x: [re.sub(r"#","",text) for text in x.clean_document])
#    Remove all links i.e. any alphanumeric starting with https or http
    .assign(clean_document= lambda x: [re.sub(r"http\S+","",text) for text in x.clean_document])   
)
test_data_clean

# use same vectorizer to transform the data
# transform(raw_documents)================>Transform documents to document-term matrix.
X_test = vectorizer.transform([i for i in test_data_clean['clean_document']])
df_tfid_sklearn_test = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names())
X_test =df_tfid_sklearn_test.iloc[:, :-1] # aLL ROWS EXCEPT LAST
Y_test =df_tfid_sklearn_test.iloc[:, -1] # lAST row
Y_test = Y_test.astype(int)
test_data_clean['airline_sentiment']



0       0
1       1
2       0
3       0
4       0
       ..
3458    1
3459    0
3460    0
3461    0
3462    0
Name: airline_sentiment, Length: 3463, dtype: int64

In [8]:
X_test

Unnamed: 0,all,an,and,are,at,be,been,but,can,cancelled,...,up,us,was,we,what,when,why,will,with,you
0,0.0,0.000000,0.311435,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.425578,0.0
1,0.0,0.000000,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
2,0.0,0.000000,0.250795,0.000000,0.000000,0.0,0.000000,0.357448,0.000000,0.371283,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
3,0.0,0.000000,0.000000,0.356425,0.000000,0.0,0.000000,0.000000,0.000000,0.352969,...,0.00000,0.00000,0.0,0.349927,0.000000,0.0,0.000000,0.0,0.000000,0.0
4,0.0,0.000000,0.319869,0.000000,0.436946,0.0,0.000000,0.000000,0.450728,0.000000,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,0.0,0.000000,0.222394,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.36203,0.00000,0.0,0.000000,0.363889,0.0,0.000000,0.0,0.000000,0.0
3459,0.0,0.000000,0.000000,0.000000,0.415854,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.52088,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0
3460,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.550656,0.0,0.000000,0.0
3461,0.0,0.393837,0.000000,0.000000,0.000000,0.0,0.392894,0.000000,0.000000,0.000000,...,0.00000,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0


In [9]:
Y_test = Y_test.astype(int)
Y_test


0       0
1       0
2       0
3       0
4       0
       ..
3458    0
3459    0
3460    0
3461    0
3462    0
Name: your, Length: 3463, dtype: int32

In [10]:
# Prediction and Classification Report
from sklearn.metrics import accuracy_score, classification_report

y_pred = classifier.predict(X_test)


# Classification metrics

def generate_classrpt(Y_test, y_pred,algo=''):
    if algo!= "":
        print(" Clasification report for {algo}".format(algo=algo))
    classification_report_= classification_report(Y_test, y_pred)
    print('\n Accuracy: ', accuracy_score(Y_test, y_pred))
    print('\nClassification Report')
    print('======================================================')
    print('\n', classification_report_)
    return True

generate_classrpt(Y_test, y_pred,algo='MNB')



 Clasification report for MNB

 Accuracy:  0.9301183944556742

Classification Report

               precision    recall  f1-score   support

           0       1.00      0.93      0.96      3461
           1       0.00      0.00      0.00         2

    accuracy                           0.93      3463
   macro avg       0.50      0.47      0.48      3463
weighted avg       1.00      0.93      0.96      3463



True

In [11]:
from sklearn import model_selection
import joblib
# save the model to disk
filename = 'finalized_MNB_model.sav'
joblib.dump(classifier, filename)
 
# some time later...
 
# load the model from disk
loaded_model = joblib.load('finalized_MNB_model.sav')
result = loaded_model.score(X_test, Y_test)
print(result)

0.9301183944556742


In [12]:
# TASK2
!pip install TextBlob



In [13]:
!pip freeze


certifi==2021.10.8
click @ file:///D:/bld/click_1645238350348/work
colorama @ file:///home/conda/feedstock_root/build_artifacts/colorama_1602866480661/work
joblib @ file:///home/conda/feedstock_root/build_artifacts/joblib_1633637554808/work
nltk==3.7
regex @ file:///D:/bld/regex_1646210304648/work
textblob==0.17.1
tqdm @ file:///home/conda/feedstock_root/build_artifacts/tqdm_1646031859244/work
wincertstore==0.2


In [14]:
from textblob import TextBlob


# it accepts single text
# for a corpus, we need to run it in a loop

# Polarity score ranges from -1 to 1 
# -1 means very negative , 0 means nutral and +1 means positive sentiment


corpus = ['I am happy','I am very happy', 'I am HAPPY' , 'I am happy but sad too','that SUX' ,'I am happy!!!!!']

for text in corpus:
 score = TextBlob(text).sentiment[0]
 print (f"{text}: Sentiment score, {score:0.3f}")

ModuleNotFoundError: No module named 'textblob'

In [None]:
!pip freeze


In [None]:
strr = 'https://stackoverflow.com/questions/3559559/how-to-delete-a-character-from-a-string-using-python'
check  = re.sub("http[A-Za-z0-9_]+","",strr)
check2  = re.sub(r"http\S+","",strr)
strr2 ='@wertyyy'
strr2_check = re.sub("@[A-Za-z0-9_]+","",strr2)
# cc=r'http\S+'


print('check2',check2)
print('check',check)
print('strr2_check:',strr2_check)

In [None]:
""" 
Reference:
https://medium.com/swlh/text-classification-using-tf-idf-7404e75565b8
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
https://datatofish.com/string-to-integer-dataframe/
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html
# Saving Model
https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/

"""
    

In [None]:
import sys
print(sys.path)