# Sentiment Analysis Experiment

## Import Libraries

In [1]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!pip install wordninja

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import re
import numpy as np
import emoji
import wordninja
import nltk
from nltk.corpus import stopwords
import spacy
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics import classification_report

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# spaCy English pipeline; clean() uses it to lemmatise tokens.
nlp = spacy.load('en_core_web_sm')
# NLTK English stop words, kept in a set for the membership filter in clean().
stop_words = set(stopwords.words('english'))

## Loading Data

In [6]:
# Location of the airline-sentiment CSV.
# NOTE(review): presumably a Google Drive mount inside Colab — confirm before running elsewhere.
datapath = '/content/drive/MyDrive/airline_sentiment_analysis.csv'

In [7]:
# Read the CSV at `datapath` into a DataFrame.
data=pd.read_csv(datapath)

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,airline_sentiment,text
0,1,positive,@VirginAmerica plus you've added commercials t...
1,3,negative,@VirginAmerica it's really aggressive to blast...
2,4,negative,@VirginAmerica and it's a really big bad thing...
3,5,negative,@VirginAmerica seriously would pay $30 a fligh...
4,6,positive,"@VirginAmerica yes, nearly every time I fly VX..."


In [9]:
len(data)

11541

In [10]:
data.columns

Index(['Unnamed: 0', 'airline_sentiment', 'text'], dtype='object')

In [11]:
# Keep only the label and the tweet text. The explicit .copy() makes `data` an
# independent frame, so the column assignments in later cells never write to a
# slice of the loaded frame (which can raise SettingWithCopyWarning).
data = data[['airline_sentiment', 'text']].copy()

## Data Mining

In [12]:
data.isna().sum()

airline_sentiment    0
text                 0
dtype: int64

In [13]:
data.groupby('airline_sentiment').count()

Unnamed: 0_level_0,text
airline_sentiment,Unnamed: 1_level_1
negative,9178
positive,2363


### Data Cleaning

In [14]:
# Punctuation tokens to drop, held in a set so membership is an exact match
# rather than a substring test against one long string.
_PUNCTUATION = frozenset(string.punctuation) | {'...'}


def clean(text):
    """Normalise a raw tweet into a list of lemmatised, lower-cased tokens.

    Steps, in order: drop @mentions, '#' signs and words starting with
    'http'; convert emoji to text; split run-together words with wordninja;
    lemmatise with the module-level spaCy pipeline `nlp`; drop punctuation,
    the '-pron-' lemma, empty tokens and anything in the module-level
    `stop_words` set.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        Cleaned tokens, in their original order.
    """
    # Remove mentions, hash signs and URLs.
    without_mentions = re.sub(r'@\w+', '', text)
    without_hashes = re.sub(r'#', '', without_mentions)
    # split() with no argument breaks on any whitespace, so a URL that follows
    # a newline or tab is dropped too (split(" ") only separates on spaces).
    without_urls = ' '.join(
        word for word in without_hashes.split() if not word.startswith('http')
    )

    # Transform emoji to text.
    demojized = emoji.demojize(without_urls)

    # Split compound words coming from hashtags.
    pieces = " ".join(wordninja.split(demojized))

    # Lemmatise, lower-case and filter the tokens.
    tokens = []
    for token in nlp(pieces):
        lemma = token.lemma_.lower().strip()
        # Empty lemmas must be skipped explicitly: with an exact-match set they
        # are no longer caught by the punctuation check.
        if not lemma or lemma == '-pron-' or lemma in _PUNCTUATION:
            continue
        tokens.append(lemma)

    # Remove stop words.
    return [token for token in tokens if token not in stop_words]

In [15]:
clean(data['text'].values[0])

['plus', "'ve", 'add', 'commercial', 'experience', 'tacky']

In [16]:
# Run clean() on every tweet; each row of the new column holds that tweet's token list.
data["clean_text_list"]=data["text"].apply(clean)

In [17]:
data[["text","clean_text_list",'airline_sentiment']].head()

Unnamed: 0,text,clean_text_list,airline_sentiment
0,@VirginAmerica plus you've added commercials t...,"[plus, 've, add, commercial, experience, tacky]",positive
1,@VirginAmerica it's really aggressive to blast...,"[really, aggressive, blast, obnoxious, enterta...",negative
2,@VirginAmerica and it's a really big bad thing...,"[really, big, bad, thing]",negative
3,@VirginAmerica seriously would pay $30 a fligh...,"[seriously, would, pay, 30, flight, seat, play...",negative
4,"@VirginAmerica yes, nearly every time I fly VX...","[yes, nearly, every, time, fly, v, x, ear, wor...",positive


### Vectorisation - TF-IDF

In [18]:
# Re-join each token list into a single space-separated string for the vectoriser.
data["clean_text"] = [" ".join(tokens) for tokens in data["clean_text_list"]]

# Features: the cleaned strings as an array.
X = data["clean_text"].values

# Target: sentiment label encoded as -1 (negative) / 1 (positive).
Y = data["airline_sentiment"].map({"negative": -1, "positive": 1})

#### Splitting Training and Testing Data

In [19]:
# Hold out 30% of the rows for testing, stratified on the label, with a fixed
# seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
    stratify=Y,
)

In [20]:
# Fit the TF-IDF vectoriser on the training texts only, then apply the fitted
# transform to the held-out texts.
tfidf_vector = TfidfVectorizer()
X_train_tf = tfidf_vector.fit_transform(X_train)
X_test_tf = tfidf_vector.transform(X_test)

### Modelling

#### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Logistic regression fitted on the TF-IDF training matrix.
model_lr = LogisticRegression(
    random_state=0,
    solver='lbfgs',
    max_iter=500,
).fit(X_train_tf, y_train)

In [23]:
# Predict labels for both splits so train and test scores can be compared.
# pred_train / pred_test are reassigned by each later model's prediction cell.
pred_train=model_lr.predict(X_train_tf)
pred_test=model_lr.predict(X_test_tf)

##### Logistic Regression Metrics

In [24]:
# Accuracy on each split (rounded to 4 decimals), a blank line, then the
# test-set confusion matrix with labels ordered [-1, 1].
print(
    "Logistic Regression Train Accuracy:",
    np.round(metrics.accuracy_score(y_train, pred_train), 4),
)
print(
    "Logistic Regression Test Accuracy:",
    np.round(metrics.accuracy_score(y_test, pred_test), 4),
)
print("")
print(
    "Logistic Regression Confusion Matrix:",
    metrics.confusion_matrix(y_test, pred_test, labels=[-1, 1]),
    sep="\n",
)


Logistic Regression Train Accuracy: 0.9226
Logistic Regression Test Accuracy: 0.9007

Logistic Regression Confusion Matrix:
[[2704   50]
 [ 294  415]]


In [25]:
# Per-class precision, recall and F1 for the logistic-regression test predictions.
# NOTE(review): target_names presumably line up with the sorted labels (-1, then 1) — confirm.
print(classification_report(y_test, pred_test,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.98      0.94      2754
    Positive       0.89      0.59      0.71       709

    accuracy                           0.90      3463
   macro avg       0.90      0.78      0.82      3463
weighted avg       0.90      0.90      0.89      3463



#### Gradient Boosting Classifier

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
# Gradient-boosted trees fitted on the TF-IDF training matrix.
model_gbm = GradientBoostingClassifier(
    n_estimators=180,
    max_depth=6,
    random_state=0,
    learning_rate=0.1,
).fit(X_train_tf, y_train)

In [28]:
# GBM predictions for both splits; this overwrites pred_train / pred_test from the previous model.
pred_train=model_gbm.predict(X_train_tf)
pred_test=model_gbm.predict(X_test_tf)

##### GBM Metrics

In [29]:
# Accuracy on each split (rounded to 4 decimals) and the test-set confusion
# matrix with labels ordered [-1, 1]. Each label names the quantity it prints.
print("GBM Train Accuracy:", np.round(metrics.accuracy_score(y_train, pred_train), 4))
print("GBM Test Accuracy:", np.round(metrics.accuracy_score(y_test, pred_test), 4))
print("")
print("GBM Confusion Matrix:", metrics.confusion_matrix(y_test, pred_test, labels=[-1, 1]), sep="\n")

GBM Train Accuracy: 0.944
GBM Train Accuracy: 0.8929

GBM Train Accuracy:
[[2635  119]
 [ 252  457]]


In [30]:
# Per-class precision, recall and F1 for the GBM test predictions.
# NOTE(review): target_names presumably line up with the sorted labels (-1, then 1) — confirm.
print(classification_report(y_test, pred_test,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.91      0.96      0.93      2754
    Positive       0.79      0.64      0.71       709

    accuracy                           0.89      3463
   macro avg       0.85      0.80      0.82      3463
weighted avg       0.89      0.89      0.89      3463



#### XGBoost Classifier

In [31]:
from xgboost import XGBClassifier

In [32]:
# XGBoost classifier fitted on the TF-IDF training matrix.
# NOTE(review): y_train holds -1/1 labels (see the Y mapping) while the objective
# is the multi-class 'multi:softprob' with num_class=2 — confirm this is intended
# and accepted by the installed xgboost version.
model_xgb = XGBClassifier(
    n_estimators=200,
    random_state=0,
    learning_rate=0.7,
    objective='multi:softprob',
    num_class=2,
).fit(X_train_tf, y_train)

In [33]:
# XGBoost predictions for both splits; this overwrites pred_train / pred_test from the previous model.
pred_train=model_xgb.predict(X_train_tf)
pred_test=model_xgb.predict(X_test_tf)

##### XGBoost Metrics

In [34]:
# Accuracy on each split (rounded to 4 decimals) and the test-set confusion
# matrix with labels ordered [-1, 1]. Each label names the quantity it prints.
print("XGBoost train Accuracy:", np.round(metrics.accuracy_score(y_train, pred_train), 4))
print("XGBoost test Accuracy:", np.round(metrics.accuracy_score(y_test, pred_test), 4))
print("")
print("XGBoost Confusion Matrix:", metrics.confusion_matrix(y_test, pred_test, labels=[-1, 1]), sep="\n")

XGBoost train Accuracy: 0.9751
XGBoost train Accuracy: 0.8934

XGBoost train Accuracy:
[[2623  131]
 [ 238  471]]


In [35]:
# Per-class precision, recall and F1 for the XGBoost test predictions.
# NOTE(review): target_names presumably line up with the sorted labels (-1, then 1) — confirm.
print(classification_report(y_test, pred_test,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.92      0.95      0.93      2754
    Positive       0.78      0.66      0.72       709

    accuracy                           0.89      3463
   macro avg       0.85      0.81      0.83      3463
weighted avg       0.89      0.89      0.89      3463



#### Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [37]:
# Random forest fitted on the TF-IDF training matrix.
model_rf = RandomForestClassifier(
    n_estimators=100,
    random_state=56,
).fit(X_train_tf, y_train)

In [38]:
# Random-forest predictions for both splits; this overwrites pred_train / pred_test from the previous model.
pred_train=model_rf.predict(X_train_tf)
pred_test=model_rf.predict(X_test_tf)

##### Random Forest Metrics

In [39]:
# Accuracy on each split (rounded to 4 decimals) and the test-set confusion
# matrix with labels ordered [-1, 1]. Each label names the quantity it prints.
print("RandomForest train Accuracy:", np.round(metrics.accuracy_score(y_train, pred_train), 4))
print("RandomForest test Accuracy:", np.round(metrics.accuracy_score(y_test, pred_test), 4))
print("")
print("RandomForest Confusion Matrix:", metrics.confusion_matrix(y_test, pred_test, labels=[-1, 1]), sep="\n")

RandomForest train Accuracy: 0.999
RandomForest train Accuracy: 0.8937

RandomForest train Accuracy:
[[2647  107]
 [ 261  448]]


In [40]:
# Per-class precision, recall and F1 for the random-forest test predictions.
# NOTE(review): target_names presumably line up with the sorted labels (-1, then 1) — confirm.
print(classification_report(y_test, pred_test,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.91      0.96      0.94      2754
    Positive       0.81      0.63      0.71       709

    accuracy                           0.89      3463
   macro avg       0.86      0.80      0.82      3463
weighted avg       0.89      0.89      0.89      3463



**Reasons for Choosing These Models**

1) Lightweight

2) Low data requirements

3) Lower risk of overfitting thanks to ensembling and regularisation techniques

**Out of all the models, we choose Logistic Regression**

1) High precision on the positive class (0.89, versus 0.78–0.81 for the other models), which matters because that class has far less data

2) High F1-score