## Airline Sentiment

In [1]:
%load_ext watermark
%watermark 

Last updated: 2022-04-15T10:43:35.422445-07:00

Python implementation: CPython
Python version       : 3.9.12
IPython version      : 8.0.1

Compiler    : MSC v.1929 64 bit (AMD64)
OS          : Windows
Release     : 10
Machine     : AMD64
Processor   : Intel64 Family 6 Model 142 Stepping 12, GenuineIntel
CPU cores   : 8
Architecture: 64bit



In [2]:
## We start by importing the packages we will use.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import datetime
import re

from nltk.corpus import stopwords

# Stemming
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# cleaning geographical data
#import geotext
import nltk

# vectorisor
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

# metrics
from sklearn.metrics import confusion_matrix, classification_report, cohen_kappa_score, roc_auc_score, roc_curve

# multiclass
from sklearn.multiclass import OneVsRestClassifier

# hyperparameter tuning
from sklearn.model_selection import GridSearchCV

## Import Local Functions

from functions import (preview_data,
                       get_missing_counts,
                       get_value_counts,
                       get_unique_column_count,
                       get_datetimes,
                       drop_null_values)

# Display configuration for the whole notebook.
# Option names are spelled out in full: pandas treats a non-exact name as a regex
# pattern, and a short form such as 'max.columns' raises OptionError as soon as
# more than one registered option matches it.
pd.set_option('display.float_format', '{:.3f}'.format)  # floats rendered with 3 decimals
pd.set_option('display.max_rows', None)       # never truncate rows
pd.set_option('display.max_columns', None)    # never truncate columns
pd.set_option('display.max_colwidth', 200)    # wide enough to read a whole tweet

In [3]:
## Load the tweets CSV (path is relative to the notebook's working directory)
## into `df` and display its first three rows.
df = pd.read_csv('./Airlines-NLP-Data/Tweets.csv')
df.head(3)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.349,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.684,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)


In [4]:
## Preview the data with the `preview_data` helper imported from the local
## `functions` module; its output below shows the first five rows, the shape
## and the column info of `df`.
preview_data(df)

First Five Rows of Data: 



Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.349,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.684,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.703,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)



 Shape: 

(14640, 15)

 Info: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created              

In [5]:
## Check for null values: the `get_missing_counts` helper (local `functions`
## module) prints, per column, the missing count and its percentage.
get_missing_counts(df)

Missing Value Percentages by Column: 

tweet_id -----> 0 -----> 0.0%
airline_sentiment -----> 0 -----> 0.0%
airline_sentiment_confidence -----> 0 -----> 0.0%
negativereason -----> 5462 -----> 37.31%
negativereason_confidence -----> 4118 -----> 28.13%
airline -----> 0 -----> 0.0%
airline_sentiment_gold -----> 14600 -----> 99.73%
name -----> 0 -----> 0.0%
negativereason_gold -----> 14608 -----> 99.78%
retweet_count -----> 0 -----> 0.0%
text -----> 0 -----> 0.0%
tweet_coord -----> 13621 -----> 93.04%
tweet_created -----> 0 -----> 0.0%
tweet_location -----> 4733 -----> 32.33%
user_timezone -----> 4820 -----> 32.92%


Neither the target column (`airline_sentiment`) nor the predictor column (`text`) is missing any values. Let's define X and y.

In [6]:
## Predictor: the raw tweet text. Target: the labelled sentiment.
X, y = df['text'], df['airline_sentiment']

## Hold out 20% of the tweets for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(11712,) (2928,) (11712,) (2928,)


We proceed to clean our tweets.

In [11]:
## Preview the first two raw tweets in X (the predictor series) before cleaning.
X.head(2)

0                                         @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials to the experience... tacky.
Name: text, dtype: object

In [13]:
## Preview the first two sentiment labels in y (the target series).
y.head(2)

0     neutral
1    positive
Name: airline_sentiment, dtype: object

In [8]:
## We load NLTK's English stopwords into a module-level list; the clean_tweet
## transformer looks this name up when it filters tokens.
## (No `global` statement is needed: a top-level assignment is already global.)
try:
    stopwords_list = stopwords.words('english')
except LookupError:
    # The stopwords corpus is not installed yet on this machine; fetch it once.
    nltk.download('stopwords')
    stopwords_list = stopwords.words('english')

## We take a look at all the stopwords in the english language.
print(stopwords_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [None]:
class clean_tweet(BaseEstimator,TransformerMixin):
    """Stateless scikit-learn transformer that normalises raw tweet text.

    For every tweet the transformer:

    1. lower-cases the text;
    2. deletes apostrophes, @mentions, #hashtags and URLs;
    3. blanks out ``( ) ! ?``, square-bracketed spans and every remaining
       character outside ``a-z0-9``;
    4. drops tokens found in the module-level ``stopwords_list``;
    5. Porter-stems the surviving tokens and joins them with single spaces.

    Apostrophes are removed *before* the stopword filter, so contractions such
    as "didn't" become "didnt" and are kept as tokens rather than filtered out.

    NOTE(review): HTML entities are not decoded, so ``&amp;`` in a tweet leaves
    the token ``amp`` behind -- confirm whether that is wanted.
    """

    # (pattern, replacement) pairs applied in order to the lower-cased tweet.
    # Compiled once at class creation; raw strings avoid invalid-escape warnings.
    _SUBSTITUTIONS = (
        (re.compile(r"'"), ""),               # keep contractions as a single token
        (re.compile(r"@[A-Za-z0-9_]+"), ""),  # @mentions
        (re.compile(r"#[A-Za-z0-9_]+"), ""),  # #hashtags
        (re.compile(r"http\S+"), ""),         # URLs
        (re.compile(r"[()!?]"), " "),
        (re.compile(r"\[.*?\]"), " "),        # square-bracketed spans, contents included
        (re.compile(r"[^a-z0-9]"), " "),      # anything that is not a lowercase letter or digit
    )

    def __init__(self):
        """Create the transformer; it has no hyperparameters."""

    def fit(self,X,y=None):
        """Do nothing and return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X, y = None):
        """Return a cleaned copy of ``X``.

        Parameters
        ----------
        X : object exposing ``.apply`` (e.g. a pandas Series of tweets)
            Raw tweet texts. Non-string entries (NaN, None, ...) are mapped
            to the empty string instead of raising.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        The result of ``X.apply``: one cleaned, stemmed, space-joined string
        per input tweet. ``X`` itself is not modified.
        """
        # Built once per call instead of once per tweet.
        stemmer = PorterStemmer()
        stop_words = set(stopwords_list)  # set membership is O(1) per token

        def clean_tweet_func(tweet):
            # Missing values arrive as float NaN or None; treat any non-string as empty.
            if not isinstance(tweet, str):
                return ""

            text = tweet.lower()
            for pattern, replacement in self._SUBSTITUTIONS:
                text = pattern.sub(replacement, text)

            return " ".join(
                stemmer.stem(token)
                for token in text.split()
                if token not in stop_words
            )

        # `.apply` already returns a new object, so no defensive copy is needed.
        return X.apply(clean_tweet_func)