# Sentiment Analysis Data
## 1. Load Required Libraries

In [1]:
import pandas as pd
import joblib
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

In [2]:
# Fetch the NLTK resources 'wordnet' and 'omw-1.4'; the notebook later builds a
# WordNetLemmatizer, which presumably relies on them (no-op if already present).
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nvic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nvic\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## 2. Constant Variables

In [16]:
# Column names selected from the raw CSV: model input text and target label.
PREDICTOR = "text"
LABEL = "airline_sentiment"
# Raw dataset read by load_data().
DATASET_PATH = "data/raw/tweet_airlines.csv"
# Output locations for the fitted preprocessing objects (written with joblib.dump).
# NOTE(review): the "models/" directory is assumed to exist already — confirm.
VECTORIZER_PATH = "models/vectorizer.pkl"
LABEL_ENCODER_PATH = "models/le.pkl"
STANDARD_SCALER_PATH = "models/standard_scaler.pkl"


## 3. Load Data

In [4]:
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV file at ``path`` with pandas' default settings.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed contents as a DataFrame.
    """
    frame = pd.read_csv(path)
    return frame

def get_predictor_and_label(dataset: pd.DataFrame, label: str, predictor: "str | list") -> pd.DataFrame:
  """Return a new frame holding only the predictor column(s) followed by the label.

  Args:
      dataset: Source frame; it is not modified.
      label: Name of the target column.
      predictor: A single column name or a list of column names.

  Returns:
      A DataFrame whose columns are the predictor(s) in the given order,
      then ``label``.

  Raises:
      KeyError: If any requested column is missing from ``dataset``.
  """
  # The previous annotation `str or list` is a boolean expression that evaluates
  # to plain `str`; a string annotation states the real contract.
  predictor_columns = predictor if isinstance(predictor, list) else [predictor]
  # Selecting with a list already yields a new frame, so only the needed columns
  # are copied instead of duplicating the whole dataset first.
  return dataset[predictor_columns + [label]].copy()

def duplicates_handler(dataset: pd.DataFrame) -> pd.DataFrame:
  """Return a new frame with duplicated rows dropped; the input is left untouched.

  Rows are compared across all columns using ``DataFrame.drop_duplicates`` with
  its default arguments, and the surviving rows keep their original index labels.
  """
  deduplicated = dataset.drop_duplicates()
  return deduplicated

def data_splitting(
    dataset: pd.DataFrame,
    label: str,
    predictor: "str | list",
    test_size: float = 0.3,
    holdout_test_size: float = 0.5,
    random_state: int = 123,
    stratify: bool = False,
) -> tuple:
    """Split a dataset into train, validation and test parts.

    The data is first split into a training part and a hold-out part, then the
    hold-out part is split again into validation and test. With the defaults
    this is the original 70 / 15 / 15 split with seed 123.

    Args:
        dataset: Frame containing both predictor and label columns (not modified).
        label: Name of the target column.
        predictor: A column name or list of column names used as features.
        test_size: Share of all rows put into the hold-out part.
        holdout_test_size: Share of the hold-out part that becomes the test set;
            the remainder becomes the validation set.
        random_state: Seed passed to both ``train_test_split`` calls.
        stratify: When True, both splits are stratified on the label so each
            part keeps the class proportions. Defaults to False, which
            reproduces the previous unstratified behaviour.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid, y_test)``.
    """
    # No defensive copy of the whole frame: nothing here mutates `dataset`.
    features = dataset[predictor]
    target = dataset[label]

    x_train, x_holdout, y_train, y_holdout = train_test_split(
        features,
        target,
        test_size = test_size,
        random_state = random_state,
        stratify = target if stratify else None
    )
    # Second cut: the "train" side of this split is the validation set.
    x_valid, x_test, y_valid, y_test = train_test_split(
        x_holdout,
        y_holdout,
        test_size = holdout_test_size,
        random_state = random_state,
        stratify = y_holdout if stratify else None
    )
    return x_train, x_valid, x_test, y_train, y_valid, y_test

In [5]:
# Read the raw tweets CSV into memory.
data = load_data(DATASET_PATH)

In [6]:
data.shape

(14640, 14)

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,name,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,0,570306133677760513,neutral,1.0,,,Virgin America,cairdin,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,1,570301130888122368,positive,0.3486,,0.0,Virgin America,jnardino,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,2,570301083672813571,neutral,0.6837,,,Virgin America,yvonnalynn,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,jnardino,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,jnardino,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [8]:
# Keep only the tweet text and sentiment label; note this rebinds `data`,
# so the full raw frame is no longer reachable after this cell.
data = get_predictor_and_label(data, LABEL, PREDICTOR)

In [9]:
data.head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [10]:
# Remove rows whose (text, label) pair is repeated; rebinds `data` again.
data = duplicates_handler(data)

In [11]:
data.shape

(14452, 2)

In [12]:
# Split into train / validation / test parts (30% held out, then halved).
x_train, x_valid, x_test, y_train, y_valid, y_test = data_splitting(data, LABEL, PREDICTOR)

## 4. Preprocessing

In [13]:
def _normalize_text(raw, stemmer, lemmatizer):
  """Stem, lemmatize and regex-clean one piece of text, returning a lower-case string."""
  # Tokens are split on single spaces only, so punctuation is still attached to
  # the words when they are passed to the stemmer and lemmatizer.
  stemmed = " ".join(stemmer.stem(word) for word in str(raw).split(" "))
  lemmatized = " ".join(lemmatizer.lemmatize(word) for word in stemmed.split(" "))

  # Replace each run of characters other than letters, digits and apostrophes
  # with one space, then trim the ends and collapse repeated spaces.
  text = re.sub('[^A-Za-z0-9\']+', ' ', lemmatized)
  text = re.sub(' +', ' ', text.strip())
  return text.lower()


def cleaning_text(data: pd.Series, stemmer, lemmatizer):
  """Clean every text in ``data`` and return the results as a Series.

  Each value is converted to ``str``, stemmed word by word, lemmatized word by
  word, stripped of everything except letters, digits and apostrophes, and
  lower-cased.

  Args:
      data: Iterable of texts (in this notebook a Series of tweets). It is
          only read, never modified.
      stemmer: Object exposing ``stem(word)``.
      lemmatizer: Object exposing ``lemmatize(word)``.

  Returns:
      A Series of cleaned strings in input order. As before, the result is
      indexed by position (0..n-1), NOT by the index of ``data``; callers that
      need the original labels must re-attach them.
  """
  # Build the whole list first: assigning `series[i] = value` to a growing
  # Series enlarges it one label at a time, which re-allocates on every row.
  cleaned = [_normalize_text(value, stemmer, lemmatizer) for value in data]
  return pd.Series(cleaned, dtype = str)

def fit_vectorizer(text, path, min_df = 50, stop_words = "english"):
    """Fit a TF-IDF vectorizer on ``text``, save it to ``path`` and return it.

    Args:
        text: Iterable of documents to learn the vocabulary and IDF weights from.
        path: Destination file for ``joblib.dump``.
        min_df: Passed to ``TfidfVectorizer``; defaults to the previously
            hard-coded value of 50.
        stop_words: Passed to ``TfidfVectorizer``; defaults to the previously
            hard-coded ``"english"``.

    Returns:
        The fitted ``TfidfVectorizer``.
    """
    tfidf = TfidfVectorizer(min_df = min_df, stop_words = stop_words)
    tfidf.fit(text)
    # Persist the fitted object so the same vocabulary can be reloaded later.
    joblib.dump(tfidf, path)
    return tfidf

def transform_text(text, vectorizer):
    """Vectorize ``text`` and return the result as a dense DataFrame.

    Args:
        text: Series of documents; its index becomes the row index of the result.
        vectorizer: Fitted object exposing ``transform`` and
            ``get_feature_names_out``.

    Returns:
        A DataFrame with one row per document and one column per feature name.
    """
    # The vectorizer output is converted to a dense array before framing.
    dense_matrix = vectorizer.transform(text).toarray()
    feature_names = vectorizer.get_feature_names_out()
    return pd.DataFrame(dense_matrix, columns = feature_names, index = text.index)
  
def fit_standard_scaler(data, path):
    """Fit a ``StandardScaler`` on ``data``, write it to ``path`` and return it."""
    scaler = StandardScaler()
    fitted_scaler = scaler.fit(data)  # fit() hands back the same scaler instance
    joblib.dump(fitted_scaler, path)
    return fitted_scaler
  
def standard_scale_data(data, standard_scaler_object):
    """Apply a fitted scaler to ``data`` and keep its row and column labels.

    Args:
        data: DataFrame to scale.
        standard_scaler_object: Fitted object exposing ``transform``.

    Returns:
        A DataFrame of transformed values carrying ``data``'s columns and index.
    """
    scaled_values = standard_scaler_object.transform(data)
    relabelled = (
        pd.DataFrame(scaled_values)
        .set_axis(data.columns, axis = 1)
        .set_axis(data.index, axis = 0)
    )
    return relabelled
  
def le_fit(data, path):
    """Fit a ``LabelEncoder`` on ``data``, write it to ``path`` and return it."""
    encoder = LabelEncoder()
    fitted_encoder = encoder.fit(data)  # fit() hands back the same encoder instance
    joblib.dump(fitted_encoder, path)
    return fitted_encoder

def le_transform(data, le_object):
    """Encode ``data`` with a fitted label encoder.

    A copy of ``data`` is handed to ``le_object.transform`` and whatever that
    call produces is returned as-is.
    """
    labels = data.copy()
    return le_object.transform(labels)

In [14]:
# Text-normalization helpers shared by every cleaning_text / manual_predict call.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [15]:
# Clean each split independently. The returned Series are indexed by position,
# not by the original row labels of x_train / x_valid / x_test.
x_train_cleaned = cleaning_text(x_train, stemmer, lemmatizer)
x_valid_cleaned = cleaning_text(x_valid, stemmer, lemmatizer)
x_test_cleaned = cleaning_text(x_test, stemmer, lemmatizer)

In [17]:
# Fit the TF-IDF vectorizer on the training texts only and save it to disk.
vectorizer = fit_vectorizer(x_train_cleaned, VECTORIZER_PATH)

In [18]:
# Turn every split into a dense TF-IDF feature frame using the train-fitted vectorizer.
x_train_vec = transform_text(x_train_cleaned, vectorizer)
x_valid_vec = transform_text(x_valid_cleaned, vectorizer)
x_test_vec = transform_text(x_test_cleaned, vectorizer)

In [19]:
x_train_vec.shape

(10116, 327)

In [20]:
x_train_vec.head()

Unnamed: 0,10,11,15,1st,20,25,30,40,45,50,...,whi,wife,wifi,won,work,worst,wrong,year,yes,yesterday
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Fit the scaler on the training features only and save it to disk.
standard_scaler = fit_standard_scaler(x_train_vec, STANDARD_SCALER_PATH)

In [22]:
# Apply the train-fitted scaler to all three splits.
x_train_scaled = standard_scale_data(x_train_vec, standard_scaler)
x_valid_scaled = standard_scale_data(x_valid_vec, standard_scaler)
x_test_scaled = standard_scale_data(x_test_vec, standard_scaler)

In [23]:
x_train_scaled.head()

Unnamed: 0,10,11,15,1st,20,25,30,40,45,50,...,whi,wife,wifi,won,work,worst,wrong,year,yes,yesterday
0,-0.107599,-0.072827,-0.080846,-0.075046,-0.085134,-0.06803,-0.107123,-0.071025,-0.076822,-0.06899,...,-0.189738,-0.076294,-0.089355,-0.108521,-0.152032,-0.124006,-0.072238,-0.090145,-0.120185,-0.079535
1,-0.107599,-0.072827,-0.080846,-0.075046,-0.085134,-0.06803,-0.107123,-0.071025,-0.076822,-0.06899,...,-0.189738,-0.076294,-0.089355,-0.108521,-0.152032,-0.124006,-0.072238,-0.090145,-0.120185,-0.079535
2,-0.107599,-0.072827,-0.080846,-0.075046,-0.085134,-0.06803,-0.107123,-0.071025,-0.076822,-0.06899,...,-0.189738,-0.076294,-0.089355,-0.108521,-0.152032,-0.124006,-0.072238,-0.090145,-0.120185,-0.079535
3,-0.107599,-0.072827,-0.080846,-0.075046,-0.085134,-0.06803,-0.107123,-0.071025,-0.076822,-0.06899,...,-0.189738,-0.076294,-0.089355,-0.108521,-0.152032,-0.124006,-0.072238,-0.090145,-0.120185,-0.079535
4,-0.107599,-0.072827,-0.080846,-0.075046,-0.085134,-0.06803,-0.107123,-0.071025,-0.076822,-0.06899,...,-0.189738,-0.076294,-0.089355,-0.108521,-0.152032,-0.124006,-0.072238,-0.090145,-0.120185,-0.079535


In [24]:
# Fit the label encoder on the training labels and save it to disk.
le = le_fit(y_train, LABEL_ENCODER_PATH)

In [25]:
le.classes_

array(['negative', 'neutral', 'positive'], dtype=object)

In [26]:
# Encode the string labels of each split with the train-fitted encoder.
y_train_encoded = le_transform(y_train, le)
y_valid_encoded = le_transform(y_valid, le)
y_test_encoded = le_transform(y_test, le)

## 5. Model Training

In [27]:
def dtc_fit(x_train, y_train, scoring = 'accuracy'):
    """Tune a decision tree with a randomized search and return the fitted search.

    Args:
        x_train: Training features.
        y_train: Training targets.
        scoring: Metric name forwarded to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object (15 sampled candidates,
        5-fold cross-validation, seed 123 for both the tree and the search).
    """
    search_space = dict(
        min_samples_leaf = [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
        max_features = ['sqrt', 'log2', 0.25, 0.5, 0.75],
    )
    base_tree = DecisionTreeClassifier(random_state = 123)

    search = RandomizedSearchCV(
        base_tree,
        param_distributions = search_space,
        n_iter = 15,
        cv = 5,
        scoring = scoring,
        random_state = 123,
        n_jobs = -1,  # use every available core
    )
    search.fit(x_train, y_train)
    return search

In [28]:
# Run the decision-tree hyper-parameter search on the scaled training features.
dtc = dtc_fit(x_train_scaled, y_train_encoded)

In [29]:
def rfc_fit(x_train, y_train, scoring = 'accuracy'):
    """Tune a random forest with a randomized search and return the fitted search.

    Args:
        x_train: Training features.
        y_train: Training targets.
        scoring: Metric name forwarded to ``RandomizedSearchCV``.

    Returns:
        The fitted ``RandomizedSearchCV`` object (10 sampled candidates,
        5-fold cross-validation, seed 123 for both the forest and the search).
    """
    search_space = dict(
        min_samples_leaf = [3, 5, 7, 9, 13, 17, 21, 27, 33],
        max_features = [0.25, 0.5, 0.75],
        n_estimators = [30, 40, 60, 100],
    )
    base_forest = RandomForestClassifier(random_state = 123)

    search = RandomizedSearchCV(
        base_forest,
        param_distributions = search_space,
        n_iter = 10,
        cv = 5,
        scoring = scoring,
        random_state = 123,
        n_jobs = -1,  # use every available core
        verbose = 124,  # progress-logging level, kept exactly as configured
    )
    search.fit(x_train, y_train)
    return search

In [30]:
# Run the random-forest hyper-parameter search on the scaled training features.
rfc = rfc_fit(x_train_scaled, y_train_encoded)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [31]:
# Persist both fitted search objects.
# NOTE(review): these paths are literals, while the other artifact paths are
# defined as constants in section 2 — consider moving them there.
joblib.dump(dtc, "models/dtc.pkl")
joblib.dump(rfc, "models/rfc.pkl")

['models/rfc.pkl']

## 6. Model Evaluation

In [32]:
# Validation-set report for the decision tree. Rows 0 / 1 / 2 are the encoded
# classes in le.classes_ order: negative, neutral, positive.
y_pred = dtc.predict(x_valid_scaled)
print(classification_report(y_valid_encoded, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.75      0.90      0.82      1393
           1       0.48      0.25      0.33       453
           2       0.59      0.50      0.54       322

    accuracy                           0.70      2168
   macro avg       0.61      0.55      0.56      2168
weighted avg       0.67      0.70      0.68      2168



In [33]:
# Validation-set report for the random forest (same class encoding as above cell's
# encoder: 0 = negative, 1 = neutral, 2 = positive). Overwrites the previous y_pred.
y_pred = rfc.predict(x_valid_scaled)
print(classification_report(y_valid_encoded, y_pred, zero_division = 0))

              precision    recall  f1-score   support

           0       0.77      0.92      0.84      1393
           1       0.57      0.32      0.41       453
           2       0.69      0.50      0.58       322

    accuracy                           0.73      2168
   macro avg       0.67      0.58      0.61      2168
weighted avg       0.71      0.73      0.71      2168



## 7. Prediction

In [34]:
def manual_predict(text_data, model, stemmer, lemmatizer, vectorizer, standard_scaler, le):
  """Run raw texts through the full pipeline and return decoded predictions.

  Steps: copy the input, clean it with ``cleaning_text``, vectorize it with
  ``transform_text``, scale it with ``standard_scale_data``, predict with
  ``model`` and map the predictions back through ``le.inverse_transform``.

  Args:
      text_data: Raw texts (must support ``.copy()``).
      model: Fitted estimator exposing ``predict``.
      stemmer: Stemmer forwarded to ``cleaning_text``.
      lemmatizer: Lemmatizer forwarded to ``cleaning_text``.
      vectorizer: Fitted vectorizer forwarded to ``transform_text``.
      standard_scaler: Fitted scaler forwarded to ``standard_scale_data``.
      le: Fitted label encoder used to decode the predictions.

  Returns:
      The output of ``le.inverse_transform`` on the model's predictions.
  """
  cleaned_text = cleaning_text(text_data.copy(), stemmer, lemmatizer)
  features = transform_text(cleaned_text, vectorizer)
  scaled_features = standard_scale_data(features, standard_scaler)
  encoded_predictions = model.predict(scaled_features)
  return le.inverse_transform(encoded_predictions)

In [35]:
# Predict the test tweets from raw text with the decision-tree search object.
# NOTE(review): the random forest scored higher on validation (0.73 vs 0.70
# accuracy in the reports above) — confirm that dtc is the intended model here.
y_pred = manual_predict(x_test, dtc, stemmer, lemmatizer, vectorizer, standard_scaler, le)

In [36]:
# Put text, true label and prediction side by side. The predictions are given
# y_test's index so concat lines them up row by row; the prediction Series has
# no name, so its column is labelled 0.
data_test = pd.concat([x_test, y_test, pd.Series(y_pred).set_axis(y_test.index)], axis = 1)

In [37]:
data_test

Unnamed: 0,text,airline_sentiment,0
14165,"@AmericanAir agents refuse to help, ""too busy""...",negative,negative
8932,"@JetBlue unfortunately no, but hoping I can ca...",neutral,positive
1411,@united you're my early frontrunner for best a...,positive,negative
10482,@USAirways can you DM me please?,neutral,negative
8693,@JetBlue can I change my flight if I already p...,neutral,negative
...,...,...,...
6330,"@SouthwestAir Secondly, we did not begin board...",negative,negative
10128,@USAirways have had a medical issue Late Fligh...,negative,negative
3225,@united WHAT?! Y'all have zero concept of cust...,negative,negative
4758,@southwestair does anyone realize that banning...,negative,negative


In [38]:
# Show only the test rows whose true label is "positive".
data_test[data_test["airline_sentiment"] == "positive"]

Unnamed: 0,text,airline_sentiment,0
1411,@united you're my early frontrunner for best a...,positive,negative
4184,@united thank you! 😊,positive,positive
12372,"@AmericanAir mission accomplished today, Thank...",positive,positive
2914,@united of course not. The inflight crew was g...,positive,positive
8351,"@JetBlue, never been delayed before; 533 to TP...",positive,neutral
...,...,...,...
2035,@united thanks for the epic service on 863- al...,positive,positive
4455,"@SouthwestAir @intuit @jhamilton2007 4 moms, 4...",positive,neutral
723,@united this will definitely be a trip to reme...,positive,negative
7561,@JetBlue I love #JetBlue ! #FlyFi when will we...,positive,positive


In [39]:
# Look up one tweet by its original row label (2914) to read the untruncated text.
data_test.loc[2914].text

'@united of course not. The inflight crew was great!'