In [None]:
# Mount Google Drive at /content/drive so the dataset stored there can be read.
# This cell relies on the google.colab package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# IMPORTS

In [None]:
import pandas as pd
import numpy as np
import re
import spacy

# Shared spaCy pipeline used by text_cleaner, which only reads token.lemma_ and
# token.is_stop, so the tagger, parser and NER components are switched off.
# NOTE(review): lemma quality with the tagger disabled depends on the installed
# spaCy version — confirm the lemmas are still the expected ones after upgrading.
nlp = spacy.load('en_core_web_sm', disable = ["tagger", "parser","ner"])

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# LOADING DATA

In [None]:
# Project folder on the mounted Drive. Written as an absolute path (matching the
# '/content/drive' mount point) so that loading does not depend on the current
# working directory.
# NOTE: the name shadows the builtin dir(); it is kept because later cells read it.
dir = "/content/drive/MyDrive/Multi-Class-Text-Classification/"

In [None]:
# Read the tweets CSV from the project folder and preview the first rows.
tweets_csv = dir + 'Tweets.csv'
df = pd.read_csv(tweets_csv)
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &...",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [None]:
# Look at five randomly drawn raw tweets (no seed is passed, so the rows differ per run).
df['text'].sample(5)

6320     @SouthwestAir I consider myself a loyal customer to the brand but I'm astounded at the lack of c...
8991                                    @USAirways I have two tight connections in #Charlotte and #Frankfurt
4822     @SouthwestAir kudos to the crew of flight 1050 to GRR for making a very special memory for a swe...
4120     @united If you have had any issues with  United Airlines PLEASE retweet. Paid for a full fight t...
12739                            @AmericanAir are you guys intentionally trying to lose customers and money?
Name: text, dtype: object

In [None]:
# Share of each sentiment class, expressed in percent.
df['airline_sentiment'].value_counts(normalize = True)*100

negative    62.691257
neutral     21.168033
positive    16.140710
Name: airline_sentiment, dtype: float64

# TEXT CLEANING

In [None]:
def text_cleaner(text):
  """Normalise a raw tweet into a space-separated string of lemmas.

  Steps, in order: remove @mentions, #hashtags and URLs; lower-case; replace
  every run of non-letter characters with a single space; run the module-level
  spaCy pipeline ``nlp``; keep the lemma of every token that is neither a stop
  word nor pure whitespace.

  Args:
    text: Raw tweet text (str).

  Returns:
    str: The kept lemmas joined by single spaces, without leading or trailing
    whitespace. Empty when nothing survives the filtering.
  """
  # Handles and hashtags may contain underscores; include '_' so that e.g.
  # "@Jet_Blue" is removed completely instead of leaving "blue" behind.
  text = re.sub(r'@[A-Za-z0-9_]+', '', text)
  text = re.sub(r'#[A-Za-z0-9_]+', '', text)
  text = re.sub(r'http\S+', '', text)
  text = text.lower()
  # Whitespace is itself "not a-z", so this single pass also collapses it; no
  # separate whitespace substitution is needed afterwards. Stripping prevents
  # the space left by a removed leading mention from reaching spaCy.
  text = re.sub(r'[^a-z]+', ' ', text).strip()
  doc = nlp(text)
  # Skip whitespace tokens as well: they are not stop words and would
  # otherwise be joined into the result as stray spaces.
  tokens = [
      token.lemma_
      for token in doc
      if not token.is_stop and not token.is_space
  ]
  return " ".join(tokens)

In [None]:
# Store the cleaned version of every tweet next to the raw text.
df['clean_text'] = df['text'].map(text_cleaner)

In [None]:
# Pull the model inputs (cleaned tweets) and targets (sentiment strings) out as arrays.
text = df['clean_text'].to_numpy()
labels = df['airline_sentiment'].to_numpy()

In [None]:
# First ten cleaned tweets.
text[:10]

array(['  say', '  plus have add commercial experience tacky',
       '  didn t today mean need trip',
       '  s aggressive blast obnoxious entertainment guest face amp little recourse',
       '  s big bad thing',
       '  seriously pay flight seat didn t play s bad thing fly va',
       '  yes nearly time fly vx ear worm win t away',
       '  miss prime opportunity man hat parody', '  didn t have',
       '  amaze arrive hour early good'], dtype=object)

In [None]:
# First ten sentiment labels.
labels[:10]

array(['neutral', 'positive', 'neutral', 'negative', 'negative',
       'negative', 'positive', 'neutral', 'positive', 'positive'],
      dtype=object)

# LABEL ENCODING

In [None]:
# Encode the sentiment strings as integers.
# Always fit from the DataFrame column rather than from `labels` itself: the
# cell then gives the same result when re-run. Fitting on the already-encoded
# integers would make le.inverse_transform return ints instead of class names.
le = LabelEncoder()
labels = le.fit_transform(df['airline_sentiment'].values)

In [None]:
# First ten labels after integer encoding.
labels[:10]

array([1, 2, 1, 0, 0, 0, 2, 1, 2, 2])

In [None]:
# Show which class name each encoded integer (0, 1, 2) stands for.
le.inverse_transform([0,1,2])

array(['negative', 'neutral', 'positive'], dtype=object)

# TRAIN / VAL SPLIT

In [None]:
# Stratified 80/20 hold-out split with a fixed seed, so both parts keep the
# class proportions and the split is the same on every run.
split_options = dict(test_size=0.2, stratify=labels, shuffle=True, random_state=0)
x_train, x_val, y_train, y_val = train_test_split(text, labels, **split_options)

# TF-IDF

In [None]:
# Word-level TF-IDF with the vocabulary capped at 1000 features.
tfidf_options = {"max_features": 1000}
word_vectorizer = TfidfVectorizer(**tfidf_options)

In [None]:
# Fit the vectorizer on the training split only; the validation tweets are
# transformed with this fitted vectorizer and play no part in fitting it.
word_vectorizer.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [None]:
# TF-IDF matrix for the training tweets (one row per tweet).
train_word_features = word_vectorizer.transform(x_train)
train_word_features

<11712x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 71165 stored elements in Compressed Sparse Row format>

In [None]:
# TF-IDF matrix for the validation tweets, using the vectorizer fitted on the training split.
val_word_features = word_vectorizer.transform(x_val)
val_word_features

<2928x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 18000 stored elements in Compressed Sparse Row format>

# MODEL

## NAIVE BAYES

In [None]:
# Multinomial Naive Bayes baseline with default settings.
nb_model = MultinomialNB()
nb_model.fit(train_word_features, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
# Naive Bayes predictions on the data it was trained on.
train_pred_nb = nb_model.predict(train_word_features)

In [None]:
# Weighted F1 averages the per-class scores by class support.
train_f1_nb = f1_score(y_train, train_pred_nb, average="weighted")
print("F1-score on Train Set:", train_f1_nb)

F1-score on Train Set: 0.7231423010381972


In [None]:
# Naive Bayes predictions on the held-out validation split.
val_pred_nb = nb_model.predict(val_word_features)

In [None]:
# Weighted F1 of Naive Bayes on the validation split.
val_f1_nb = f1_score(y_val, val_pred_nb, average="weighted")
print("F1-score on Validation Set:", val_f1_nb)

F1-score on Validation Set: 0.6727960317708854


## LOGISTIC REGRESSION

In [None]:
# Logistic regression with max_iter raised to 1000.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(train_word_features, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
# Logistic regression predictions on the data it was trained on.
train_pred_lr = lr_model.predict(train_word_features)

In [None]:
# Weighted F1 of logistic regression on the training split.
train_f1_lr = f1_score(y_train, train_pred_lr, average="weighted")
print("F1-score on Train Set:", train_f1_lr)

F1-score on Train Set: 0.8053475227597584


In [None]:
# Logistic regression predictions on the held-out validation split.
val_pred_lr = lr_model.predict(val_word_features)

In [None]:
# Weighted F1 of logistic regression on the validation split.
val_f1_lr = f1_score(y_val, val_pred_lr, average="weighted")
print("F1-score on Validation Set:", val_f1_lr)

F1-score on Validation Set: 0.7562646817248584


# FINAL PIPELINE

In [None]:
def sentiment_analyzer(tweet):
  """Predict the sentiment class name for a single raw tweet.

  The tweet goes through the same steps as the training data: text_cleaner,
  then the fitted word_vectorizer, then lr_model; the predicted integer is
  mapped back to its class name with the label encoder ``le``.

  Args:
    tweet: Raw tweet text (str).

  Returns:
    A one-element array holding the predicted class name.
  """
  features = word_vectorizer.transform([text_cleaner(tweet)])
  predicted = np.array(lr_model.predict(features))
  return le.inverse_transform(predicted)

In [None]:
# Quick end-to-end check of the pipeline on a short input.
sentiment_analyzer("flight flew")

array(['neutral'], dtype=object)