In [1]:
import pandas as pd
import numpy as np
import nltk                               # Natural language processing tool-kit
from nltk.corpus import stopwords         # Import stopwords.
from nltk import word_tokenize, sent_tokenize
import re, unicodedata                    # Regular Expression
from bs4 import BeautifulSoup             # Beautiful soup is a parsing library that can use different parsers.
import spacy
import warnings
from google.colab import drive
nltk.download('stopwords')                # Download Stopwords.
nltk.download('punkt')
nltk.download('wordnet')
!pip install contractions
import contractions
from nltk.stem.wordnet import WordNetLemmatizer         # Import Lemmatizer.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from numpy import asarray
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### 1. Import the libraries, load dataset, print shape of data, data descriptions

In [2]:
warnings.filterwarnings("ignore")
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
tweet_df = pd.read_csv('/content/drive/MyDrive/aiml/Tweets.csv')

In [4]:
tweet_df.shape

(14640, 15)

In [5]:
tweet_df.columns.values

array(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'], dtype=object)

In [6]:
tweet_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [7]:
tweet_df.shape

(14640, 15)

In [8]:
tweet_df.airline.unique()

array(['Virgin America', 'United', 'Southwest', 'Delta', 'US Airways',
       'American'], dtype=object)

### 2. Understand of data-columns

In [9]:
# Drop all other columns except "text" and "ariline_sentiment"
tweet_mini_df = tweet_df[['text', 'airline_sentiment']]
tweet_mini_df

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative
...,...,...
14635,@AmericanAir thank you we got on a different f...,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,negative
14637,@AmericanAir Please bring American Airlines to...,neutral
14638,"@AmericanAir you have my money, you change my ...",negative


In [10]:
print (tweet_mini_df.shape)
tweet_mini_df.head(5)

(14640, 2)


Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [11]:
stopwords = stopwords.words('english')

customlist = ['not', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn',
        "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',
        "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn',
        "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]

# Set custom stop-word's list as not, couldn't etc. words matter in Sentiment, so not removing them from original data.

stopwords = list(set(stopwords) - set(customlist)) 

### 3. Text pre-processing: Data preparation 

In [12]:
#!pip install spacy
#!python -m spacy download en_core_web_sm
#!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

lemmatizer = WordNetLemmatizer()

nlp = spacy.load('en_core_web_sm', parse=True, tag=True, entity=True)
def text_pre_processing(orig_text):
    # 3a. html tag removal
    text = BeautifulSoup(orig_text, "html.parser").get_text()
    # Contraction isn't to is not
    text = contractions.fix(text)
    # 3b. Tokenization
    words = nltk.word_tokenize(text)
    new_words = []

    for word in words:
      # Remove non-ascii characters
      word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
      # 3c. Remove the numbers
      word = re.sub(r'\d+', '', word)
      # 3d. Remove punctuation
      word = re.sub(r'[^\w\s]', '', word)
      # 3e. Conversion to Lower Case
      word = word.lower()
      # 3f Lemmatize including stopwords
      if word not in stopwords and len(word) > 0:
        new_words.append(lemmatizer.lemmatize(word, pos='v'))
    #3g. join     
    return ' '.join(new_words)

In [13]:
#print(text_pre_processing(tweet_mini_df.text[5]))
print("Before pre_processing ", tweet_mini_df.text[5])
tweet_mini_df['text'] = tweet_mini_df.apply(lambda row: text_pre_processing(row['text']), axis=1)
print("After pre_processing ", tweet_mini_df.text[5])

Before pre_processing  @VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA
After pre_processing  virginamerica seriously would pay flight seat not play really bad thing fly va


In [14]:
# 3h Print first 5 rows of data after pre-processing
for i in range(5):
  print("Before", i, " : ", tweet_df.text[i])
  print("After", i, " : ", tweet_mini_df.text[i])

Before 0  :  @VirginAmerica What @dhepburn said.
After 0  :  virginamerica dhepburn say
Before 1  :  @VirginAmerica plus you've added commercials to the experience... tacky.
After 1  :  virginamerica plus add commercials experience tacky
Before 2  :  @VirginAmerica I didn't today... Must mean I need to take another trip!
After 2  :  virginamerica not today must mean need take another trip
Before 3  :  @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &amp; they have little recourse
After 3  :  virginamerica really aggressive blast obnoxious entertainment guests face little recourse
Before 4  :  @VirginAmerica and it's a really big bad thing about it
After 4  :  virginamerica really big bad thing


### 4. Vectorization

In [15]:
# 4a. Count Vectorizer
# Vectorization (Convert text data to numbers).
vectorizer = CountVectorizer(max_features=2000)                # Keep only 1000 features as number of features will increase the processing time.
data_features = vectorizer.fit_transform(tweet_mini_df['text'])

data_features = data_features.toarray()                        # Convert the data features to array.

In [16]:
data_features.shape

(14640, 2000)

In [17]:
# 4b. TdifVctorizer
tfidVectorizer = TfidfVectorizer(max_features=2000)
tfidVector = tfidVectorizer.fit_transform(tweet_mini_df['text'])
tfid_data_features = tfidVector.toarray()

In [18]:
print(tfid_data_features.shape)
tfid_data_features

(14640, 2000)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
tfidVectorizer.vocabulary_

{'virginamerica': 1889,
 'say': 1541,
 'plus': 1336,
 'add': 25,
 'commercials': 351,
 'experience': 630,
 'not': 1198,
 'today': 1781,
 'must': 1162,
 'mean': 1095,
 'need': 1174,
 'take': 1725,
 'another': 86,
 'trip': 1809,
 'really': 1426,
 'blast': 213,
 'entertainment': 588,
 'face': 645,
 'little': 1029,
 'big': 201,
 'bad': 161,
 'thing': 1754,
 'seriously': 1568,
 'would': 1969,
 'pay': 1286,
 'flight': 702,
 'seat': 1552,
 'play': 1331,
 'fly': 718,
 'va': 1878,
 'yes': 1986,
 'nearly': 1173,
 'every': 607,
 'time': 1774,
 'vx': 1899,
 'go': 787,
 'away': 153,
 'miss': 1137,
 'opportunity': 1238,
 'without': 1953,
 'hat': 820,
 'https': 873,
 'well': 1929,
 'amaze': 73,
 'arrive': 115,
 'hour': 864,
 'early': 558,
 'good': 792,
 'know': 978,
 'suicide': 1698,
 'second': 1554,
 'lead': 1002,
 'death': 462,
 'among': 78,
 'teens': 1738,
 'pretty': 1366,
 'much': 1159,
 'better': 197,
 'great': 796,
 'deal': 460,
 'already': 67,
 'think': 1756,
 'nd': 1170,
 'australia': 140,
 '

In [20]:
tfidVectorizer.idf_

array([5.03089946, 8.19368582, 8.39435651, ..., 7.88353089, 6.98066318,
       8.19368582])

###5 Fit and Evaluate both type of vectorization

In [21]:
tweet_mini_df.head(5)

Unnamed: 0,text,airline_sentiment
0,virginamerica dhepburn say,neutral
1,virginamerica plus add commercials experience ...,positive
2,virginamerica not today must mean need take an...,neutral
3,virginamerica really aggressive blast obnoxiou...,negative
4,virginamerica really big bad thing,negative


In [22]:
le = preprocessing.LabelEncoder()

In [23]:
tweet_mini_df["sentimentId"] = le.fit_transform(tweet_mini_df['airline_sentiment'])

In [24]:
tweet_mini_df.head()

Unnamed: 0,text,airline_sentiment,sentimentId
0,virginamerica dhepburn say,neutral,1
1,virginamerica plus add commercials experience ...,positive,2
2,virginamerica not today must mean need take an...,neutral,1
3,virginamerica really aggressive blast obnoxiou...,negative,0
4,virginamerica really big bad thing,negative,0


In [25]:
labels = tweet_mini_df['sentimentId']

In [26]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_features, labels, test_size=0.3, random_state=42)

In [27]:
X_train.shape

(10248, 2000)

In [28]:
X_test.shape

(4392, 2000)

In [29]:


forest = RandomForestClassifier(n_estimators=30, n_jobs=4)

forest = forest.fit(X_train, y_train)

print(forest)

print(np.mean(cross_val_score(forest, data_features, labels, cv=10)))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
0.726639344262295


In [30]:
result = forest.predict(X_test)
conf_mat = confusion_matrix(y_test, result)
print("RandomForest Count Vectorization")

print(conf_mat)
print(forest.score(X_train,y_train))
print(forest.score(X_test,y_test))

RandomForest Count Vectorization
[[2529  221   64]
 [ 356  453   75]
 [ 178  120  396]]
0.9919984387197502
0.7691256830601093


In [31]:
#6 TfidVectorization
X_train_tfid, X_test_tfid, y_train_tfid, y_test_tfid = train_test_split(tfid_data_features, labels, test_size=0.3, random_state=42)

In [32]:
forestTfid = RandomForestClassifier(n_estimators=30, n_jobs=4)

forestTfid = forestTfid.fit(X_train_tfid, y_train_tfid)

print(forestTfid)

print(np.mean(cross_val_score(forestTfid, tfid_data_features, labels, cv=10)))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=4,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)
0.7224043715846994


In [33]:
resultTfid = forestTfid.predict(X_test_tfid)
conf_mat_tfid = confusion_matrix(y_test_tfid, resultTfid)

print("RandomForest TFID")
print(conf_mat_tfid)
print(forestTfid.score(X_test_tfid,y_test_tfid))

RandomForest TFID
[[2642  130   42]
 [ 454  372   58]
 [ 216   98  380]]
0.7727686703096539


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [36]:
#Logistic Regression on CountVector
print("Logistic Regression with Counter")
modelCount = LogisticRegression()
modelCount.fit(X_train, y_train)
y_predict_count_logistic = modelCount.predict(X_test)
model_score_count = modelCount.score(X_test, y_test)
print(model_score_count)
print(metrics.confusion_matrix(y_test, y_predict_count_logistic))
print(metrics.classification_report(y_test, y_predict_count_logistic))

Logistic Regression with Counter
0.7928051001821493
[[2485  246   83]
 [ 290  514   80]
 [ 116   95  483]]
              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2814
           1       0.60      0.58      0.59       884
           2       0.75      0.70      0.72       694

    accuracy                           0.79      4392
   macro avg       0.74      0.72      0.73      4392
weighted avg       0.79      0.79      0.79      4392



In [37]:
# Logistic Regression on TFID
print("Logistic Regression with TFID")
modelTfid = LogisticRegression()
modelTfid.fit(X_train_tfid, y_train_tfid)
y_predict_tfid_logistic = modelTfid.predict(X_test_tfid)
model_score_tfid = modelTfid.score(X_test_tfid, y_test_tfid)
print(model_score_tfid)
print(metrics.confusion_matrix(y_test_tfid, y_predict_tfid_logistic))
print(metrics.classification_report(y_test_tfid, y_predict_tfid_logistic))

Logistic Regression with TFID
0.8012295081967213
[[2630  142   42]
 [ 377  452   55]
 [ 163   94  437]]
              precision    recall  f1-score   support

           0       0.83      0.93      0.88      2814
           1       0.66      0.51      0.58       884
           2       0.82      0.63      0.71       694

    accuracy                           0.80      4392
   macro avg       0.77      0.69      0.72      4392
weighted avg       0.79      0.80      0.79      4392



###6. Summarize your understanding

Summary:


*   The Twitter dataset had airline review in text and sentiment in negative, neutral and positive which was converted to 0,1,2 respectively for ML
*   Airline review text was cleaned up with

1.   Removed the html tags using BeautifulSoup library
2.   Removed the contraction such as isn’t to is not using contractions library
3.   Tokenized the review using nltk.word_tokenize
4.   Once tokenized, removed the non-ascii characters
5.   Removed the numbers and punctuation using regex library
6.   Converted to lower case
7.   Words all went through lemmatization before merged back into the sentences
One TextProcessing was done, vectorization took place. Vectorization will convert text into the numbers which is required for ML.

*   Two vectorization method is used Count and Tfid and both were limited to 2000 features.
*   Once vectorized, I have used RandomForest and LogisticRegression ML algorithms. TfidVector behaved little better than CountVector counterpart in either case.
*   The best model score was achieved with Logistic Regression using TFID Vectorization which is 0.80122

