## Text project

In [14]:
# Import libraries
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk 
import string
import re
%matplotlib inline
pd.set_option('display.max_colwidth', 100)

In [28]:
# Load dataset
def load_data():
    data = pd.read_csv('./data/Tweets.csv')
    return data

In [30]:
tweet_df = load_data()
tweet_df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials to the experience... tacky.,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I need to take another trip!,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &...",,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing about it,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [32]:
print('Dataset size:',tweet_df.shape)
print('Columns are:',tweet_df.columns)
tweet_df.info()

Dataset size: (14640, 15)
Columns are: Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null    

## Step 1 : take "text" column (not sure)

In [35]:
tweet_df_minus = tweet_df['text']
tweet_df_minus[:5]

0                                                                    @VirginAmerica What @dhepburn said.
1                               @VirginAmerica plus you've added commercials to the experience... tacky.
2                                @VirginAmerica I didn't today... Must mean I need to take another trip!
3    @VirginAmerica it's really aggressive to blast obnoxious "entertainment" in your guests' faces &...
4                                                @VirginAmerica and it's a really big bad thing about it
Name: text, dtype: object


## Step 2: Data Preprocessing

We will perform the following steps: 

1. **Tokenization** : Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
2. Words that have fewer than 3 characters are removed.
3. All **stopwords** are removed.
4. Words are **lemmatized** - words in third person are changed to first person and verbs in past and future tenses are changed into present.
5. Words are **stemmed** - words are reduced to their root form.



In [36]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)

In [37]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\zigbo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [54]:
Words_deleted_length = 3
Lemmatized = True
Stemmed = False


import pandas as pd
stemmer = SnowballStemmer("english")

'''
Write a function to perform the pre processing steps on the entire dataset
'''
def lemmatize_stemming(text):
    if(Lemmatized & Stemmed):
        return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    if(Stemmed):
        return stemmer.stem(text)
    if(Lemmatized):
        return WordNetLemmatizer().lemmatize(text, pos='v')
    return text

# Tokenize and lemmatize
def preprocess(text):
    result=[]
    for token in gensim.utils.simple_preprocess(text) :
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > Words_deleted_length:
            result.append(lemmatize_stemming(token))
            
    return result

In [57]:
processed_docs = []
nan_processed_docs = []
for doc in tweet_df_minus :
    nan_processed_docs.append(doc)
for doc in tweet_df_minus :
    processed_docs.append(preprocess(doc))
nan_processed_docs[:2]
processed_docs[:2]

[['virginamerica', 'dhepburn', 'say'],
 ['virginamerica', 'plus', 'add', 'commercials', 'experience', 'tacky']]

## Step 3: Bag of words on the dataset

Now let's create a dictionary from 'processed_docs' containing the number of times a word appears in the training set. To do that, let's pass processed_docs to gensim.corpora.Dictionary() and call it 'dictionary'.


In [59]:
'''
Create a dictionary from 'processed_docs' containing the number of times a word appears 
in the training set using gensim.corpora.Dictionary and call it 'dictionary'
'''
dictionary = gensim.corpora.Dictionary(processed_docs)

In [60]:
'''
Checking dictionary created
'''
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 dhepburn
1 say
2 virginamerica
3 add
4 commercials
5 experience
6 plus
7 tacky
8 mean
9 need
10 today


In [None]:
'''
OPTIONAL STEP
Remove very rare and very common words:

- words appearing less than 15 times
- words appearing in more than 10% of all documents
'''
#dictionary.filter_extremes(no_below=15, no_above=0.1, keep_n= 100000)

In [64]:
'''
Create the Bag-of-words model for each document i.e for each document we create a dictionary reporting how many
words and how many times those words appear. Save this to 'bow_corpus'
'''
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[:4]

[[(0, 1), (1, 1), (2, 1)],
 [(2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(2, 1), (8, 1), (9, 1), (10, 1), (11, 1)],
 [(2, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1)]]

In [65]:
'''
Preview BOW for our sample preprocessed document
'''
document_num = 20
bow_doc_x = bow_corpus[document_num]

for i in range(len(bow_doc_x)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_x[i][0], 
                                                     dictionary[bow_doc_x[i][0]], 
                                                     bow_doc_x[i][1]))

Word 2 ("virginamerica") appears 1 time.
Word 24 ("seat") appears 1 time.
Word 28 ("time") appears 1 time.
Word 88 ("available") appears 1 time.
Word 89 ("carriers") appears 1 time.
Word 90 ("fare") appears 1 time.
Word 91 ("select") appears 1 time.


## Step 4: Running LDA using Bag of Words
We are going for 10 topics in the document corpus.

We will be running LDA using all CPU cores to parallelize and speed up model training.

Some of the parameters we will be tweaking are:

1. **num_topics** is the number of requested latent topics to be extracted from the training corpus.
2. **id2word** is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
3. **workers** is the number of extra processes to use for parallelization. Uses all available cores by default.
4. **alpha** and **beta** are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. We will let these be the default values for now(default value is 1/num_topics)

 **Alpha** is the per document topic distribution.
   - *High alpha* : Every document has a mixture of all topics(documents appear similar to each other).
   - *Low alpha* : Every document has a mixture of very few topics

  **bEta** is the per topic word distribution.
    - *High beta* : Each topic has a mixture of most words(topics appear similar to each other).
    - *Low eta* : Each topic has a mixture of few words.

5. **passes** is the number of training passes through the corpus. For example, if the training corpus has 50,000 documents, chunksize is 10,000, passes is 2, then online training is done in 10 updates:
- documents 0-9,999
- documents 10,000-19,999
- documents 20,000-29,999
- documents 30,000-39,999
- documents 40,000-49,999
- documents 0-9,999
- documents 10,000-19,999
- documents 20,000-29,999
- documents 30,000-39,999
- documents 40,000-49,999


In [82]:
# LDA mono-core -- fallback code in case LdaMulticore throws an error on your machine
# lda_model = gensim.models.LdaModel(bow_corpus, 
#                                    num_topics = 10, 
#                                    id2word = dictionary,                                    
#                                    passes = 50)

# LDA multicore 
'''
Train your lda model using gensim.models.LdaMulticore and save it to 'lda_model'
'''
# TODO
lda_model =  gensim.models.LdaMulticore(bow_corpus, 
                                   num_topics = 4, 
                                   id2word = dictionary,                                    
                                   passes = 10,
                                   workers = 2)

In [83]:
'''
For each topic, we will explore the words occuring in that topic and its relative weight
'''
for idx, topic in lda_model.print_topics(-1):
    print("Topic: {} \nWords: {}".format(idx, topic ))
    print("\n")

Topic: 0 
Words: 0.076*"americanair" + 0.039*"http" + 0.024*"jetblue" + 0.009*"need" + 0.007*"usairways" + 0.007*"work" + 0.006*"check" + 0.006*"southwestair" + 0.006*"people" + 0.005*"good"


Topic: 1 
Words: 0.044*"jetblue" + 0.044*"flight" + 0.037*"unite" + 0.025*"delay" + 0.019*"usairways" + 0.017*"southwestair" + 0.016*"plane" + 0.013*"thank" + 0.013*"americanair" + 0.011*"http"


Topic: 2 
Words: 0.056*"americanair" + 0.045*"usairways" + 0.033*"service" + 0.031*"thank" + 0.025*"customer" + 0.024*"unite" + 0.014*"southwestair" + 0.009*"wait" + 0.009*"time" + 0.009*"luggage"


Topic: 3 
Words: 0.087*"flight" + 0.059*"americanair" + 0.033*"cancel" + 0.023*"usairways" + 0.019*"southwestair" + 0.017*"help" + 0.016*"flightled" + 0.015*"hold" + 0.014*"book" + 0.013*"try"




## Step 5: Testing LDA model

## Step 6: create T_e_i an input to have $t_{ei}$ after apply Doc2Vec 

In [107]:
list_topics=lda_model.show_topics(formatted=False)
list_topics 

topic = []
T_e_i = []
for tup in list_topics:
    topic = []
    for tup2 in tup[1]:
        topic.append(tup2[0])
    T_e_i.append(topic) 
    
T_e_i  

[['americanair',
  'http',
  'jetblue',
  'need',
  'usairways',
  'work',
  'check',
  'southwestair',
  'people',
  'good'],
 ['jetblue',
  'flight',
  'unite',
  'delay',
  'usairways',
  'southwestair',
  'plane',
  'thank',
  'americanair',
  'http'],
 ['americanair',
  'usairways',
  'service',
  'thank',
  'customer',
  'unite',
  'southwestair',
  'wait',
  'time',
  'luggage'],
 ['flight',
  'americanair',
  'cancel',
  'usairways',
  'southwestair',
  'help',
  'flightled',
  'hold',
  'book',
  'try']]

## Step 7 : Train Doc2Vec on non-pre-processed data "nan_processed_docs"


In [117]:
import smart_open
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
def read(fname):
    for i, line in enumerate(f):
        tokens = fname[i]
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [121]:
train_nan_processed_docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(nan_processed_docs)]

In [None]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_nan_processed_docs)
model.train(train_nan_processed_docs, total_examples=len(nan_processed_docs), epochs=model.epochs)

## Step 8 : Apply Doc2Vec on each document in your data to form $d_{ei}$

In [124]:
Liste_D_e_i = []
Liste_of_n_docs = processed_docs
for i in range(len(Liste_of_n_docs)):
    vector = []
    vector = model.infer_vector(Liste_of_n_docs[i])
    Liste_D_e_i.append(vector) 
    
print(Liste_of_n_docs[:2])
print(Liste_D_e_i[:2])

[['virginamerica', 'dhepburn', 'say'], ['virginamerica', 'plus', 'add', 'commercials', 'experience', 'tacky']]
[array([-8.1121130e-03, -8.2422188e-03, -8.5009653e-03,  9.4700670e-03,
       -6.1424044e-03, -6.0291924e-03,  9.3967384e-03, -3.2938537e-03,
       -7.4124401e-03,  4.4365735e-03,  3.3982443e-03,  5.7742302e-03,
       -2.0059585e-03, -6.6704517e-03,  3.3299492e-03,  8.7660858e-03,
        3.0309090e-03,  9.5953336e-03,  2.2019406e-03, -3.9082011e-03,
       -5.5418705e-04, -7.8981556e-03,  1.1231005e-03, -7.9345554e-03,
        2.9577192e-05,  9.9902209e-03,  5.6541548e-03, -1.1336298e-03,
       -9.5684947e-03,  7.5632879e-03,  3.8973801e-03,  1.7543974e-04,
        3.7038026e-03,  6.9300961e-03, -5.4608597e-03,  5.8965799e-03,
       -8.2795974e-03, -2.2272721e-03,  1.6476009e-03,  4.3660291e-03,
        3.5117818e-03,  3.2109192e-03, -2.7814955e-03, -6.1880853e-03,
        8.8355839e-03, -2.6708855e-03,  3.3353553e-03,  7.7089434e-03,
        3.2863969e-03, -6.2256311e-0

## Step 9 : Apply Doc2Vec on your topics to form $t_{ei}$

In [126]:
Liste_T_e_i = []
for i in range(len(T_e_i)):
    vector = []
    vector = model.infer_vector(T_e_i[i])
    Liste_T_e_i.append(vector) 
Liste_T_e_i

[array([-1.6189611e-03, -5.7761609e-03,  8.8887429e-03,  7.1812181e-05,
         1.3372501e-03,  2.2232926e-03,  3.3773489e-03, -5.0686724e-03,
         2.3342411e-03,  9.5074857e-03, -5.6510912e-03,  4.9197194e-03,
        -6.3650357e-03,  8.8025462e-03, -4.6121841e-03, -4.7187209e-03,
        -6.0499255e-03,  8.9482125e-03,  4.5692851e-03, -1.5405561e-04,
        -4.2776135e-03,  7.6732994e-03, -9.7185532e-03, -1.5098313e-03,
         2.4043793e-04,  3.5099888e-03,  6.6660470e-03, -6.9918861e-03,
         7.2210683e-03, -9.8983021e-03,  1.9597057e-03, -1.4058575e-03,
         4.4692433e-03, -7.0016026e-03,  4.2176726e-03, -7.0681684e-03,
         9.9102091e-03, -7.9297964e-03,  6.6172373e-03,  9.6123023e-03,
         5.0199782e-03,  8.7301964e-03, -4.1522384e-03,  4.8934389e-03,
         2.2044850e-03,  9.0630125e-04,  3.2948253e-03, -8.0621773e-03,
         4.5056785e-03,  5.5571552e-03], dtype=float32),
 array([ 0.00275685, -0.00764542, -0.009705  ,  0.00741494, -0.00103935,
      

## Step 9 : Calculate the similarity between every pair $d_{ei}$ and $t_{ei}$

## Step 10 : Graph