In [1]:
import pandas as pd
import numpy as np

import os
import itertools
from collections import Counter
import re
import string


import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer 

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
cust_reviews = pd.read_csv('data/cleaned_reviews.csv')
cust_reviews.head()

Unnamed: 0.1,Unnamed: 0,reviews,cleaned,tokens
0,0,Old A320 with narrow pitch. Flight perfectly o...,old a320 with narrow pitch. flight perfectly o...,"['old', 'a320', 'narrow', 'pitch', 'perfectly'..."
1,1,Another BA Shambles. Started off well with exc...,another ba shambles. started off well with exc...,"['another', 'shambles', 'started', 'well', 'ex..."
2,2,BA cancelled my flight home to Heathrow on Dec...,ba cancelled my flight home to heathrow on dec...,"['cancelled', 'home', 'heathrow', 'dec', '19th..."
3,3,"BA cancelled my flight home, the last flight o...","ba cancelled my flight home, the last flight o...","['cancelled', 'home', 'last', 'day', 'heathrow..."
4,4,"Turned up 3.5 hours in advance, Terminal 5 at ...","turned up 3.5 hours in advance, terminalat lon...","['turned', '3.5', 'hours', 'advance', 'termina..."


In [3]:
def regex_clean(txt, regex):
    """Replace any text matching the regex

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove matches
    regex : string 
        A text string of the regex pattern you want to match

    Returns
    -------
    The same txt string with the matches removes
    """

    return " ".join(re.sub(regex, " ", txt).split())

In [24]:
## Creating a stemmer object and a list of stopwords to remove

lemma = WordNetLemmatizer()
stpwrd = nltk.corpus.stopwords.words('english')
stpwrd.extend(string.punctuation)

## Utilising the above techniques, we define a more complex function to prepare our data for analysis


def prep_data(review):
    """ Cleans up a review!

    Parameters
    ----------
    review : string
        A text string that you want to parse and remove matches

    Returns
    -------
    Cleaned up string - tokenized & stemmed!
    """   
    
    ## Pre Token Cleaning - Stuff that applies to a string.
    
    review = review.lower() 
    review = regex_clean(review, r'\s\d+\s')

    review = re.sub(r'\d+', '', review)

    ## Go-Go-Token-Rangers
    review = word_tokenize(review)

    ## Post Token Cleaning - Stuff that applies to a list 
        
    review = [lemma.lemmatize(word) for word in review]
    review = [word for word in review if word not in stpwrd]
    
    return review

### Creating documents

In [25]:
docs = [" ".join(prep_data(x)) for x in cust_reviews['reviews']]

### CountVectorizer to get word frequencies

In [90]:
# create a vocabulary of words, 
# ignore words that appear in 85% of documents, 
# eliminate stop wordsa

cv=CountVectorizer(ngram_range=(1,2),stop_words=stpwrd,min_df=.1) 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(docs)

In [91]:
word_count_vector.shape

(1000, 106)

In [92]:
train_features = word_count_vector.toarray()

In [93]:
train_features.shape

(1000, 106)

In [94]:
vocab = list(cv.get_feature_names_out())


In [95]:
X = pd.DataFrame(train_features,columns=vocab)
X.head()

Unnamed: 0,aircraft,airline,airport,airway,also,another,arrival,arrived,asked,ba,...,told,took,two,via,wa,water,way,well,would,year
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,1,0,0,0,2,...,1,0,0,0,1,1,0,2,0,0
2,0,0,1,0,0,0,0,0,0,2,...,0,0,0,0,6,0,0,0,1,0
3,0,1,0,0,0,2,0,0,0,3,...,0,0,0,0,1,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,2,...,0,0,0,0,1,0,0,0,0,0


1000 rows with 6825 columns showing 6825 uique words

Compute the IDF values

In [96]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

TfidfTransformer()

In [102]:
	
# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 

df_idf.sort_values(by=['idf_weights'],ascending=False)

Unnamed: 0,idf_weights
late,3.293634
luggage,3.293634
premium,3.293634
call,3.283782
nothing,3.274026
...,...
service,1.712311
ba,1.573701
london,1.563118
flight,1.216671


Compute the TFIDF score for your documents

In [103]:

# count matrix 
count_vector=cv.transform(docs) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [None]:

# count matrix 
count_vector=cv.transform(docs) 
# tf-idf scores 
tf_idf_vector=tfidf_transformer.transform(count_vector)

In [104]:
feature_names = cv.get_feature_names_out() 
#get tfidf vector for first document 
first_document_vector=tf_idf_vector[0] 
#print the scores 
df = pd.DataFrame(first_document_vector.T.todense(), index=feature_names, columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
luggage,0.460695
water,0.456601
old,0.447490
friendly,0.416026
staff,0.324719
...,...
customer service,0.000000
customer,0.000000
crew,0.000000
could,0.000000


Tfidfvectorizer Usage

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(docs)

In [106]:
# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names_out(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
cockpit,0.348671
chip,0.338580
constant,0.322656
delivery,0.316137
perfectly,0.300214
...,...
famed,0.000000
false,0.000000
falling,0.000000
fallen,0.000000


https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/