### Importing the dependencies

In [2]:
import pandas as pd
import numpy as np
import re
import math

### Exploratory Data Analysis

#### Loading the data

In [3]:
data = pd.read_csv('./datasets/yelp_ratings.csv') # Load the data

In [4]:
data.head() # Get a glimpse of the data

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


#### Gathering informations

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20806 entries, 0 to 20805
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   text       20806 non-null  object 
 1   stars      20806 non-null  float64
 2   sentiment  20806 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 487.8+ KB


Seems like there's no missing values, but let's check that later on

In [6]:
data.describe()

Unnamed: 0,stars,sentiment
count,20806.0,20806.0
mean,3.831491,0.747092
std,1.518851,0.434689
min,1.0,0.0
25%,2.0,0.0
50%,5.0,1.0
75%,5.0,1.0
max,5.0,1.0


Let's see how many customers gave a 5 stars review

In [7]:
five_stars_count = data.loc[data['stars'] == 5, 'stars'].count()
print(f'Number of 5 stars reviews: {five_stars_count}')

Number of 5 stars reviews: 10438


The average stars the customers gave is 3.8. Now let's gather more information

In [8]:
print(f"Number of rows: {len(data)}") # Check the number of rows

Number of rows: 20806


In [9]:
data.columns.tolist() # Check the columns

['text', 'stars', 'sentiment']

In [10]:
data.isna().sum() # Check missing values

text         0
stars        0
sentiment    0
dtype: int64

In [11]:
data['sentiment'].unique()

array([0, 1], dtype=int64)

Now let's check how many sentiment reviews are there in the data

In [12]:
sentimental_reviews_count = data.loc[data['sentiment'] == 1, 'sentiment'].count()
print(f'Number of sentimental reviews: {sentimental_reviews_count} from {len(data)}')

Number of sentimental reviews: 15544 from 20806


In [13]:
nonsentimental_reviews_count = data.loc[data['sentiment'] == 0, 'sentiment'].count()
print(f'Number of non-sentimental reviews: {nonsentimental_reviews_count} from {len(data)}')

Number of non-sentimental reviews: 5262 from 20806


In [14]:
print(f"Sentimental portion: {round((sentimental_reviews_count / len(data)) * 100)}%")

Sentimental portion: 75%


In [15]:
print(f"Non-sentimental portion: {round((nonsentimental_reviews_count / len(data)) * 100)}%")

Non-sentimental portion: 25%


Great! Now that's done, let's try to tokenize the words. So it would be easier for us in the next step

In [16]:
def tokenize_words(text: str):
    pattern = r'\w+'
    tokens = re.findall(pattern, text.casefold())
    return tokens

In [17]:
tokenized_sentences = [tokenize_words(doc) for doc in data['text'].to_list()] # Also known as documents

In [18]:
tokenized_sentences # Get a glimpse of the tokenized sentences

[['total',
  'bill',
  'for',
  'this',
  'horrible',
  'service',
  'over',
  '8gs',
  'these',
  'crooks',
  'actually',
  'had',
  'the',
  'nerve',
  'to',
  'charge',
  'us',
  '69',
  'for',
  '3',
  'pills',
  'i',
  'checked',
  'online',
  'the',
  'pills',
  'can',
  'be',
  'had',
  'for',
  '19',
  'cents',
  'each',
  'avoid',
  'hospital',
  'ers',
  'at',
  'all',
  'costs'],
 ['i',
  'adore',
  'travis',
  'at',
  'the',
  'hard',
  'rock',
  's',
  'new',
  'kelly',
  'cardenas',
  'salon',
  'i',
  'm',
  'always',
  'a',
  'fan',
  'of',
  'a',
  'great',
  'blowout',
  'and',
  'no',
  'stranger',
  'to',
  'the',
  'chains',
  'that',
  'offer',
  'this',
  'service',
  'however',
  'travis',
  'has',
  'taken',
  'the',
  'flawless',
  'blowout',
  'to',
  'a',
  'whole',
  'new',
  'level',
  'travis',
  's',
  'greets',
  'you',
  'with',
  'his',
  'perfectly',
  'green',
  'swoosh',
  'in',
  'his',
  'otherwise',
  'perfectly',
  'styled',
  'black',
  'hair'

Let's find the average words count for each sentences

In [19]:
number_of_words = sum([len(tokenized_sentence) for tokenized_sentence in tokenized_sentences])
print(f'Total number of words: {number_of_words}')

Total number of words: 2225889


In [20]:
print(f"Average words count per sentence: {round(number_of_words / len(data['text']))} words")

Average words count per sentence: 107 words


### Word-weighting

Let's first tokenize those words

In [21]:
tokens = tuple([token for sentence in tokenized_sentences for token in sentence])

In [22]:
tokens

('total',
 'bill',
 'for',
 'this',
 'horrible',
 'service',
 'over',
 '8gs',
 'these',
 'crooks',
 'actually',
 'had',
 'the',
 'nerve',
 'to',
 'charge',
 'us',
 '69',
 'for',
 '3',
 'pills',
 'i',
 'checked',
 'online',
 'the',
 'pills',
 'can',
 'be',
 'had',
 'for',
 '19',
 'cents',
 'each',
 'avoid',
 'hospital',
 'ers',
 'at',
 'all',
 'costs',
 'i',
 'adore',
 'travis',
 'at',
 'the',
 'hard',
 'rock',
 's',
 'new',
 'kelly',
 'cardenas',
 'salon',
 'i',
 'm',
 'always',
 'a',
 'fan',
 'of',
 'a',
 'great',
 'blowout',
 'and',
 'no',
 'stranger',
 'to',
 'the',
 'chains',
 'that',
 'offer',
 'this',
 'service',
 'however',
 'travis',
 'has',
 'taken',
 'the',
 'flawless',
 'blowout',
 'to',
 'a',
 'whole',
 'new',
 'level',
 'travis',
 's',
 'greets',
 'you',
 'with',
 'his',
 'perfectly',
 'green',
 'swoosh',
 'in',
 'his',
 'otherwise',
 'perfectly',
 'styled',
 'black',
 'hair',
 'and',
 'a',
 'vegas',
 'worthy',
 'rockstar',
 'outfit',
 'next',
 'comes',
 'the',
 'most',
 '

In [23]:
print(f'Variation of words: {len(tokens)}')

Variation of words: 2225889


Now let's make the utility functions to weigh those words

In [24]:
# Calculate the term-frequencies (TF)
def calculate_tf(word_list: list[str]):
    tf_dict = {}
    total_words = len(word_list)
    for word in word_list:
        tf_dict[word] = tf_dict.get(word, 0) + 1 / total_words
    
    return tf_dict

In [25]:
# Calculate the inverse document frequenciy (IDF)
def calculate_idf(documents: list[list[str]]):
    idf_dict = {}
    total_docs = len(documents)
    word_doc_count = {}
    
    # Count the number of documents containing each word
    for document in documents:
        unique_words = set(document)
        for word in unique_words:
            if word in word_doc_count:
                word_doc_count[word] += 1
            else:
                word_doc_count[word] = 1
    
    # Calculate IDF for each word
    for word, doc_count in word_doc_count.items():
        idf_dict[word] = math.log(total_docs / (1 + doc_count))
    
    return idf_dict

In [26]:
# Calculate the TF-IDF values for each word in a document
def calculate_tf_idf(tf: dict, idf: dict):
    tfidf_dict = {word: tf[word] * idf[word] for word in tf}
    return tfidf_dict

In [27]:
tfs = [calculate_tf(doc) for doc in tokenized_sentences]

In [28]:
idf = calculate_idf(tokenized_sentences)

In [29]:
tfidf = [calculate_tf_idf(tf, idf) for tf in tfs]

In [30]:
tfidf

[{'total': 0.11046684826824592,
  'bill': 0.10416573587769591,
  'for': 0.03940280525092651,
  'this': 0.017505742177088432,
  'horrible': 0.09622259691737675,
  'service': 0.03186279361837838,
  'over': 0.056159791712295175,
  '8gs': 0.23717562833515113,
  'these': 0.07736926893641981,
  'crooks': 0.20900608247186628,
  'actually': 0.08175365605494558,
  'had': 0.0506236687903529,
  'the': 0.005549555441256579,
  'nerve': 0.16950749167834966,
  'to': 0.007903554169995037,
  'charge': 0.09893582750142742,
  'us': 0.05365393601947949,
  '69': 0.18728049630809185,
  '3': 0.06768786242036001,
  'pills': 0.3745609926161837,
  'i': 0.007303445883786375,
  'checked': 0.11083579025419718,
  'online': 0.11037543631531012,
  'can': 0.03962790608104059,
  'be': 0.0291697832492608,
  '19': 0.14871440921126428,
  'cents': 0.16036198029530524,
  'each': 0.08145653771719207,
  'avoid': 0.11573464247888204,
  'hospital': 0.1471360016984583,
  'ers': 0.21940262370540894,
  'at': 0.02881853082982075,
 

Great! Now let's turn this into a DataFrame

In [32]:
pd.DataFrame(tfidf)

Unnamed: 0,total,bill,for,this,horrible,service,over,8gs,these,crooks,...,restaurent,westisland,thn,dagwoods,doen,meme,depiction,simplification,taster,giardinara
0,0.110467,0.104166,0.039403,0.017506,0.096223,0.031863,0.056160,0.237176,0.077369,0.209006,...,,,,,,,,,,
1,,,0.001816,0.004842,,0.004407,0.007767,,,,...,,,,,,,,,,
2,,,0.013970,0.031033,,,,,,,...,,,,,,,,,,
3,,,0.007533,,,,,,,,...,,,,,,,,,,
4,,,0.006790,0.004022,,0.001830,0.003226,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20801,,,,,,0.035504,,,,,...,,,,,,,,,,
20802,,,,0.018965,,0.034518,,,,,...,,,,,,0.25694,,,,
20803,,,,,,,,,,,...,,,,,,,0.205552,0.205552,,
20804,,,,0.007421,,0.013507,,,,,...,,,,,,,,,0.100542,0.100542
