## Data Visualization

In [1]:
#import packages
import pandas as pd

In [2]:
df = pd.read_csv('data/new_data.csv')
print("File loaded successfully.")
print("Preview of the data:")
df.head()

File loaded successfully.
Preview of the data:


Unnamed: 0,Description,Is_Response
0,Booked at the Marrakech since it was the only ...,0
1,booking did not happen booked hotel trip advis...,0
2,My husband and I just left the Hilton Times Sq...,1
3,"thank god beautiful beaches, husband traveled ...",0
4,My stay here was brief but service was excepti...,1


In [3]:
print('Dataset Info:')
print(df.info())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32250 entries, 0 to 32249
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  32250 non-null  object
 1   Is_Response  32250 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 504.0+ KB
None


In [4]:
# checking the distribution of the 'Is_Response' column
print("Distribution of sentiments (happy vs. not happy, 1 v/s 0):")
print(df['Is_Response'].value_counts())
print("\n" + "="*50 + "\n")

Distribution of sentiments (happy vs. not happy, 1 v/s 0):
Is_Response
1    16625
0    15625
Name: count, dtype: int64




In [5]:
# missing values
print("Checking for missing values in each column:")
print(df.isnull().sum())

Checking for missing values in each column:
Description    0
Is_Response    0
dtype: int64


In [6]:
# Add word count and character count columns
df["word_count"] = df["Description"].apply(lambda x: len(str(x).split()))
df["char_count"] = df["Description"].apply(lambda x: len(str(x)))

# Group by class and calculate averages
eda_summary = df.groupby("Is_Response")[["word_count", "char_count"]].mean().reset_index()
eda_summary

Unnamed: 0,Is_Response,word_count,char_count
0,0,175.751552,997.579584
1,1,121.330707,720.532391


In [9]:
# Calculate mean, mode, min, and max for word_count and char_count per class
eda_detailed = df.groupby("Is_Response").agg({
    "word_count": ["mean", lambda x: x.mode().iloc[0] if not x.mode().empty else None, "min", "max"],
    "char_count": ["mean", lambda x: x.mode().iloc[0] if not x.mode().empty else None, "min", "max"]
})

# Rename columns for clarity
eda_detailed.columns = ["_".join([col[0], col[1] if isinstance(col[1], str) else "mode"]) 
                        for col in eda_detailed.columns]
eda_detailed = eda_detailed.reset_index()

In [10]:
eda_detailed

Unnamed: 0,Is_Response,word_count_mean,word_count_<lambda_0>,word_count_min,word_count_max,char_count_mean,char_count_<lambda_0>,char_count_min,char_count_max
0,0,175.751552,84,5,2289,997.579584,514,30,14340
1,1,121.330707,40,5,1780,720.532391,216,28,12738


## Data Preprocessing

In [8]:
#impoting nltk libraries for preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [9]:
# an workaround for ssl certificate verification
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [10]:
#downloading necessary nltk data files
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/shirsa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
nltk.data.path #check path of nltk data
nltk.download('punkt_tab') # punkt tokenizer model

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/shirsa/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [13]:
# Loading the dataset again here
df = pd.read_csv('data/new_data.csv')

# Initializing the lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


In [14]:
def preprocess_text(text):
    """
    Cleans and preprocesses a single piece of text.
    """
    # 1. Lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    
    # 2. Tokenize
    tokens = word_tokenize(text)
    
    # 3. Lemmatize and remove stop words
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and len(word) > 1]
    
    return " ".join(processed_tokens)

In [15]:
print("\nPreprocessing on the 'Description' column...")
# Applying the function to the desired 'Description' column.
df['Cleaned_Description'] = df['Description'].apply(preprocess_text) #creating a new column for comparison
print("Preprocessing complete.")


Preprocessing on the 'Description' column...
Preprocessing complete.


In [16]:
# Display the comparison
print("\nOriginal vs. cleaned reviews:")
df[['Description', 'Cleaned_Description']].head()


Original vs. cleaned reviews:


Unnamed: 0,Description,Cleaned_Description
0,Booked at the Marrakech since it was the only ...,booked marrakech since hotel could find night ...
1,booking did not happen booked hotel trip advis...,booking happen booked hotel trip advisor sugge...
2,My husband and I just left the Hilton Times Sq...,husband left hilton time square fabulous stay ...
3,"thank god beautiful beaches, husband traveled ...",thank god beautiful beach husband traveled dom...
4,My stay here was brief but service was excepti...,stay brief service exceptional left clothes re...


In [17]:
# Saving preprocessed data
processed_file_path = 'data/third_hotel_reviews.csv'
df.to_csv(processed_file_path, index=False)

print(f"\nSuccessfully saved the fully preprocessed data to '{processed_file_path}'")


Successfully saved the fully preprocessed data to 'data/third_hotel_reviews.csv'
