In [212]:
import numpy as np
import pandas as pd

In [213]:
# Load the hotel-review CSV (path is relative to the notebook's working directory)
# and preview the first rows.
reviews_csv = 'tripadvisor_hotel_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## Data Understanding

In [214]:
# Summarise the frame: index range, column names, non-null counts, dtypes and memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20491 entries, 0 to 20490
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  20491 non-null  object
 1   Rating  20491 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 320.3+ KB


In [215]:
# (rows, columns) of the loaded frame.
df.shape

(20491, 2)

In [216]:
# Per-column dtypes.
df.dtypes

Review    object
Rating     int64
dtype: object

## Converting Ratings

In [217]:
# Collapse the star ratings into three integer classes:
#   rating <= 2 -> 0,   rating == 3 -> 1,   rating >= 4 -> 2
# All three rules are evaluated against the original ratings in a single pass, so the
# result does not depend on the order in which the rules are listed. The column is
# assigned back explicitly: `df['Rating'].mask(..., inplace=True)` is a chained in-place
# update on a column selection, which does not modify `df` when pandas Copy-on-Write is
# active.
# NOTE: this overwrites the raw ratings, so run it once per load; re-running it on the
# already-converted 0/1/2 values would map every row to 0.
stars = df['Rating']
df['Rating'] = np.select(
    [stars <= 2, stars == 3, stars >= 4],
    [0, 1, 2],
    default=stars.to_numpy(),  # values matching no rule (e.g. NaN) are left untouched
)

In [218]:
# Display the frame to check the converted Rating column.
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not 4* experience hotel monaco seat...,1
3,"unique, great stay, wonderful time hotel monac...",2
4,"great stay great stay, went seahawk game aweso...",2
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",2
20487,great location price view hotel great quick pl...,2
20488,"ok just looks nice modern outside, desk staff ...",0
20489,hotel theft ruined vacation hotel opened sept ...,0


## Checking for NaN values

In [219]:
# Number of missing values in each column.
df.isna().sum()

Review    0
Rating    0
dtype: int64

## Converting Text to Lower Case

In [220]:
# Lower-case every review and store the result back in the Review column.
df['Review'] = df['Review'].str.lower()

In [221]:
# Display the frame after lower-casing.
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not 4* experience hotel monaco seat...,1
3,"unique, great stay, wonderful time hotel monac...",2
4,"great stay great stay, went seahawk game aweso...",2
...,...,...
20486,"best kept secret 3rd time staying charm, not 5...",2
20487,great location price view hotel great quick pl...,2
20488,"ok just looks nice modern outside, desk staff ...",0
20489,hotel theft ruined vacation hotel opened sept ...,0


# Text Cleaning

In [222]:
import re

## Removing Non-Alphabetical Characters

In [223]:
# Keep only ASCII letters and whitespace in every review.
#
# URLs are dropped first: this step turns '.', ':' and '/' into spaces, so a URL pass
# that runs afterwards has nothing left to match.
# Every remaining character that is neither a letter nor whitespace becomes two spaces,
# so neighbouring words are not glued together; the surplus spaces are expected to be
# collapsed by a later whitespace-normalisation step.
# The vectorised `.str.replace` replaces a per-row loop that used chained indexing and
# rewrote the Rating value of every row along the way.
df['Review'] = (
    df['Review']
    .str.replace(r'https?://\S+|www\.\S+', ' ', regex=True)
    .str.replace(r'[^A-Za-z\s]', '  ', regex=True)
)
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not experience hotel monaco se...,1
3,unique great stay wonderful time hotel mon...,2
4,great stay great stay went seahawk game awes...,2
...,...,...
20486,best kept secret rd time staying charm not...,2
20487,great location price view hotel great quick pl...,2
20488,ok just looks nice modern outside desk staff...,0
20489,hotel theft ruined vacation hotel opened sept ...,0


## Removing Hyperlinks

In [224]:
# Delete anything shaped like `[http(s)://][www.]word.word` from every review
# (vectorised; replaces a per-row loop that used chained indexing).
# NOTE(review): the pattern requires a literal '.', so it only has an effect while the
# reviews still contain punctuation. Once the non-alphabetical clean-up above has run,
# no '.' is left and this pass leaves the text unchanged — URL stripping has to happen
# before punctuation is removed. The pattern is also broad: it matches any `word.word`
# token (e.g. `4.5`), not only URLs.
df['Review'] = df['Review'].str.replace(
    r'(https?://)?(www\.)?\w+\.\w+', '', regex=True
)
df

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not experience hotel monaco se...,1
3,unique great stay wonderful time hotel mon...,2
4,great stay great stay went seahawk game awes...,2
...,...,...
20486,best kept secret rd time staying charm not...,2
20487,great location price view hotel great quick pl...,2
20488,ok just looks nice modern outside desk staff...,0
20489,hotel theft ruined vacation hotel opened sept ...,0


## Collapsing Multiple Spaces

In [225]:
# Collapse every run of whitespace (spaces, tabs, newlines) into a single space.
# Vectorised `.str.replace` replaces a per-row loop that used chained indexing and
# rewrote the Rating value of every row.
df['Review'] = df['Review'].str.replace(r'\s+', ' ', regex=True)
print("Final Data Frame: \n")
df

Final Data Frame: 



Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,2
1,ok nothing special charge diamond member hilto...,0
2,nice rooms not experience hotel monaco seattle...,1
3,unique great stay wonderful time hotel monaco ...,2
4,great stay great stay went seahawk game awesom...,2
...,...,...
20486,best kept secret rd time staying charm not sta...,2
20487,great location price view hotel great quick pl...,2
20488,ok just looks nice modern outside desk staff n...,0
20489,hotel theft ruined vacation hotel opened sept ...,0
