train a LR model based on ratings to predict how likely a certain type of Visitor is to leave a positive review about an attraction place

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('singapore_trip_advisor_data.csv')

In [3]:
df.sample(5)

Unnamed: 0,REVIEW_INDEX,REVIEW_DATE,REVIEW_RATING,REVIEW_TITLE,REVIEW_BODY,DATE_OF_EXPERIENCE,TRIP_TYPE,REVIEW_CRAWLED_TIME,REVIEWER_NAME,HOME_COUNTRY,ATTRACTION_NAME,ATTRACTION_TYPE,ADDRESS
50849,544519678,2017-12-02 00:00:00.000,5,Icon for Singapore,The statue is on the Singapore River and open ...,2017-11-01 00:00:00.000,Friends,2020-01-30 02:15:24.906,peach8054,United States,Merlion Park,"Art, History & Culture","1 Fullerton Road One Fullerton | Merlion Park,..."
95340,420121541,2016-09-19 00:00:00.000,5,Wow,This is the best zoo by far. It is simply the ...,2016-09-01 00:00:00.000,Friends,2020-01-26 13:18:00.898,Mel C,,Singapore Zoo,Nature & Wildlife,"80 Mandai Lake Road, Singapore 729826, Singapore"
122667,343468869,2016-01-29 00:00:00.000,5,Orchid lover's paradise,Located in the heart of The Singapore Botanic ...,2016-01-01 00:00:00.000,Couples,2020-01-29 11:08:01.645,ccelan,United States,National Orchid Garden,Leisure & Recreation,1 Cluny Rd | Within the Singapore Botanic Gard...
117122,356135552,2016-03-17 00:00:00.000,5,Must go place in Singapore,This is one of the most happening places in to...,2016-02-01 00:00:00.000,Couples,2020-01-30 01:25:39.824,Rahulchopra-1212,India,Clarke Quay,Precinct & Street,3 River Valley Road Clarke Quay | Nearest Trai...
58961,520760643,2017-09-03 00:00:00.000,4,A fun family experience,"We took about 10 family members on the flyer, ...",2016-12-01 00:00:00.000,Family,2020-01-28 11:28:34.646,StephNOwen,United States,Singapore Flyer,Leisure & Recreation,"30 Raffles Avenue, Singapore 039803, Singapore"


In [4]:
# defining a positive review and non positive review

# choosing rating 4 and 5 as a proxy for positive experience

# and rating 3,2,1 as a non-positive experience

In [5]:
# adding a experience column based on the above
def positive_experience_segregator(rating_score):
    if rating_score in [4,5]:
        experience = "positive"
    else:
        experience = "non-positive"
    return experience

In [6]:
df['experience'] = df['REVIEW_RATING'].apply(positive_experience_segregator)

In [7]:
########
#### model 1
########
# features used - rating, trip type, home country, attraction name
# target - experience

In [8]:
# dropping rows with na in home country
df1 = df[df['HOME_COUNTRY'].notna()]

In [9]:
df1.shape

(103813, 14)

In [10]:
X = df1[['REVIEW_RATING','TRIP_TYPE','HOME_COUNTRY','ATTRACTION_NAME']]
y = df1['experience']

In [11]:
X_new = pd.get_dummies(X,drop_first=True)

In [12]:
from sklearn.model_selection import train_test_split

train_ratio = 0.6
test_ratio = 0.2
val_ratio = 0.2

# 
X_train_val,X_test,y_train_val,y_test = train_test_split(X_new,y,test_size = test_ratio,stratify = y)
X_train,X_val,y_train,y_val = train_test_split(X_train_val,y_train_val,test_size=val_ratio/(val_ratio+train_ratio),stratify = y_train_val)

In [13]:
len(X_train),len(X_val),len(X_test)

(62287, 20763, 20763)

In [14]:
from sklearn.linear_model import LogisticRegression

In [15]:
lr = LogisticRegression()

In [16]:
lr.fit(X_train,y_train)

LogisticRegression()

In [17]:
lr.score(X_train,y_train)

1.0

In [18]:
lr.score(X_val,y_val)

1.0

In [19]:
lr.score(X_test,y_test)

1.0

In [20]:
lr.predict(X_test)

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'non-positive'], dtype=object)

In [21]:
###########
#### model 2
###########

In [22]:
# features used -  trip type, home country, attraction name
# target - rating

In [23]:
X = df1[['TRIP_TYPE','HOME_COUNTRY','ATTRACTION_NAME']]
y = df1['REVIEW_RATING']

In [24]:
X_new = pd.get_dummies(X,drop_first=True)

In [25]:
X_train_val,X_test,y_train_val,y_test = train_test_split(X_new,y,test_size = test_ratio,stratify = y)
X_train,X_val,y_train,y_val = train_test_split(X_train_val,y_train_val,test_size=val_ratio/(val_ratio+train_ratio),stratify = y_train_val)

In [26]:
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [27]:
lr.score(X_train,y_train)

0.6347391911634852

In [28]:
lr.score(X_val,y_val)

0.6279439387371767

In [29]:
lr.score(X_test,y_test)

0.6293406540480663

In [30]:
from sklearn.metrics import classification_report,confusion_matrix

In [31]:
print(classification_report(y_test,lr.predict(X_test)))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       175
           2       0.00      0.00      0.00       296
           3       0.00      0.00      0.00      1509
           4       0.41      0.13      0.20      5740
           5       0.65      0.94      0.77     13043

    accuracy                           0.63     20763
   macro avg       0.21      0.22      0.19     20763
weighted avg       0.52      0.63      0.54     20763



In [37]:
print(confusion_matrix(y_test,lr.predict(X_test)))

[[    0     0     0    30   145]
 [    1     0     1    48   246]
 [    0     0     0   272  1237]
 [    1     0     2   762  4975]
 [    1     1     6   730 12305]]
