In [1]:
import pandas as pd
import numpy as np  

In [36]:
# Load the training reviews from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="train.csv")

In [37]:
# Dimensions of the freshly loaded training frame as a (rows, columns) tuple.
data.shape

(38932, 5)

In [38]:
# Preview five randomly drawn rows; no random_state is passed, so the rows
# shown change from run to run.
data.sample(5)

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used,Is_Response
10509,id20835,We stayed at this hotel after we stayed at the...,Edge,Desktop,not happy
33015,id43341,This is a great place if you want to be part o...,Google Chrome,Desktop,happy
3181,id13507,High end hotel adjoining Houston Bush Int'l Ai...,Mozilla,Mobile,happy
110,id10436,"The hotel is pleasant, but a little dated, and...",Edge,Mobile,not happy
9253,id19579,I had booked our room online and was a little ...,Edge,Desktop,happy


In [39]:
# Count how many reviews carry each label in the target column.
data['Is_Response'].value_counts()


happy        26521
not happy    12411
Name: Is_Response, dtype: int64

In [40]:
# Drop the identifier and browser/device columns, which are not used for modelling.
# drop() returns a new frame and errors='ignore' keeps the cell re-runnable:
# the previous in-place drop raised KeyError on a second execution because the
# columns were already gone.
data = data.drop(columns=['User_ID', 'Browser_Used', 'Device_Used'], errors='ignore')

In [41]:
# Show the first rows to confirm which columns remain after the drop.
data.head()

Unnamed: 0,Description,Is_Response
0,The room was kind of clean but had a VERY stro...,not happy
1,I stayed at the Crown Plaza April -- - April -...,not happy
2,I booked this hotel through Hotwire at the low...,not happy
3,Stayed here with husband and sons on the way t...,happy
4,My girlfriends and I stayed here to celebrate ...,not happy


In [42]:
# Rename the raw labels to positive/negative.
# replace() leaves values that are not keys of the mapping untouched, so the
# cell is idempotent. The previous .map() call turned every label into NaN
# when the cell was executed a second time, because "positive"/"negative"
# are not keys of the mapping.
label_names = {"happy": "positive", "not happy": "negative"}
data["Is_Response"] = data["Is_Response"].replace(label_names)
data.sample(5)

Unnamed: 0,Description,Is_Response
22204,This hotel is fair at best. It is definitely a...,negative
27692,My daughter and I stayed at this hotel a coupl...,positive
4736,My husband and I stayed for only one night dur...,negative
34585,My mother and I stayed at the Warwick for - ni...,negative
17435,"The renovated hotel is nice, but located very ...",negative


In [43]:
#Cleaning text
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer

# Module-level tokenizer instance; process_text() uses it for the final token split.
tokenizer = WordPunctTokenizer()
# Regex fragments consumed by process_text() to strip noise from a review.
# \S+ (instead of [^ ]+) stops a match at ANY whitespace, so a URL that is
# followed by a newline or tab no longer swallows the text after it.
twitter_handle = r'@[A-Za-z0-9_]+'                         # Twitter handles (@username)
url_handle = r'http\S+'                                    # anything from 'http' to the next whitespace (http:// and https:// URLs)
combined_handle = r'|'.join((twitter_handle, url_handle))  # handle OR URL, removed in one pass
www_handle = r'www\.\S+'                                   # bare URLs starting with a literal 'www.' (dot escaped)
punctuation_handle = r'\W+'                                # runs of non-word characters

In [44]:
from nltk.corpus import stopwords
# English stop words held in a set; process_text() tests each word for membership.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally — confirm the environment provides it.
stop_words= set(stopwords.words('english'))

In [45]:
def process_text(text):
    """Normalise one raw review into a lower-cased, space-separated token string.

    Steps, in order: strip markup with BeautifulSoup, remove Twitter handles
    and URLs (``combined_handle``, then ``www_handle``), lower-case, replace
    every run of non-word characters (``punctuation_handle``) with a single
    space, drop words found in ``stop_words``, tokenise with ``tokenizer`` and
    keep only tokens longer than one character.

    Parameters
    ----------
    text : str
        Raw review text. ``None`` and float NaN (how pandas represents a
        missing cell) are treated as an empty review.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` when nothing is left.
    """
    # Missing values are mapped to an empty review instead of being handed to
    # BeautifulSoup, which expects markup text.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    souped = BeautifulSoup(text, 'lxml').get_text()

    # Best-effort BOM / replacement-character clean-up. A Python 3 ``str`` has
    # no ``.decode`` attribute, so the AttributeError branch is the normal path
    # there. Only the expected failures are caught; the former bare ``except``
    # also swallowed KeyboardInterrupt and genuine programming errors.
    try:
        text = souped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except (AttributeError, UnicodeError):
        text = souped

    # Same substitutions as before, unrolled: handles/URLs -> www links ->
    # lower-case -> punctuation runs collapsed to one space.
    without_handles = re.sub(combined_handle, '', text)
    without_links = re.sub(www_handle, '', without_handles).lower()
    cleaned_text = re.sub(punctuation_handle, " ", without_links)

    # Stop words are filtered on whitespace-split words before tokenising.
    cleaned_text = ' '.join(word for word in cleaned_text.split() if word not in stop_words)

    # Single-character tokens are discarded.
    tokens = [word for word in tokenizer.tokenize(cleaned_text) if len(word) > 1]
    return " ".join(tokens).strip()

In [46]:
# Smoke-test process_text() on a sample that mixes upper-case words, a run of
# punctuation, a 'www.' link, an @handle and an https URL.
example_text = "hahaha if above a ----'-' www.adasd apakah SAYA ingin pergi pada tanggal 15 bulan februari besok ? tidak karena hari kemarin @twitter suka main https://www.twitter.com"

process_text(example_text)

'hahaha apakah saya ingin pergi pada tanggal 15 bulan februari besok tidak karena hari kemarin suka main'

In [47]:

# Clean every training review and attach the result as the 'clean_text' column.
# A plain list is assigned positionally, so the values line up row-for-row even
# if `data` does not have a default 0..n-1 index, and assign() overwrites an
# existing 'clean_text' column. The previous pd.concat(axis=1) aligned on index
# labels and appended a second 'clean_text' column each time the cell was re-run.
cleaned_text = [process_text(text) for text in data.Description]
data = data.assign(clean_text=cleaned_text)

data.sample(10)

Unnamed: 0,Description,Is_Response,clean_text
1897,Location was excellent and rooms were very nic...,positive,location excellent rooms nice loved overall ex...
5431,Before I arrived at the Hudson I was a little ...,positive,arrived hudson little worried expect read revi...
21341,Check out the prepay deals -- a great bargain ...,positive,check prepay deals great bargain sure plans ho...
31704,We've stayed for four days in this hotel. In t...,negative,stayed four days hotel entrance seems beaultif...
4166,It's all been said in one way or another in th...,positive,said one way another previous reviews take bro...
33575,Advertised as -- min walk to white house but a...,negative,advertised min walk white house actually min w...
6996,While holding an event at this property I foun...,positive,holding event property found nothing good thin...
9232,When we arrived we werre greeted by some of th...,positive,arrived werre greeted friendly doorman come ac...
16003,I cannot praise the Lenox Staff enough for the...,positive,cannot praise lenox staff enough delightful ti...
13360,We stayed at California Suites Hotel for - nig...,positive,stayed california suites hotel nights pleased ...


In [49]:
# Features are the cleaned review text; the target is the relabelled response.
input_train, output_train = data['clean_text'], data['Is_Response']

In [67]:
from sklearn.model_selection import train_test_split

In [69]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_train,
    output_train,
    test_size=0.1,
    random_state=47,
)

In [73]:

# Number of held-out reviews (features).
len(x_test)

3894

In [72]:
# Number of held-out labels; should equal len(x_test).
len(y_test)

3894

In [74]:
# Number of reviews (features) used for fitting.
len(x_train)

35038

In [75]:
# Number of training labels; should equal len(x_train).
len(y_train)


35038

In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [56]:
# We will train this project's model by vectorizing the text with TF-IDF and classifying it with Logistic Regression and a Random Forest classifier (RFC).

In [76]:
# Instantiate the TF-IDF vectorizer and a random-forest classifier, both with
# their default settings.
tfv, rfc = TfidfVectorizer(), RandomForestClassifier()


In [65]:
# Build the model pipeline: TF-IDF features feeding a logistic-regression classifier.

from sklearn.pipeline import Pipeline

# `lgc` was not assigned in any visible cell, so on a fresh kernel the Pipeline
# line raised NameError. Create the classifier here; the default settings match
# the LogisticRegression() shown in the fitted pipeline's repr.
lgc = LogisticRegression()

# The step name (including its spelling) is kept as-is: it is the runtime key
# under which the classifier is exposed in the pipeline.
model = Pipeline([('vectorizer', tfv), ('Logictic_regression', lgc)])



In [77]:
# Fit the whole pipeline (vectorizer, then classifier) on the training split only.
model.fit(x_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('Logictic_regression', LogisticRegression())])

In [78]:
# Predict labels for the held-out reviews; `pred` feeds the metric cells.
pred = model.predict(x_test)

In [83]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [82]:

# Confusion matrix for the held-out split: true labels against predicted labels.
confusion_matrix(y_true=y_test, y_pred=pred)

array([[ 947,  297],
       [ 176, 2474]], dtype=int64)

In [84]:
# classification_report() returns one multi-line string. Print it so the table
# renders with real line breaks; as a bare expression the notebook shows the
# string's repr with escaped '\n' sequences, which is hard to read.
print(classification_report(y_test, pred))

'              precision    recall  f1-score   support\n\n    negative       0.84      0.76      0.80      1244\n    positive       0.89      0.93      0.91      2650\n\n    accuracy                           0.88      3894\n   macro avg       0.87      0.85      0.86      3894\nweighted avg       0.88      0.88      0.88      3894\n'

In [85]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=pred)

0.8785310734463276

In [86]:
# Predict the sentiment of an ad-hoc sentence.
example_text = ["I'm very happy now"]

# The pipeline was fitted on process_text() output, so new text goes through
# the same cleaning before prediction; feeding raw text would give the
# vectorizer input in a different form from what it was trained on.
example_result = model.predict([process_text(text) for text in example_text])



In [87]:
# Display the predicted label array for the example sentence.
example_result

array(['positive'], dtype=object)

In [89]:
# Load the unlabelled reviews to score from the CSV file in the working directory.
data_t = pd.read_csv(filepath_or_buffer="test.csv")

In [90]:
# Preview the first rows of the raw test frame.
data_t.head()

Unnamed: 0,User_ID,Description,Browser_Used,Device_Used
0,id80132,Looking for a motel in close proximity to TV t...,Firefox,Mobile
1,id80133,Walking distance to Madison Square Garden and ...,InternetExplorer,Desktop
2,id80134,Visited Seattle on business. Spent - nights in...,IE,Tablet
3,id80135,This hotel location is excellent and the rooms...,Edge,Mobile
4,id80136,This hotel is awesome I love the service Antho...,Mozilla,Mobile


In [92]:
# Drop the browser/device columns; User_ID stays so each prediction can be tied
# back to its review. drop() returns a new frame and errors='ignore' keeps the
# cell re-runnable: the previous in-place drop raised KeyError on a second
# execution because the columns were already gone.
data_t = data_t.drop(columns=['Browser_Used', 'Device_Used'], errors='ignore')

In [94]:
# Confirm which columns remain in the test frame after the drop.
data_t.head()

Unnamed: 0,User_ID,Description
0,id80132,Looking for a motel in close proximity to TV t...
1,id80133,Walking distance to Madison Square Garden and ...
2,id80134,Visited Seattle on business. Spent - nights in...
3,id80135,This hotel location is excellent and the rooms...
4,id80136,This hotel is awesome I love the service Antho...


In [96]:
# Clean every test review with the same process_text() used for training and
# attach the result as the 'clean_text' column.
# A plain list is assigned positionally, so the values line up row-for-row even
# if `data_t` does not have a default 0..n-1 index, and assign() overwrites an
# existing 'clean_text' column. The previous pd.concat(axis=1) aligned on index
# labels and appended a second 'clean_text' column each time the cell was re-run.
cleaned_text = [process_text(text) for text in data_t.Description]
data_t = data_t.assign(clean_text=cleaned_text)


In [97]:
# Compare the raw Description with its cleaned counterpart for the first rows.
data_t.head()

Unnamed: 0,User_ID,Description,clean_text
0,id80132,Looking for a motel in close proximity to TV t...,looking motel close proximity tv taping dr phi...
1,id80133,Walking distance to Madison Square Garden and ...,walking distance madison square garden penn su...
2,id80134,Visited Seattle on business. Spent - nights in...,visited seattle business spent nights vintage ...
3,id80135,This hotel location is excellent and the rooms...,hotel location excellent rooms clean suite one...
4,id80136,This hotel is awesome I love the service Antho...,hotel awesome love service anthony really grea...


In [98]:
# Score the cleaned test reviews with the fitted pipeline and store the
# predicted labels in the 'output' column.
test_features = data_t['clean_text']
data_t['output'] = model.predict(test_features)

In [99]:
# Preview the first test reviews together with their predicted labels.
data_t.head()

Unnamed: 0,User_ID,Description,clean_text,output
0,id80132,Looking for a motel in close proximity to TV t...,looking motel close proximity tv taping dr phi...,negative
1,id80133,Walking distance to Madison Square Garden and ...,walking distance madison square garden penn su...,positive
2,id80134,Visited Seattle on business. Spent - nights in...,visited seattle business spent nights vintage ...,positive
3,id80135,This hotel location is excellent and the rooms...,hotel location excellent rooms clean suite one...,negative
4,id80136,This hotel is awesome I love the service Antho...,hotel awesome love service anthony really grea...,positive
