In [1]:
#pip install gensim

In [2]:
import gensim

In [3]:
#INSTANTIATE THE GENSIM DOC2VEC MODEL
#CREATE VOCABULARY
#TRAIN THE GENSIM MODEL USING train_corpus
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html

In [4]:
#READING A DATA FILE (TAGS range from 0 to len(data))
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def read_corpus(fname, tokens_only=False):
    with open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


#THIS CREATES A TRAINING CORPUS
train_corpus = list(read_corpus("train.txt"))
test_corpus = list(read_corpus("test.txt", tokens_only=True))

In [5]:
train_corpus

[TaggedDocument(words=['we', 'stayed', 'at', 'the', 'schicago', 'hilton', 'for', 'days', 'and', 'nights', 'for', 'conference', 'have', 'to', 'say', 'normally', 'am', 'very', 'easy', 'going', 'about', 'amenities', 'cleanliness', 'and', 'the', 'like', 'however', 'our', 'experience', 'at', 'the', 'hilton', 'was', 'so', 'awful', 'am', 'taking', 'the', 'time', 'to', 'actually', 'write', 'this', 'review', 'truly', 'do', 'not', 'stay', 'at', 'this', 'hotel', 'when', 'we', 'arrived', 'in', 'our', 'room', 'it', 'was', 'clear', 'that', 'the', 'carpet', 'hadn', 'been', 'vacuumed', 'figuered', 'okay', 'it', 'just', 'the', 'carpet', 'until', 'saw', 'the', 'bathroom', 'although', 'the', 'bathroom', 'had', 'all', 'the', 'superficial', 'indicators', 'of', 'housekeeping', 'having', 'recently', 'cleaned', 'paper', 'band', 'across', 'the', 'toilet', 'paper', 'caps', 'on', 'the', 'drinking', 'glasses', 'etc', 'it', 'was', 'clear', 'that', 'no', 'actual', 'cleaning', 'took', 'place', 'there', 'was', 'spot'

In [6]:
test_corpus

[['recent',
  'stay',
  'at',
  'the',
  'james',
  'hotel',
  'chicago',
  'revealed',
  'the',
  'the',
  'recent',
  'updating',
  'at',
  'this',
  'hotel',
  'was',
  'facelift',
  'only',
  'although',
  'the',
  'rooms',
  'and',
  'public',
  'areas',
  'look',
  'nice',
  'the',
  'hotel',
  'still',
  'needs',
  'serious',
  'work',
  'our',
  'first',
  'room',
  'was',
  'not',
  'clean',
  'when',
  'we',
  'arrived',
  'and',
  'this',
  'non',
  'smoking',
  'room',
  'had',
  'serious',
  'odor',
  'of',
  'cigarette',
  'smoke',
  'we',
  'asked',
  'for',
  'new',
  'room',
  'which',
  'was',
  'clean',
  'and',
  'odor',
  'free',
  'we',
  'had',
  'to',
  'involve',
  'the',
  'manager',
  'before',
  'our',
  'request',
  'was',
  'granted',
  'the',
  'new',
  'room',
  'was',
  'better',
  'at',
  'first',
  'glance',
  'but',
  'was',
  'severely',
  'lacking',
  'we',
  'could',
  'not',
  'get',
  'the',
  'temperature',
  'properly',
  'regulated',
  'the',

In [7]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=30)

In [8]:
model.build_vocab(train_corpus)

In [9]:
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [10]:
print(model.infer_vector(['good', 'place', 'to', 'live', 'in','hotel']))

[ 0.03172754 -0.32195565 -0.16217731 -0.11892631 -0.0836172  -0.28811073
 -0.15342763 -0.21826561 -0.3744636   0.00147302 -0.00114095 -0.16027592
 -0.22321652 -0.0238859   0.3725392   0.01868436  0.07458221  0.17745832
 -0.18597281 -0.08175774 -0.14712557  0.17209361  0.41219017  0.10396248
 -0.20008846 -0.12950179 -0.13002111  0.18149842 -0.43447623 -0.24634382
  0.25377855 -0.15483057  0.02530701 -0.03119128 -0.16594793 -0.07437051
  0.11855969 -0.11318    -0.3635967  -0.15587224]


In [11]:
#Assessing the Model
ranks = []
second_ranks = []
for doc_id in range(len(train_corpus)):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [12]:
import collections

counter = collections.Counter(ranks)
print(counter)

Counter({0: 960, 1: 3})


In [13]:
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (962): «we just returned from girls shopping sight seeing trip the palmer house was like home away from home the hotel is undergoing renovations but we did not notice any inconveniences or hear any noise from the work the lobby was breathtaking all dressed up for christmas the staff from check in to housekeeping was very welcoming and friendly our room was updated with two beds and two separate baths which was great for everyone to have their own personal space would not hesitate to recommend this hotel to anyone chicago is great town»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d40,n5,w5,mc2,s0.001,t3):

MOST (962, 0.9003791809082031): «we just returned from girls shopping sight seeing trip the palmer house was like home away from home the hotel is undergoing renovations but we did not notice any inconveniences or hear any noise from the work the lobby was breathtaking all dressed up for christmas the staff from check in to housekeeping was very welcoming and friendly our

In [14]:
import random
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (343): «have stayed in the talbott few times and had always liked it this time was my first visit in years and was disappointed the hallways are now looking tired and the carpets need to be replaced had asked for corner room but instead was given small and very dark room looking out onto wall feet away with multiple noisy vents immediately outside the window the lights had to be kept on throughout the day even through the days were bright the bathroom was tiny and would be difficult to manage with people in the room the ac unit rumbled on permanently with background white noise even when switched off having stayed there before know there are better rooms but this one was the same price as the others have been in and the risk is getting it again other small things contributed the lobby brownies are no longer there were no cup lids for the lobby coffee so you couldn walk out with the coffee and got newspaper only on the last morning of night stay the construction noise was

In [15]:
#Testing the Model
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (106): «perfect location clean and courteous staff all added up to great stay its in an area with plenty of stores the mile and restaurants hotel lobby is on the th floor and the elevator to get there was annoyingly slow but overall would highly recommend this place regardless if your there with family friends or on business the subway to wrigley is block away from the front door»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d40,n5,w5,mc2,s0.001,t3):

MOST (884, 0.8026394844055176): «we really enjoyed our brief stay at the millennium knickerbocker from the front desk personnel and bellmen to the housekeeping staff and bartender everyone we met was helpful and welcoming our th floor room was clean and beautiful and the view looking north to the drake hotel and apartments across the street was nice the in room coffee service was appreciated too there isn lot of room in the bathroom for your toiletries but still take room in beautiful older hotel with character over spaci

In [16]:
#GET THE FEATURE VECTORS for a specific training instance in train_corpus using infer_vector()
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
#stack all the vectors using numpy.vstack() to obtain X (features for classifier)
#Read the labels in trainlabels.txt to obtain Y

In [17]:
file2 = open('trainlabels.txt', 'r')
lines2=file2.readlines()

In [18]:
y = []
for x in lines2:
    y.append(x.replace("\n", ""))
y

['1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1'

In [19]:
import numpy
X = numpy.vstack((train_corpus))

  return array(a, dtype, copy=False, order=order, subok=True)


In [20]:
X

array([[list(['we', 'stayed', 'at', 'the', 'schicago', 'hilton', 'for', 'days', 'and', 'nights', 'for', 'conference', 'have', 'to', 'say', 'normally', 'am', 'very', 'easy', 'going', 'about', 'amenities', 'cleanliness', 'and', 'the', 'like', 'however', 'our', 'experience', 'at', 'the', 'hilton', 'was', 'so', 'awful', 'am', 'taking', 'the', 'time', 'to', 'actually', 'write', 'this', 'review', 'truly', 'do', 'not', 'stay', 'at', 'this', 'hotel', 'when', 'we', 'arrived', 'in', 'our', 'room', 'it', 'was', 'clear', 'that', 'the', 'carpet', 'hadn', 'been', 'vacuumed', 'figuered', 'okay', 'it', 'just', 'the', 'carpet', 'until', 'saw', 'the', 'bathroom', 'although', 'the', 'bathroom', 'had', 'all', 'the', 'superficial', 'indicators', 'of', 'housekeeping', 'having', 'recently', 'cleaned', 'paper', 'band', 'across', 'the', 'toilet', 'paper', 'caps', 'on', 'the', 'drinking', 'glasses', 'etc', 'it', 'was', 'clear', 'that', 'no', 'actual', 'cleaning', 'took', 'place', 'there', 'was', 'spot', 'probab

# TRAIN THE CLASSIFIERS

# NeuralNetwork

In [21]:
from numpy import exp, array, random, dot


class NeuralNetwork():
    def __init__(self):
        # Seed the random number generator, so it generates the same numbers
        # every time the program runs.
        random.seed(1)

        # We model a single neuron, with 3 input connections and 1 output connection.
        # We assign random weights to a 3 x 1 matrix, with values in the range -1 to 1
        # and mean 0.
        self.synaptic_weights = 2 * random.random((3, 1)) - 1

    # The Sigmoid function, which describes an S shaped curve.
    # We pass the weighted sum of the inputs through this function to
    # normalise them between 0 and 1.
    def __sigmoid(self, x):
        return 1 / (1 + exp(-x))

    # The derivative of the Sigmoid function.
    # This is the gradient of the Sigmoid curve.
    # It indicates how confident we are about the existing weight.
    def __sigmoid_derivative(self, x):
        return x * (1 - x)

    # We train the neural network through a process of trial and error.
    # Adjusting the synaptic weights each time.
    def train(self, training_set_inputs, training_set_outputs, number_of_training_iterations):
        for iteration in xrange(number_of_training_iterations):
            # Pass the training set through our neural network (a single neuron).
            output = self.think(training_set_inputs)

            # Calculate the error (The difference between the desired output
            # and the predicted output).
            error = training_set_outputs - output

            # Multiply the error by the input and again by the gradient of the Sigmoid curve.
            # This means less confident weights are adjusted more.
            # This means inputs, which are zero, do not cause changes to the weights.
            adjustment = dot(training_set_inputs.T, error * self.__sigmoid_derivative(output))

            # Adjust the weights.
            self.synaptic_weights += adjustment

    # The neural network thinks.
    def think(self, inputs):
        # Pass inputs through our neural network (our single neuron).
        return self.__sigmoid(dot(inputs, self.synaptic_weights))


if __name__ == "__main__":

    #Intialise a single neuron neural network.
    neural_network = NeuralNetwork()

    print ("Random starting synaptic weights: ")
    print (neural_network.synaptic_weights)

    # The training set. We have 4 examples, each consisting of 3 input values
    # and 1 output value.
    training_set_inputs = X
    training_set_outputs = y



Random starting synaptic weights: 
[[-0.16595599]
 [ 0.44064899]
 [-0.99977125]]


In [22]:
# Import python libraries required in this example:
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np

# Use numpy arrays to store inputs (x) and outputs (y):
x = X
y = y 

# Define the network model and its arguments. 
# Set the number of neurons/nodes for each layer:
model = Sequential()
model.add(Dense(2, input_shape=(2,)))
model.add(Activation('sigmoid'))
model.add(Dense(1))
model.add(Activation('sigmoid')) 

# Compile the model and calculate its accuracy:
model.compile(loss='mean_squared_error', optimizer='sgd', metrics=['accuracy']) 

# Print a summary of the Keras model:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 2)                 6         
                                                                 
 activation (Activation)     (None, 2)                 0         
                                                                 
 dense_1 (Dense)             (None, 1)                 3         
                                                                 
 activation_1 (Activation)   (None, 1)                 0         
                                                                 
Total params: 9
Trainable params: 9
Non-trainable params: 0
_________________________________________________________________


In [23]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix 
import seaborn as sns
import matplotlib.pyplot as plt

from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.base import TransformerMixin 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
import string
import pickle

In [24]:
file2 = open('trainlabels.txt', 'r')
lines2=file2.readlines()
y1 = []
for x in lines2:
    y1.append(x.replace("\n", ""))
    
file3 = open('testlabels.txt', 'r')
lines3=file3.readlines()
y2 = []
for x in lines3:
    y2.append(x.replace("\n", ""))

In [25]:
import pandas as pd
df = pd.read_csv("train.txt", sep="\n",header=None)
df_test = pd.read_csv("test.txt", sep="\n",header=None)

In [26]:
df[1]=y1
df_test[1]=y2

In [27]:
df=df.rename(columns={0:"Review"})
df=df.rename(columns={1:"sentiment"})
df_test=df_test.rename(columns={0:"Review"})
df_test=df_test.rename(columns={1:"sentiment"})

In [28]:
df

Unnamed: 0,Review,sentiment
0,We stayed at the Schicago Hilton for 4 days an...,1
1,Hotel is located 1/2 mile from the train stati...,1
2,I made my reservation at the Hilton Chicago be...,1
3,"When most people think Hilton, they think luxu...",1
4,My husband and I recently stayed stayed at the...,1
...,...,...
958,I stayed at the Palmer House for @ $150/night ...,0
959,beautiful place with European charm. No compla...,0
960,It has been a couple of years since I stayed h...,0
961,Stayed here October 31 through November 5 for ...,0


In [29]:
df_test

Unnamed: 0,Review,sentiment
0,"A recent stay at the James Hotel-Chicago, reve...",1
1,I stayed at the Monaco-Chicago back in April. ...,1
2,I needed a place to stay for a business confer...,1
3,I just returned from a long weekend in Chicago...,1
4,This hotel is rather far from the airport and ...,1
...,...,...
117,My wife and I just returned from a 3 night sta...,0
118,Stayed here for 5 nights while visiting Chicag...,0
119,Just returned from a week in Chicago with the ...,0
120,We just returned from a weekend at the Palmer ...,0


In [30]:
X_train=df['Review']
X_test=df_test['Review']
y_train=df['sentiment']
y_test=df_test['sentiment']

In [31]:
#Text Operation

nlp = English()
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

In [32]:
#Transformation and Vectorization


class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [33]:
vectorizer = CountVectorizer(tokenizer = word_tokenize, ngram_range=(1,1)) 
tfvectorizer = TfidfVectorizer(tokenizer = word_tokenize)

# LogisticRegression

In [34]:
classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)   
LRpred = LRmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,LRpred)}')
print(f'Accuracy: {accuracy_score(y_test,LRpred)*100}%')
pickle.dump(LRmodel, open('LinearRegression_model.sav', 'wb'))
print('Logistic Regression trained Model Saved')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix:
[[56  9]
 [ 5 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.86      0.89        65
           1       0.85      0.91      0.88        57

    accuracy                           0.89       122
   macro avg       0.89      0.89      0.89       122
weighted avg       0.89      0.89      0.89       122

Accuracy: 88.52459016393442%
Logistic Regression trained Model Saved


# RandomForestClassifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
RFclassifier = RandomForestClassifier(n_estimators = 200)
RFmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', RFclassifier)])

# Train the Model
RFmodel.fit(X_train,y_train)   
RFpred = RFmodel.predict(X_test)
print(f'Confusion Matrix:\n{confusion_matrix(y_test,RFpred)}')
print(f'\nClassification Report:\n{classification_report(y_test,RFpred)}')
print(f'Accuracy: {accuracy_score(y_test,RFpred)*100}%')
pickle.dump(RFmodel, open('RandomForest_model.sav', 'wb'))
print('RandomForest trained Model Saved')

Confusion Matrix:
[[56  9]
 [ 4 53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.86      0.90        65
           1       0.85      0.93      0.89        57

    accuracy                           0.89       122
   macro avg       0.89      0.90      0.89       122
weighted avg       0.90      0.89      0.89       122

Accuracy: 89.34426229508196%
RandomForest trained Model Saved


In [36]:
#Tune parameters on the validation dataset
#Follow the same feature extraction procedure for validation data as you did for the training data 

In [37]:
# Validation Data
file4 = open('validationlabels.txt', 'r')
lines4=file4.readlines()
y3 = []
for x in lines4:
    y3.append(x.replace("\n", ""))

df_validation = pd.read_csv("validation.txt", sep="\n",header=None)
df_validation[1]=y3
df_validation=df_validation.rename(columns={0:"Review"})
df_validation=df_validation.rename(columns={1:"sentiment"})
X_val=df_validation['Review']
y_val=df_validation['sentiment']

In [38]:
classifier = LogisticRegression()
LRmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

# Train the Model
LRmodel.fit(X_train,y_train)   
LRpred = LRmodel.predict(X_val)
print(f'Confusion Matrix:\n{confusion_matrix(y_val,LRpred)}')
print(f'\nClassification Report:\n{classification_report(y_val,LRpred)}')
print(f'Accuracy: {accuracy_score(y_val,LRpred)*100}%')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion Matrix:
[[47 11]
 [ 8 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.81      0.83        58
           1       0.83      0.87      0.85        60

    accuracy                           0.84       118
   macro avg       0.84      0.84      0.84       118
weighted avg       0.84      0.84      0.84       118

Accuracy: 83.89830508474576%


In [39]:
from sklearn.ensemble import RandomForestClassifier
RFclassifier = RandomForestClassifier(n_estimators = 200)
RFmodel = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', RFclassifier)])

# Train the Model
RFmodel.fit(X_train,y_train)   
RFpred = RFmodel.predict(X_val)
print(f'Confusion Matrix:\n{confusion_matrix(y_val,RFpred)}')
print(f'\nClassification Report:\n{classification_report(y_val,RFpred)}')
print(f'Accuracy: {accuracy_score(y_val,RFpred)*100}%')

Confusion Matrix:
[[43 15]
 [ 8 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.74      0.79        58
           1       0.78      0.87      0.82        60

    accuracy                           0.81       118
   macro avg       0.81      0.80      0.80       118
weighted avg       0.81      0.81      0.80       118

Accuracy: 80.50847457627118%


In [40]:
#Evaluate precision, recall, f1-score on test dataset
#Follow the same feature extraction procedure as you did for the validation data 