In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Read the labelled doctor-review corpus from disk into a DataFrame
reviews_csv_path = 'Reviews(13).csv'
data = pd.read_csv(reviews_csv_path)

In [12]:
# Show the loaded frame; the rendered output elides the middle rows and
# displays the 'review' text and its raw (not yet normalised) 'Label'.
data

Unnamed: 0,review,Label
0,I do not know how you can say he is a doctor,Negative
1,He is not a doctor. Just know how to charge money,negative
2,Alhmdulillah positive experience,positive
3,Glad to have good experience,positive
4,Fair review.,Neutral
...,...,...
24430,I booked an appointment for my friend for 12:...,Negative
24431,Not treated in a good way,Negative
24432,Doctor Noor Samad khan was a great doctor and ...,Positive
24433,I feel he is not competent in the field of sex...,Negative


In [13]:


## Normalise the sentiment labels
# The raw 'Label' column contains several spellings of the same class:
# lower-case variants (e.g. 'negative', 'positive'), values with stray
# surrounding whitespace, and the typo 'Netural'.  Trimming and capitalising
# every value, then fixing the typo by exact match, gives each class a single
# canonical spelling without relying on the order of chained substring
# replacements.
LABEL_TYPOS = {'Netural': 'Neutral'}
EXPECTED_LABELS = {'Negative', 'Neutral', 'Positive'}
RANDOM_STATE = 42

data['Label'] = (
    data['Label']
    .str.strip()        # drop leading/trailing whitespace
    .str.capitalize()   # 'negative' -> 'Negative'
    .replace(LABEL_TYPOS)  # whole-value replacement of known misspellings
)

# Fail loudly if the cleanup left any spelling we do not recognise, instead of
# silently training on an extra, spurious class.
unexpected_labels = set(data['Label'].dropna().unique()) - EXPECTED_LABELS
assert not unexpected_labels, (
    f'Unrecognised labels after cleanup: {sorted(unexpected_labels)}'
)

# Split the data into training (80%) and testing (20%) sets.  The raw text
# splits keep their own names so the vectorisation step below can be re-run
# without re-splitting.
reviews_train, reviews_test, y_train, y_test = train_test_split(
    data['review'], data['Label'], test_size=0.2, random_state=RANDOM_STATE
)

# Convert the text to bag-of-words count features.  The vocabulary is learned
# from the training reviews only and then applied unchanged to the test reviews.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(reviews_train)
X_test = vectorizer.transform(reviews_test)

# Initialize a Random Forest classifier with 100 trees
rfc = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Train the classifier on the training data
rfc.fit(X_train, y_train)

# Make predictions on the held-out test data
y_pred = rfc.predict(X_test)

# Calculate the support-weighted F1-score and the accuracy of the predictions
f1 = f1_score(y_test, y_pred, average='weighted')
acc = accuracy_score(y_test, y_pred)
print(f'F1-Score: {f1:.4f}')
print(f'Accuracy: {acc:.4f}')

# Print the per-class precision / recall / F1 report
report = classification_report(y_test, y_pred)
print(report)

# Make predictions on a few hand-written reviews, reusing the fitted
# vectorizer so they are encoded with the training vocabulary
new_reviews = [
    'This doctor was fantastic!',
    'I hated this hospital.',
    'Ok experience tha',
    'Bohat bura experience tha.'
]
new_reviews_transformed = vectorizer.transform(new_reviews)
new_predictions = rfc.predict(new_reviews_transformed)
print(new_predictions)


F1-Score: 0.9633
Accuracy: 0.9632
              precision    recall  f1-score   support

    Negative       0.95      0.97      0.96      1822
     Neutral       0.93      0.94      0.93       683
    Positive       0.99      0.97      0.98      2382

    accuracy                           0.96      4887
   macro avg       0.95      0.96      0.96      4887
weighted avg       0.96      0.96      0.96      4887

['Positive' 'Negative' 'Neutral' 'Negative']


In [14]:
# Class distribution of the 'Label' column (number of reviews per sentiment)
label_counts = data['Label'].value_counts()
label_counts

Positive    11730
Negative     9109
Neutral      3596
Name: Label, dtype: int64