## Importing all the initial necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Importing the dataset

In [None]:
# Read the raw hospital survey CSV into a DataFrame.
hospital_csv = '../Learning/data/Hospital.csv'
data = pd.read_csv(hospital_csv)

In [None]:
# Sample view of the original dataset: the first five rows
# (the default for head() when no count is given).

data.head()

In [None]:
# List all column labels of the raw dataset; only a few of them are kept
# for the analysis further down.

data.columns

In [None]:
# Size of the dataset as (rows, columns), before any cleaning is applied.

data.shape

In [None]:
# Analyzing the output variable of the dataset: list the distinct values
# found in the star-rating column.

data['Patient Survey Star Rating'].unique()

In [None]:
# Cleaning: drop every row whose star rating is one of the placeholder
# labels instead of a number, then show the resulting (rows, columns).

placeholder_labels = ['Not Available', 'Not Applicable']
has_no_rating = data['Patient Survey Star Rating'].isin(placeholder_labels)
data = data.drop(data[has_no_rating].index)
data.shape

In [None]:
# Keep only the columns the analysis needs: the survey question, the answer
# text, and the star rating.

required_columns = [
    'HCAHPS Question',
    'HCAHPS Answer Description',
    'Patient Survey Star Rating',
]
data = data[required_columns]

In [None]:
# Analyzing the data count for each rating value in the dataset:
# number of non-null answer descriptions per star rating.

data.groupby('Patient Survey Star Rating')['HCAHPS Answer Description'].count()

In [None]:
# Distinct star-rating values left after the placeholder rows were dropped.
data['Patient Survey Star Rating'].unique()

In [None]:
# Same per-rating count as computed two cells earlier; `data` has not been
# modified in between.
data.groupby('Patient Survey Star Rating')['HCAHPS Answer Description'].count()

In [None]:
# Visual analysis: histogram of how many rows fall under each star rating.

import plotly.express as px

rating_histogram = px.histogram(data, x = 'Patient Survey Star Rating')
rating_histogram.show()

## Importing the NLTK libraries

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS

### Generating the Word Cloud of the original input data

In [None]:
# Note: this rebinds `stopwords` (imported from nltk.corpus in the previous
# cell) to wordcloud's built-in stop-word set; later cells rely on this set.
stopwords = set(STOPWORDS)

# One long string made of every answer description, separated by spaces.
all_descriptions = data['HCAHPS Answer Description']
text = " ".join(all_descriptions)
cloud = WordCloud(stopwords = stopwords).generate(text)

# Render the cloud as an image without axis ticks or frame.
_, ax = plt.subplots()
ax.imshow(cloud, interpolation = 'bilinear')
ax.axis("off")
plt.show()

In [None]:
# Checking the data type and non-null count of each remaining column.

data.info()

In [None]:
# Changing the data type of the output column: cast the star rating to int
# so it can be compared numerically when the sentiment label is derived.
# astype(int) raises if any value is not an integer literal.

data['Patient Survey Star Rating'] = data['Patient Survey Star Rating'].astype(int)

In [None]:
# Derive a binary sentiment label from the star rating:
# rating >= 3 -> +1 (positive), anything lower -> -1 (negative).
is_three_or_more = data['Patient Survey Star Rating'].ge(3)
data['Sentiment'] = is_three_or_more.map({True: 1, False: -1})
data.head(10)

In [None]:
# Split the rows into two frames according to their sentiment label.

sentiment = data['Sentiment']
positive = data[sentiment.eq(1)]
negative = data[sentiment.eq(-1)]

In [None]:
# Sanity check: (rows, columns) of the positive frame, then of the negative one.

for sentiment_frame in (positive, negative):
    print(sentiment_frame.shape)

In [None]:
# Number of rows per sentiment label (-1 / +1), to see the class balance.
data.groupby('Sentiment')['Sentiment'].count()

In [None]:
# Word cloud built from the answer descriptions of the positive-sentiment rows.

positive_descriptions = positive['HCAHPS Answer Description']
pos = " ".join(positive_descriptions)
fig1 = WordCloud(stopwords = stopwords).generate(pos)

# Render the cloud as an image without axis ticks or frame.
_, ax = plt.subplots()
ax.imshow(fig1, interpolation = 'bilinear')
ax.axis("off")
plt.show()

In [None]:
# Word Cloud of the data in negative dataframe.
# The text is joined from the `negative` frame and the cloud is generated
# from that string, so this plot shows the negative-sentiment descriptions
# rather than repeating the positive ones.

neg = " ".join(review for review in negative['HCAHPS Answer Description'])
fig2 = WordCloud(stopwords = stopwords).generate(neg)

plt.imshow(fig2, interpolation = 'bilinear')
plt.axis("off")
plt.show()

In [None]:
# Cleaning the data by removing a fixed set of punctuation characters

def remove_punctuations(text):
    """Return ``text`` with the characters ``? - , . : ;`` removed.

    Only those six characters are dropped; every other character (letters,
    digits, whitespace, quotes, slashes, ...) is kept in its original order.
    """
    unwanted = ("?", "-", ",", ".", ":", ";")
    kept = [ch for ch in text if ch not in unwanted]
    return "".join(kept)


# Strip the selected punctuation characters from every answer description,
# then preview the first ten rows of the cleaned frame.
cleaned_descriptions = data['HCAHPS Answer Description'].apply(remove_punctuations)
data['HCAHPS Answer Description'] = cleaned_descriptions
data.head(10)

In [None]:
# Keep only the text feature and the target label for modelling.
# .copy() makes `sample` an independent frame, so adding a column to it later
# does not write into a slice of `data` (which triggers pandas'
# SettingWithCopyWarning).
sample = data[['HCAHPS Answer Description','Sentiment']].copy()

In [None]:
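# randomsample.head()  # leftover scratch line: `randomsample` is not defined in the cells shown (possibly `sample.head()` was meant)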

In [None]:
# Randomly dividing the data into training (~80%) and testing (~20%) sets

import random  # kept from the original cell; the split itself only uses NumPy

# Draw one uniform number in [0, 1) per row so that the 0.8 cut-off really
# corresponds to an ~80/20 split. (np.random.randn samples a standard normal,
# for which 0.8 is just an arbitrary point on the bell curve.) The fixed seed
# keeps the split, and every metric computed from it, reproducible across
# kernel restarts.
rng = np.random.default_rng(42)

# assign() returns a new frame, so the helper column is never written into a
# slice of another DataFrame.
sample = sample.assign(random_number = rng.random(len(sample.index)))

train = sample[sample['random_number'] <= 0.8]
test = sample[sample['random_number'] > 0.8]

In [None]:
# (rows, columns) of the training split, then of the testing split.
for split_frame in (train, test):
    print(split_frame.shape)

In [None]:
# Using the Count vectorizer for counting each word occurrence

from sklearn.feature_extraction.text import CountVectorizer

# The token pattern keeps every run of word characters, including
# single-character tokens.
vectorizer = CountVectorizer(token_pattern = r'\b\w+\b')

train_text = train['HCAHPS Answer Description']
test_text = test['HCAHPS Answer Description']

# The vocabulary is learned from the training text only and then reused to
# encode the test text.
train_matrix = vectorizer.fit_transform(train_text)
test_matrix = vectorizer.transform(test_text)

In [None]:
# Using Logistic Regression as the prediction model.
# The estimator is created with its default hyperparameters (no arguments).

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

In [None]:
# Features are the word-count matrices; targets are the sentiment labels of
# the matching split.
x_train, x_test = train_matrix, test_matrix
y_train, y_test = train['Sentiment'], test['Sentiment']

In [None]:
# Train the classifier on the training word counts and their sentiment labels.
lr.fit(x_train, y_train)

In [None]:
# Predict a sentiment label for every row of the held-out test matrix.
predictions = lr.predict(x_test)

In [None]:
# Testing the accuracy of the model using a confusion matrix

from sklearn.metrics import confusion_matrix, classification_report

# scikit-learn expects (y_true, y_pred): rows of the result are the true
# labels and columns are the predicted labels. Passing the predictions first
# would transpose the matrix and swap false positives with false negatives.
confusion_matrix(y_test, predictions)

In [None]:
# Ground truth first, predictions second (y_true, y_pred), so that precision
# and recall are reported for the correct class instead of being exchanged.
print(classification_report(y_test, predictions))

### As per the report, the model has 75% accuracy. All of the positive reviews have been predicted correctly, while the rest of the data are false negatives.