##Reading the pre-processed file

##EDA

###Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pearson correlation between the two numeric columns, shown as an annotated heatmap.
correlation_matrix = df[['rating', 'usefulCount']].corr(numeric_only=True)

plt.figure(figsize=(4, 4))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    linewidths=0.5,
)
plt.show()

###Rating Distribution

In [None]:
# Frequency of each rating value across all reviews, drawn as a 10-bin histogram.
ratings = df['rating']

plt.hist(ratings, bins=10, color='skyblue', edgecolor='black')
plt.title('Distribution of Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()

###UsefulCount Distribution

In [None]:
import numpy as np

# Plot the real `usefulCount` values of the reviews. Synthetic draws from
# np.random.exponential would produce a chart that says nothing about the dataset.
data = df['usefulCount'].dropna()

plt.figure(figsize=(10, 6))

# 50 evenly spaced bin edges from 0 up to the largest observed count
bins = np.linspace(0, data.max(), 50)

#plotting the histogram for frequency of usefulcount of reviews
plt.hist(data, bins=bins, color='skyblue', edgecolor="black")

plt.title('Distribution of UsefulCount of Reviews')
plt.xlabel('UsefulCount')
plt.ylabel('Frequency')
plt.grid(axis='y', alpha=0.75)

plt.show()

###Time Series of Reviews

In [None]:
# Parse the review dates so they group and plot in chronological order.
df['date'] = pd.to_datetime(df['date'])

# One entry per distinct date: how many reviews carry that date.
review_ts = df.groupby('date').size()

# Line chart of the per-date review counts.
plt.figure(figsize=(15, 6))
review_ts.plot()
plt.xlabel('Year')
plt.ylabel('Review Count')
plt.title('Time Series of Reviews')
plt.grid(True)
plt.show()

##Most Common Conditions

In [None]:
# Number of reviews per condition, most frequent first.
conditions = df['condition'].value_counts()

# Keep only the ten most frequently reviewed conditions.
common_conditions = conditions.iloc[:10]

print(common_conditions)

###Best drugs for the conditions

In [None]:
# Restrict the data to reviews about the ten most common conditions.
top_conditions = common_conditions.index.to_list()
filtered_df = df[df['condition'].isin(top_conditions)]

# Mean rating of every (condition, drug) pair.
# Note: this is a raw mean with no minimum review count, so a drug with a
# single high rating can rank first.
avgratings = (
    filtered_df
    .groupby(['condition', 'drugName'])['rating']
    .mean()
    .reset_index()
)

# Within each condition, highest mean rating first.
drugs_sorted_best = avgratings.sort_values(
    by=['condition', 'rating'],
    ascending=[True, False],
)

# Print the five best-rated drugs for each top condition.
for condition in top_conditions:
    is_condition = drugs_sorted_best['condition'] == condition
    best_drugs = drugs_sorted_best.loc[is_condition, 'drugName'].head(5).tolist()
    print(f"Best for '{condition}':")
    print(best_drugs)
    print('')

###Worst drugs for the conditions

In [None]:
# Within each condition, lowest mean rating first.
sorted_drugs_worst = avgratings.sort_values(by=['condition', 'rating'], ascending=True)

# Print the five worst-rated drugs for each top condition.
for condition in top_conditions:
    is_condition = sorted_drugs_worst['condition'] == condition
    worst_drugs = sorted_drugs_worst.loc[is_condition, 'drugName'].head(5).tolist()
    print(f"Worst 5 drugs for '{condition}':")
    print(worst_drugs)
    print('')

##Sampling

In [None]:
import numpy as np
# Seed NumPy's global random generator. This has to be a call: assigning
# `np.random.seed = 42` would replace the function with an integer and seed nothing.
np.random.seed(42)

#getting a manageable sample for preliminary analysis
# random_state pins the sample itself; reset_index() keeps the original row
# labels in a new 'index' column and gives the sample a fresh 0..n-1 index.
sample_df = df.sample(n=5000, random_state=42).reset_index()
sample_df.shape

In [None]:
# Display the sampled frame for a quick visual check.
sample_df

##Tokenization

In [None]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, WhitespaceTokenizer
from nltk.tokenize.casual import TweetTokenizer
import nltk
nltk.download('punkt')

In [None]:
# Number of entries in the `review` column of the full `df` (not of `sample_df`).
review_len = len(df['review'])
review_len

###Sentence Tokenization

In [None]:
# Split every sampled review into sentences: one list of sentence strings per
# review, with the same nesting depth as `tokenized_word` and no extra wrapper list.
tokenized = [sent_tokenize(review) for review in sample_df['review']]

###Word Tokenization

In [None]:
# One list of word-level tokens per sampled review (raw `review` text, no lower-casing).
tokenized_word = list(map(word_tokenize, sample_df['review']))

###Punctuation Tokenization

In [None]:
# Tokenize each sampled review with punctuation split into separate tokens.
# WhitespaceTokenizer only splits on whitespace and leaves punctuation attached
# to words, so it does not give the punctuation-aware tokens this section is about.
tokenized_punct = [wordpunct_tokenize(review) for review in sample_df['review']]

##Embeddings

###Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Learn the bag-of-words vocabulary from the pre-processed sample reviews.
# Default settings are used: no custom tokenizer and no stop-word list.
cv = CountVectorizer().fit(sample_df['processed_review'])

print('number of `tokens`', len(cv.vocabulary_))

# Mapping of each vocabulary term to its column index in the document-term matrix.
cv.vocabulary_

In [None]:
# Show the stop-word list in effect for `cv`; it was built without a
# `stop_words` argument, so this presumably prints None — confirm.
print(cv.get_stop_words())

In [None]:
# Sparse document-term matrix: one row per sampled review, one column per vocabulary term.
dtm = cv.transform(sample_df['processed_review'])

# Dense, labelled copy of the counts (columns are the vocabulary terms).
bow = pd.DataFrame(
    data=dtm.toarray(),
    columns=cv.get_feature_names_out(),
)
bow

In [None]:
# Map the first review's count row back to the vocabulary terms it contains.
# iloc[[0]] keeps the row two-dimensional (1 x n_features), as inverse_transform expects.
recognized_tokens_sentence0 = cv.inverse_transform(bow.iloc[[0]].to_numpy())
recognized_tokens_sentence0

###TF-IDF (incomplete)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Un-normalised TF-IDF weights (norm=None) for the pre-processed sample reviews.
tfidf_model = TfidfVectorizer(norm=None)

# Learn the vocabulary/IDF and produce the sparse TF-IDF matrix in one step.
tfidf_matrix = tfidf_model.fit_transform(sample_df['processed_review'])

# Dense, labelled view: one row per review, one column per term.
tfidf_vectors = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_model.get_feature_names_out(),
)
tfidf_vectors

In [None]:
# Terms of the first review, ranked by TF-IDF weight (highest first).
feature_names = tfidf_model.get_feature_names_out()

# (term, weight) pairs for every column that is non-zero in row 0.
doc_keywords = [
    (feature_names[col], tfidf_matrix[0, col])
    for col in tfidf_matrix[0].nonzero()[1]
]

sorted_keywords = sorted(doc_keywords, key=lambda pair: pair[1], reverse=True)

sorted_keywords

###Cosine Similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist

# Pairwise cosine similarity between all bag-of-words rows
# (with a single argument, the rows are compared against themselves).
cos_sim = pd.DataFrame(cosine_similarity(bow))
cos_sim

In [None]:
# A new, unseen review to compare against the sampled reviews.
q = "Acnex has been really good for my acne"

# Encode the query with the vocabulary already learned by `cv`.
q_vector = cv.transform([q])

# Single-row frame: cosine similarity of the query to every row of `bow`,
# with the query text as the row label.
simil = pd.DataFrame(
    cosine_similarity(q_vector, bow),
    index=[q],
)

In [None]:
# Display the query-vs-review similarity row (columns are row positions in `bow`).
simil

In [None]:
# Reorder the columns of the single query row by descending similarity,
# so the best-matching review comes first.
sorted_simil = simil.sort_values(by=q, axis =1, ascending=False)

In [None]:
# Display the ranked similarities; the leftmost column is the closest review.
sorted_simil

Using cosine similarity, we can retrieve the existing reviews that are most similar to a new review a user submits to the reviews database. For example, if a user inputs "Acnex has been really good for my acne", we can see that the review at index 1911 (displayed in the next cell) is similar to it.

In [None]:
# Show the review that ranks first in `sorted_simil` instead of a hard-coded
# row number, so the cell stays correct if the sample or the query changes.
# The column labels of `sorted_simil` are row positions in `bow`, which is
# row-aligned with `sample_df`.
best_match_pos = sorted_simil.columns[0]
sample_df.iloc[best_match_pos]['review']

We can see that the similarity occurs due to the inclusion of the phrase 'really good' in this review.

###Word2Vec

In [None]:
# Shrink `df` to 2,000 random rows. random_state makes the draw repeatable
# across kernel restarts. This overwrites the full frame, and the Word2Vec
# model below is trained on `tokenized_word` (built from `sample_df`), not on this `df`.
df = df.sample(2000, random_state=42)

In [None]:
from gensim.models import Word2Vec
import numpy as np

In [None]:
# Train our own Word2Vec model on the word-tokenized sample reviews
# (not common practice; done here just to see how it works).
# NOTE(review): epochs=5000 is an unusually large number of training passes — confirm it is intended.
model_word2vec = Word2Vec(
    sentences=tokenized_word,
    vector_size=300,
    window=3,
    min_count=1,
    negative=20,
    epochs=5000,
    workers=4,
)

print("All words captured by the model:", model_word2vec.wv.key_to_index)

# One embedding row per vocabulary entry, in key_to_index order.
embeddings = np.array(
    [model_word2vec.wv[token] for token in model_word2vec.wv.key_to_index.keys()]
)

In [None]:
# Print the learned vector for the token "xanax".
# `embeddings` is already built in the training cell, so it is not recomputed here.
print("The embedding of", "xanax", "is", model_word2vec.wv["xanax"])

In [None]:
# Print the learned vector for the token "nexplanon".
# NOTE(review): `tokenized_word` is built from the raw `review` text without
# lower-casing, so this lookup presumably only works if the exact lower-case
# token occurs in the sample — confirm.
print("The embedding of", "nexplanon", "is", model_word2vec.wv["nexplanon"])


In [None]:
# Display the stacked embedding matrix (one row per vocabulary entry).
embeddings

In [None]:
# (number of vocabulary entries, embedding size); the model was built with vector_size=300.
embeddings.shape

#Classification Models

In [None]:
# Target: the `sentiment` column of the sampled reviews.
y = sample_df['sentiment']
# Features: the bag-of-words count matrix (an alias of `bow`, not a copy).
X = bow

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sample for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# (training rows, vocabulary size)
X_train.shape

In [None]:
# Number of training labels; should match the row count of X_train.
y_train.shape

####LogReg without Smote

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

# Fit a logistic-regression baseline on the original (imbalanced) training split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score it on the held-out test split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(
    confusion_matrix(y_test, y_pred, normalize='true'),
    index=model.classes_,
    columns=model.classes_,
)

####LogReg with Smote

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample only the training split with SMOTE (seeded for repeatability).
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# The test split is left untouched; these names are aliases of X_test / y_test.
X_test_smote = X_test
y_test_smote = y_test

In [None]:
from imblearn.over_sampling import SMOTE

# Fit logistic regression on the SMOTE-balanced training split.
model = LogisticRegression(max_iter=1000).fit(X_train_smote, y_train_smote)

# Score it on the untouched test split.
y_pred_smote = model.predict(X_test_smote)
accuracy_smote = accuracy_score(y_test_smote, y_pred_smote)
print(f"Accuracy with SMOTE: {accuracy_smote}")

# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(
    confusion_matrix(y_test_smote, y_pred_smote, normalize='true'),
    index=model.classes_,
    columns=model.classes_,
)

Accuracy with SMOTE: 0.618


Unnamed: 0,negative,neutral,positive
negative,0.474138,0.237069,0.288793
neutral,0.283333,0.25,0.466667
positive,0.123457,0.138889,0.737654


####RandomForestClassifier without Smote

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest on the original (imbalanced) training split.
# random_state pins the bootstrap/feature sampling so the reported accuracy
# and confusion matrix can be reproduced on a re-run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(confusion_matrix(y_test, y_pred, normalize='true'), columns=model.classes_, index=model.classes_ )

Accuracy: 0.664


Unnamed: 0,negative,neutral,positive
negative,0.12069,0.00431,0.875
neutral,0.075,0.016667,0.908333
positive,0.021605,0.0,0.978395


####RandomForestClassifier with Smote

In [None]:
# Train a random forest on the SMOTE-balanced training split.
# random_state pins the bootstrap/feature sampling so the reported accuracy
# and confusion matrix can be reproduced on a re-run.
model = RandomForestClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict on the test data
y_pred_smote = model.predict(X_test_smote)

# Evaluate the model
accuracy_smote = accuracy_score(y_test_smote, y_pred_smote)
print(f"Accuracy with SMOTE: {accuracy_smote}")
# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(confusion_matrix(y_test_smote, y_pred_smote, normalize='true'), columns=model.classes_, index=model.classes_)

####GradientBoosting without Smote

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Train gradient boosting on the original (imbalanced) training split.
# random_state pins the estimator's internal randomness so the reported
# accuracy and confusion matrix can be reproduced on a re-run.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(confusion_matrix(y_test, y_pred, normalize='true'), columns=model.classes_, index=model.classes_ )

Accuracy: 0.673


Unnamed: 0,negative,neutral,positive
negative,0.176724,0.0,0.823276
neutral,0.1,0.0,0.9
positive,0.023148,0.001543,0.975309


####GradientBoosting with Smote

In [None]:
# Train gradient boosting on the SMOTE-balanced training split.
# random_state pins the estimator's internal randomness so the reported
# accuracy and confusion matrix can be reproduced on a re-run.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict on the test data
y_pred_smote = model.predict(X_test_smote)

# Evaluate the model
accuracy_smote = accuracy_score(y_test_smote, y_pred_smote)
print(f"Accuracy with SMOTE: {accuracy_smote}")
# Confusion matrix normalised over the true labels: rows are true classes,
# columns are predicted classes.
pd.DataFrame(confusion_matrix(y_test_smote, y_pred_smote, normalize='true'), columns=model.classes_, index=model.classes_)

Accuracy with SMOTE: 0.605


Unnamed: 0,negative,neutral,positive
negative,0.465517,0.150862,0.383621
neutral,0.216667,0.208333,0.575
positive,0.118827,0.152778,0.728395
