In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
import re
import pickle

In [None]:
# Load the two raw CSV files from the current working directory.
df1 = pd.read_csv('drug1.csv')
df2 = pd.read_csv('drug4.csv')

# Display schema information and summary statistics for each frame.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None".
df1.info()
print(df1.describe())
df2.info()
print(df2.describe())


In [None]:
# Replace every missing value in both frames with an empty string.
# NOTE(review): the fill is applied to all columns, so any numeric column
# that contains a NaN will end up holding '' as well — confirm the numeric
# columns used later have no missing values.
df1 = df1.fillna('')
df2 = df2.fillna('')


In [None]:
# Rescale the three review-percentage columns of df2 with StandardScaler.
# The fitted scaler is kept in `scaler` so the same transform can be reused.
review_cols = [
    'Excellent Review %',
    'Average Review %',
    'Poor Review %',
]
scaler = StandardScaler()
scaler.fit(df2[review_cols])
df2[review_cols] = scaler.transform(df2[review_cols])


In [None]:
# Integer-encode the 'Habit Forming' column of df1.
habit_encoder = LabelEncoder()
df1['Habit Forming'] = habit_encoder.fit_transform(df1['Habit Forming'])

# One-hot encode the remaining categorical columns; drop_first=True drops
# the first level of each so the indicator columns are not redundant.
df1_dummy_cols = ['Therapeutic Class', 'Action Class']
df2_dummy_cols = ['Manufacturer']
df1 = pd.get_dummies(df1, drop_first=True, columns=df1_dummy_cols)
df2 = pd.get_dummies(df2, drop_first=True, columns=df2_dummy_cols)


In [None]:
# Lower-case the join keys so the match between the two frames is
# case-insensitive.
df1['name'] = df1['name'].str.lower()
df2['Medicine Name'] = df2['Medicine Name'].str.lower()

# Inner-join df1 and df2 on the medicine name, then discard the key and
# identifier columns that are no longer needed.
merged_df = df1.merge(
    df2,
    how='inner',
    left_on='name',
    right_on='Medicine Name',
)
merged_df = merged_df.drop(columns=['id', 'name', 'Medicine Name', 'Image URL'])


In [None]:
# Count the number of side effects per row.
# Every column whose name contains 'sideEffect' is treated as one slot; a
# slot counts when its value is truthy (the earlier fillna('') makes empty
# slots ''). The whole sub-frame is cast to bool and summed across columns
# in one vectorised step instead of a Python-level apply over each row; this
# also yields a clean all-zero column when there are no rows or no matching
# columns.
side_effect_cols = [col for col in merged_df.columns if 'sideEffect' in col]
merged_df['num_side_effects'] = merged_df[side_effect_cols].astype(bool).sum(axis=1)

# Drop the individual side-effect columns now that they are summarised.
merged_df.drop(columns=side_effect_cols, inplace=True)


In [None]:
# Overlay the distributions of the three review-percentage columns on a
# single axes. A colour of None lets seaborn pick its default for the first
# series.
review_fig, review_ax = plt.subplots(figsize=(12, 6))
histogram_specs = [
    ('Excellent Review %', None, 'Excellent'),
    ('Average Review %', 'orange', 'Average'),
    ('Poor Review %', 'red', 'Poor'),
]
for review_column, bar_color, series_label in histogram_specs:
    sns.histplot(
        data=merged_df,
        x=review_column,
        kde=True,
        color=bar_color,
        label=series_label,
        ax=review_ax,
    )
review_ax.legend()
review_ax.set_title('Distribution of Review Percentages')
plt.show()

# Bar counts of the one-hot 'Therapeutic Class_ANTI INFECTIVES' indicator.
class_fig, class_ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=merged_df, y='Therapeutic Class_ANTI INFECTIVES', ax=class_ax)
class_ax.set_title('Count of Anti Infective Therapeutic Class')
plt.show()


In [None]:
# Binarise 'Excellent Review %': 1 where the value exceeds the cutoff, else 0.
# NOTE(review): this column is passed through StandardScaler in an earlier
# cell, so the 0.5 cutoff is compared against standardized values rather
# than raw percentages — confirm that is intended.
threshold = 0.5
merged_df['binary_review'] = merged_df['Excellent Review %'].gt(threshold).astype(int)

# Turn the 'substitute0' text into padded integer sequences for the LSTM.
# The tokenizer keeps at most 5000 words and splits on single spaces.
substitute_texts = merged_df['substitute0'].values
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(substitute_texts)
X = pad_sequences(tokenizer.texts_to_sequences(substitute_texts))

# Target vector: the binary label created above.
Y = merged_df['binary_review']

# Persist the fitted tokenizer so the same vocabulary can be reloaded later.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

# Persist the cleaned, merged frame (without the index column).
merged_df.to_csv('cleaned_data.csv', index=False)

In [None]:
# Hold out a fraction of the samples for testing; the fixed seed makes the
# partition repeatable across runs.
holdout_fraction = 0.2
split_seed = 42
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=holdout_fraction,
    random_state=split_seed,
)


In [None]:
# Build the classifier as one layer stack:
# embedding -> spatial dropout -> LSTM -> single sigmoid unit.
vocab_size = 5000
embedding_dim = 256
sequence_length = X.shape[1]

model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=sequence_length),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit the model; the held-out split is passed as validation data so its
# loss and accuracy are reported after every epoch.
model.fit(
    X_train,
    Y_train,
    batch_size=64,
    epochs=50,
    validation_data=(X_test, Y_test),
)


In [None]:
# Score the held-out samples and turn the sigmoid outputs into hard 0/1
# labels using a 0.5 cutoff.
y_pred_prob = model.predict(X_test)
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Compute the classification metrics against the true labels.
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)

# Report each metric on its own line.
metric_report = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1 Score', f1),
)
for metric_name, metric_value in metric_report:
    print(f'{metric_name}: {metric_value}')


In [None]:
import shap

# Explain predictions.
# DeepExplainer integrates over the background samples for every explained
# row, so handing it the entire training set makes the computation very
# slow. The SHAP documentation suggests on the order of 100 background
# samples, so a fixed-seed random subset of the training data is used.
background_size = min(100, len(X_train))
background_idx = np.random.default_rng(42).choice(
    len(X_train), size=background_size, replace=False
)
explainer = shap.DeepExplainer(model, X_train[background_idx])
shap_values = explainer.shap_values(X_test)

# The model has a single output. Depending on the installed SHAP version the
# attributions may come back as a one-element list or with a trailing axis of
# size 1; reduce either layout to a 2-D (samples, features) array so that
# summary_plot draws one per-feature summary.
# NOTE(review): the exact return layout is version-dependent — confirm
# against the SHAP release in use.
if isinstance(shap_values, list) and len(shap_values) == 1:
    shap_values = shap_values[0]
shap_values = np.asarray(shap_values)
if shap_values.ndim == 3 and shap_values.shape[-1] == 1:
    shap_values = shap_values[..., 0]

# Summary plot
shap.summary_plot(shap_values, X_test)


In [None]:
# Write the trained model to disk at the path below.
model_path = 'medicine_review_model.keras'
model.save(model_path)