# checking data

In [1]:
import os
import re
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
# Labelled training data; the absolute path presumably points at the Colab working directory.
train_csv_path = "/content/train.csv"
df = pd.read_csv(train_csv_path)

## basic exploration

In [3]:
# Peek at the first three rows to see the columns and typical values.
df.iloc[:3]

Unnamed: 0,unique_hash,text,drug,sentiment
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,Autoimmune diseases tend to come in clusters. ...,gilenya,2
1,9eba8f80e7e20f3a2f48685530748fbfa95943e4,I can completely understand why you’d want to ...,gilenya,2
2,fe809672251f6bd0d986e00380f48d047c7e7b76,Interesting that it only targets S1P-1/5 recep...,fingolimod,2


In [6]:
# (rows, columns) of the loaded frame.
n_rows, n_cols = df.shape
(n_rows, n_cols)

(5279, 4)

In [10]:
# Class balance of the target column, counting missing labels as their own bucket.
# The classes are imbalanced and there are more than two of them (multiclass).
sentiment_counts = df['sentiment'].value_counts(dropna=False)
sentiment_counts

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
2,3825
1,837
0,617


In [12]:
# Look at five random posts. The seed is fixed so that the same rows are shown
# on every Restart & Run All instead of a different sample each time.
df['text'].sample(5, random_state=42)

Unnamed: 0,text
3163,Re 12.37 Greed is good Thanks for fascinating ...
1413,"Tysabri is quite safe if you remain JC- , when..."
1986,"Diagnosed in 2012 Stage IV NSCLC, malignant pl..."
2801,Ok. Thanks - that all makes me feel a little b...
4181,If barts hypothesis is correct that caldribine...


In [14]:
# Lift the column-width cap so the first post is displayed untruncated.
# This changes a global pandas display option, so it also affects later cells.
pd.options.display.max_colwidth = None
df['text'].head(1)

Unnamed: 0,text
0,"Autoimmune diseases tend to come in clusters. As for Gilenya – if you feel good, don’t think about it, it won’t change anything but waste your time and energy. I’m taking Tysabri and feel amazing, no symptoms (other than dodgy color vision, but I’ve had it since always, so, don’t know) and I don’t know if it will last a month, a year, a decade, ive just decided to enjoy the ride, no point in worrying."


In [8]:
# Five most frequently mentioned drugs (missing values counted as their own bucket).
drug_counts = df['drug'].value_counts(dropna=False)
drug_counts.iloc[:5]

Unnamed: 0_level_0,count
drug,Unnamed: 1_level_1
ocrevus,676
gilenya,666
ocrelizumab,441
entyvio,303
humira,270


In [11]:
# Column dtypes as inferred by pandas when the CSV was read.
column_dtypes = df.dtypes
column_dtypes

Unnamed: 0,0
unique_hash,object
text,object
drug,object
sentiment,int64


# data cleaning - target column (if it requires cleaning)

In [15]:
 # Normalise every label to a trimmed, lower-case string so ints, floats and text can share one lookup table
sentiment_as_text = df['sentiment'].astype(str)
df['sentiment'] = sentiment_as_text.str.strip().str.lower()

In [16]:
# Canonical label -> every raw spelling accepted for it.
# NOTE(review): numeric codes are read as 0=positive, 1=negative, 2=neutral —
# confirm against the dataset description.
_SENTIMENT_ALIASES = {
    'positive': (
        # Numeric representations
        '0', '0.0',
        # Text representations
        'positive', 'pos', 'good', 'great', 'excellent',
        'amazing', 'love', 'best', 'wonderful', 'fantastic',
    ),
    'negative': (
        # Numeric representations
        '1', '1.0',
        # Text representations
        'negative', 'neg', 'bad', 'poor', 'awful',
        'terrible', 'hate', 'worst', 'horrible',
    ),
    'neutral': (
        # Numeric representations
        '2', '2.0',
        # Text representations
        'neutral', 'neut', 'ok', 'okay', 'average', 'fair', 'mixed',
        # Blanks / nulls / placeholder strings default to neutral
        'nan', 'none', '', ' ',
    ),
}

# Flatten the groups into the raw-label -> canonical-label lookup used with Series.map
sentiment_mapping = {
    raw_label: canonical
    for canonical, raw_labels in _SENTIMENT_ALIASES.items()
    for raw_label in raw_labels
}


In [17]:
# Translate each normalised label to its canonical class name.
# Labels that are not keys of sentiment_mapping come back as NaN.
canonical_sentiment = df['sentiment'].map(sentiment_mapping)
df['sentiment'] = canonical_sentiment

In [18]:
# Labels that were not found in the mapping are NaN at this point; fall back to 'neutral'.
# NOTE(review): 'neutral' is already the majority class, so this fallback adds label
# noise there — confirm that dropping such rows is not preferable.
unmapped_mask = df['sentiment'].isna()
if unmapped_mask.any():
    print(f"Warning: Found {unmapped_mask.sum()} unmapped sentiment values. Setting to 'neutral'.")
    # The raw label text was overwritten by the mapping step, so printing the column
    # here could only ever show NaN. Report which rows are affected instead.
    print("Sample unmapped row indices:", df.index[unmapped_mask.to_numpy()][:5].tolist())
    df.loc[unmapped_mask, 'sentiment'] = 'neutral'

In [19]:
# Per-class counts now that every label is one of the canonical names.
sentiment_counts = df['sentiment'].value_counts()
print("Sentiment distribution after cleaning:", sentiment_counts, sep="\n")

Sentiment distribution after cleaning:
sentiment
neutral     3825
negative     837
positive     617
Name: count, dtype: int64


## text column

In [29]:
import nltk

# NLTK data packages fetched for the text-cleaning step.
# 'punkt_tab' is a new dependency in recent NLTK releases.
nltk_packages = ('punkt', 'punkt_tab', 'stopwords', 'wordnet')
download_results = [nltk.download(package) for package in nltk_packages]
# Keep the result of the final download as the cell's displayed value.
download_results[-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# install dependencies

In [35]:
! pip install -r requirements.txt

Collecting mlflow>=1.20.0 (from -r requirements.txt (line 5))
  Downloading mlflow-3.3.2-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==3.3.2 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading mlflow_skinny-3.3.2-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-tracing==3.3.2 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading mlflow_tracing-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading alembic-1.16.5-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=1.20.0->-r requirements.txt (line 5))
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
C

# checking data preprocessing script

In [48]:
# Smoke-test the project's preprocessing module on the training file.
import data_module
from data_module import DataModule
dm = DataModule()
# Reload the CSV through the module. This rebinds `df`, replacing the frame that
# the cells above explored and cleaned.
# NOTE(review): the path is relative here but absolute ("/content/train.csv") in the
# first load cell — confirm both resolve to the same file.
df = dm.load_csv("train.csv")
# NOTE(review): presumably this is the step that adds the clean_text / combined_text
# columns seen in the next cell's output — confirm in data_module.
df = dm.prepare_dataframe(df)

In [51]:
# Inspect the first prepared row, including the columns added by preprocessing.
df.head(1)

Unnamed: 0,unique_hash,text,drug,sentiment,clean_text,combined_text
0,2e180be4c9214c1f5ab51fd8cc32bc80c9f612e0,"Autoimmune diseases tend to come in clusters. As for Gilenya – if you feel good, don’t think about it, it won’t change anything but waste your time and energy. I’m taking Tysabri and feel amazing, no symptoms (other than dodgy color vision, but I’ve had it since always, so, don’t know) and I don’t know if it will last a month, a year, a decade, ive just decided to enjoy the ride, no point in worrying.",gilenya,2,autoimmune disease tend come cluster gilenya feel good think change anything waste time energy taking tysabri feel amazing symptom dodgy color vision since always know know last month year decade ive decided enjoy ride point worrying,autoimmune disease tend come cluster gilenya feel good think change anything waste time energy taking tysabri feel amazing symptom dodgy color vision since always know know last month year decade ive decided enjoy ride point worrying gilenya
