# Classifying data into 5 categories Using SVM, Naive Bayes and Logistic Regression

In [2]:
!pip install gensim liwc

Collecting liwc
  Downloading liwc-0.5.0-py2.py3-none-any.whl.metadata (2.7 kB)
Downloading liwc-0.5.0-py2.py3-none-any.whl (5.1 kB)
Installing collected packages: liwc
Successfully installed liwc-0.5.0


In [6]:
!pip install -U liwc



# Training the Models

In [18]:
# Classification

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from nltk.stem import WordNetLemmatizer
import numpy as np
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from gensim.models import FastText
import liwc
import re
from collections import Counter
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.impute import SimpleImputer
import pickle # Import the pickle library
nltk.download('vader_lexicon')
nltk.download('wordnet')
# Load the dataset
data = pd.read_csv("2024_Drug_Data.csv")


# Feature Extraction
X = data['body']
y = data['label_classification']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to clean and tokenize text
def clean_and_tokenize(text):
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)  # Remove tweet handles
    text = re.sub(r'[^\w\s]', '', text)  # Remove symbols
    text = re.sub(r'http\S+', '', text)  # Remove hyperlinks
    text = text.lower()  # Convert to lowercase
    return re.findall(r'\w+', text)  # Tokenize

# Initialize Sentiment Intensity Analyzer
sid = SentimentIntensityAnalyzer()

# Extract sentiment polarity features for each post
X_sentiment_train = []
X_sentiment_test = []
for post in X_train:
    sentiment_scores = sid.polarity_scores(post)
    X_sentiment_train.append([sentiment_scores['neg'], sentiment_scores['pos']])
for post in X_test:
    sentiment_scores = sid.polarity_scores(post)
    X_sentiment_test.append([sentiment_scores['neg'], sentiment_scores['pos']])

# Convert sentiment features to DataFrame
X_sentiment_train = pd.DataFrame(X_sentiment_train, columns=['neg_score', 'pos_score'])
X_sentiment_test = pd.DataFrame(X_sentiment_test, columns=['neg_score', 'pos_score'])

# Initialize LIWC analyzer
# Assuming LIWC functionality is encapsulated in a function called load_token_parser
liwc_analyzer, category_names = liwc.load_token_parser('LIWC2007_English100131.dic')

# Extract LIWC-based features for training data
X_liwc_train = []
X_liwc_test = []

##1.8. Lemmatizing Each token
lemmatizer = WordNetLemmatizer()
def lemmatize(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]


for post in X_train:
    words = clean_and_tokenize(post)
    # Lemmatize the post into words
    words = lemmatize(words)
    liwc_counts = Counter()
    for word in words:
        for category in liwc_analyzer(word):
            if category in category_names:
                liwc_counts[category] += 1
    X_liwc_train.append(list(liwc_counts.values()))
for post in X_test:
    words = clean_and_tokenize(post)
    # Lemmatize the post into words
    words = lemmatize(words)
    liwc_counts = Counter()
    for word in words:
        for category in liwc_analyzer(word):
            if category in category_names:
                liwc_counts[category] += 1
    X_liwc_test.append(list(liwc_counts.values()))

# Convert LIWC features to DataFrame
X_liwc_train = pd.DataFrame(X_liwc_train, columns=category_names)
X_liwc_test = pd.DataFrame(X_liwc_test, columns=category_names)

# Unigram and bigram features
tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))
X_tfidf_train = tfidf_vectorizer.fit_transform(X_train)
X_tfidf_test = tfidf_vectorizer.transform(X_test)

# Word-based Features
word_glossaries = ['health', 'low self-esteem stress', 'recovery', 'addiction']
vectorizer = CountVectorizer(vocabulary=word_glossaries)
X_word_train = vectorizer.fit_transform(X_train)
X_word_test = vectorizer.transform(X_test)
X_word_dense_train = X_word_train.toarray()
X_word_dense_test = X_word_test.toarray()

# Concatenate all features
X_features_train = np.concatenate((X_liwc_train, X_sentiment_train, X_tfidf_train.toarray(), X_word_dense_train), axis=1)
X_features_test = np.concatenate((X_liwc_test, X_sentiment_test, X_tfidf_test.toarray(), X_word_dense_test), axis=1)

# Impute missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed_train = imputer.fit_transform(X_features_train)
X_imputed_test = imputer.transform(X_features_test)

# Feature Scaling
scaler = MinMaxScaler()
X_scaled_train = scaler.fit_transform(X_imputed_train)
X_scaled_test = scaler.transform(X_imputed_test)

# Data Balancing using SMOTE
smote = SMOTE()
X_resampled_train, y_resampled_train = smote.fit_resample(X_scaled_train, y_train)

# Add this import at the top
import json

# --- SAVE PREPROCESSING PARAMETERS AS JSON (instead of pickle) ---
preprocessing_params = {
    'tfidf_max_features': int(2000),  # Ensure it's Python int
    'tfidf_ngram_range': [1, 2],
    'tfidf_vocabulary': {str(k): int(v) for k, v in tfidf_vectorizer.vocabulary_.items()},  # Convert keys to str, values to int
    'tfidf_idf_': [float(x) for x in tfidf_vectorizer.idf_],  # Convert to Python floats

    'word_glossaries': word_glossaries,
    'word_vocabulary': {str(k): int(v) for k, v in vectorizer.vocabulary_.items()},

    'imputer_strategy': 'mean',
    'imputer_statistics_': [float(x) for x in imputer.statistics_],
    'imputer_n_features_in_': int(imputer.n_features_in_),  # Add this

    'scaler_min_': [float(x) for x in scaler.min_],
    'scaler_scale_': [float(x) for x in scaler.scale_],
    'scaler_data_min_': [float(x) for x in scaler.data_min_],
    'scaler_data_max_': [float(x) for x in scaler.data_max_],
    'scaler_data_range_': [float(x) for x in scaler.data_range_],
    'scaler_n_features_in_': int(scaler.n_features_in_),  # Add this

    'category_names': list(category_names),  # Ensure it's a Python list
    'feature_names_order': ['liwc', 'sentiment', 'tfidf', 'word']
}

# Save as JSON (much smaller than pickle)
with open('preprocessing_params.json', 'w') as file:
    json.dump(preprocessing_params, file, indent=2)

print("Preprocessing parameters saved as 'preprocessing_params.json'")

# --- CLASSIFIER MODELS ---
classifiers = {
    "SVM": SVC(kernel='linear'),
    "Gaussian Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=3000),
}

for name, classifier in classifiers.items():
    classifier.fit(X_resampled_train, y_resampled_train)
    y_pred = classifier.predict(X_scaled_test)
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print("----------------------------------------------")

    # Save the trained model
    model_filename = f'trained_{name.lower().replace(" ", "_")}_model.pkl'
    with open(model_filename, 'wb') as file:
        pickle.dump(classifier, file)
    print(f"Model saved as '{model_filename}'")

    # Save model classes for the Logistic Regression model
    if name == "Logistic Regression":
        model_classes = {
            'classes': classifier.classes_.tolist()
        }
        with open('model_classes.json', 'w') as file:
            json.dump(model_classes, file, indent=2)
        print(f"Model classes saved: {classifier.classes_}")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessing parameters saved as 'preprocessing_params.json'
Classification Report for SVM:
              precision    recall  f1-score   support

  A-Recovery       0.27      0.32      0.30        37
    Addicted       0.81      0.83      0.82       266
  E-Recovery       0.55      0.54      0.54       138
  M-Recovery       0.56      0.53      0.54       127
      Others       0.69      0.65      0.67        63

    accuracy                           0.66       631
   macro avg       0.58      0.57      0.58       631
weighted avg       0.66      0.66      0.66       631

Confusion Matrix for SVM:
[[ 12   5   9  11   0]
 [ 12 222  13   7  12]
 [  8  22  74  30   4]
 [  9  15  34  67   2]
 [  3  10   5   4  41]]
----------------------------------------------
Model saved as 'trained_svm_model.pkl'
Classification Report for Gaussian Naive Bayes:
              precision    recall  f1-score   support

  A-Recovery       0.24      0.27      0.26        37
    Addicted       0.81      0.83



Classification Report for Logistic Regression:
              precision    recall  f1-score   support

  A-Recovery       0.34      0.32      0.33        37
    Addicted       0.85      0.84      0.84       266
  E-Recovery       0.58      0.57      0.58       138
  M-Recovery       0.60      0.58      0.59       127
      Others       0.67      0.76      0.71        63

    accuracy                           0.69       631
   macro avg       0.61      0.62      0.61       631
weighted avg       0.69      0.69      0.69       631

Confusion Matrix for Logistic Regression:
[[ 12   5  11   9   0]
 [  5 224  12   8  17]
 [  7  19  79  29   4]
 [ 11  10  29  74   3]
 [  0   7   5   3  48]]
----------------------------------------------
Model saved as 'trained_logistic_regression_model.pkl'
Model classes saved: ['A-Recovery' 'Addicted' 'E-Recovery' 'M-Recovery' 'Others']


# Finding Unique Drugs and Related Mentor Data


In [9]:
# Finding Drugs from Each Post

import pandas as pd

# Read drug names from the TSV file
file_path = 'drug_names.tsv'
data = pd.read_csv(file_path, sep='\t')
drug_names_sider = ["carnitine"] + data.iloc[:, 1].tolist()

# Read drug names from the CSV file
file_path_csv = 'drugsComTest_raw.csv'
df = pd.read_csv(file_path_csv)

# Extract drug names column
drug_names_csv = df['drugName'].str.lower().tolist()

# Combine the drug names from both sources
combined_drug_names = drug_names_sider + drug_names_csv

# Remove duplicates and convert to lowercase
unique_drug_names = list(set(map(str.lower, combined_drug_names)))

# Write the unique drug names to a new CSV file
unique_drugs_df = pd.DataFrame(unique_drug_names, columns=['DrugName'])
unique_drugs_df.to_csv('unique_drug_names.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'drug_names.tsv'

Checking the Drug Data

In [10]:
# Load the data from the uploaded file to see its structure and contents.
import pandas as pd

# Load the data
drug = pd.read_csv("unique_drug_names.csv")
drug.head()

Unnamed: 0,DrugName
0,remifentanil
1,ortho-novum 1 / 35
2,cosyntropin
3,femara
4,poly iron


In [11]:
drug["DrugName"].str.len()

Unnamed: 0,DrugName
0,12
1,18
2,11
3,6
4,9
...,...
3413,10
3414,5
3415,13
3416,10


In [12]:
import pandas as pd
import re

def find_mentioned_drugs(post_text, drug_list):
    mentioned_drugs = []
    post_text = post_text.lower()
    for drug in drug_list:
        # Using regular expression to find drug names in post text
        if re.search(r'\b{}\b'.format(re.escape(drug.lower())), post_text):
            mentioned_drugs.append(drug)
    return mentioned_drugs

# Load Reddit posts CSV
reddit_df = pd.read_csv("2024_Drug_Data.csv")



# Add a new column to store mentioned drugs for each post
reddit_df['Mentioned Drugs'] = reddit_df['body'].apply(lambda x: find_mentioned_drugs(x, drug['DrugName']))

# Create a new CSV file with post information and mentioned drugs
new_csv_file = 'reddit_posts_with_drugs.csv'
reddit_df.to_csv(new_csv_file, index=False)

print("New CSV file created with mentioned drugs:", new_csv_file)

KeyboardInterrupt: 

### Preprocessing Mentor and Drug Data


In [19]:
# preprocess_mentor_data.py

import pandas as pd
import numpy as np
import re
from collections import defaultdict, Counter
import liwc
import nltk
import pickle
import ast

# Download NLTK resources (if not already downloaded)
nltk.download('vader_lexicon')
nltk.download('wordnet')

# Load the data files
print("Loading data files...")
data = pd.read_csv("2024_Drug_Data.csv")
drug_names_df = pd.read_csv("unique_drug_names.csv")
drug_list = drug_names_df['DrugName'].tolist()

# Initialize LIWC analyzer (static tool)
try:
    liwc_analyzer, category_names = liwc.load_token_parser('LIWC2007_English100131.dic')
except FileNotFoundError:
    raise FileNotFoundError("LIWC dictionary file missing.")
lemmatizer = nltk.WordNetLemmatizer()

def find_mentioned_drugs(post_text, drugs):
    mentioned = []
    if not isinstance(post_text, str):
        return mentioned
    post_text = post_text.lower()
    for drug in drugs:
        if re.search(r'\b{}\b'.format(re.escape(drug.lower())), post_text):
            mentioned.append(drug)
    return mentioned

# --- Perform the slow processing here ---
print("Processing data to find mentioned drugs...")
data['Mentioned Drugs'] = data['body'].apply(lambda x: find_mentioned_drugs(x, drug_list))
print("Finding mentioned drugs complete.")

# Identify potential mentors from users in recovery stages
recovery_users = data[data['label_classification'].isin(['E-Recovery', 'M-Recovery', 'A-Recovery'])]

# Create a dictionary to hold the recovery information for users
recovery_dict = defaultdict(list)
for _, row in recovery_users.iterrows():
    drugs_recovered_from = row['Mentioned Drugs']
    if drugs_recovered_from:
        recovery_dict[row['username']].extend(drugs_recovered_from)

# Save the preprocessed data to a file
with open('mentor_data.pkl', 'wb') as f:
    pickle.dump(recovery_dict, f)

print("Mentor data successfully preprocessed and saved to 'mentor_data.pkl'")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Loading data files...
Processing data to find mentioned drugs...
Finding mentioned drugs complete.
Mentor data successfully preprocessed and saved to 'mentor_data.pkl'


# Drug mentor Determination

Recommending the users who wish to revive from their addiction names of users who have recovered from the same drugs and are under Maintaining recovery stage(M-recovery) or Advanced recovery stage(A-recovery)

In [None]:
from collections import defaultdict
import ast

# Step 1: Identify users who intend to recover
# Filter out rows where users show intention to recover
intention_to_recover = data[data['label_recommendation'] == 'Addicted with intention to recover']

# Step 2: Extract the list of drugs for those who wish to recover
# Convert the 'Mentioned Drugs' from string to actual list
intention_to_recover['Mentioned Drugs'] = intention_to_recover['Mentioned Drugs'].apply(ast.literal_eval)
intention_to_recover = intention_to_recover[intention_to_recover['Mentioned Drugs'].map(bool)]
# Step 3: Identify potential mentors from users who have recovered
# Filter out users who are in recovery classes
recovery_users = data[data['label_recommendation'].isin(['E-recovery', 'M-Recovery', 'A-Recovery'])]

# Create a dictionary to hold the recovery information for users
recovery_dict = defaultdict(list)
for _, row in recovery_users.iterrows():
    drugs_recovered_from = ast.literal_eval(row['Mentioned Drugs'])
    if drugs_recovered_from:  # We only consider users who have mentioned specific drugs
        recovery_dict[row['username']].extend(drugs_recovered_from)

# Step 4: Create a function to match and rank mentors for a user wishing to recover
def find_mentors_for_user(user_row):
    user_drugs = set(user_row['Mentioned Drugs'])
    if not user_drugs:
        return []

    # Calculate the number of drugs in common with potential mentors
    common_drugs_count = {
        mentor: len(user_drugs.intersection(set(mentor_drugs)))
        for mentor, mentor_drugs in recovery_dict.items()
    }

    # Filter out mentors with zero drugs in common
    potential_mentors = {mentor: count for mentor, count in common_drugs_count.items() if count > 0}

    # Rank mentors based on the number of drugs in common
    ranked_mentors = sorted(potential_mentors.items(), key=lambda item: item[1], reverse=True)

    # Return the sorted list of potential mentors
    return [mentor for mentor, _ in ranked_mentors]

# Apply the function to each user wishing to recover to find their potential mentors
intention_to_recover['Potential Mentors'] = intention_to_recover.apply(find_mentors_for_user, axis=1)

# Display the first few entries to verify
intention_to_recover['Top 3 Mentors'] = intention_to_recover['Potential Mentors'].apply(
    lambda x: x[:3] if len(x) >= 3 else x
)

# Display the dataframe with the top 3 potential mentors
intention_to_recover[['username', 'Mentioned Drugs', 'Top 3 Mentors']].head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  intention_to_recover['Mentioned Drugs'] = intention_to_recover['Mentioned Drugs'].apply(ast.literal_eval)


Unnamed: 0,username,Mentioned Drugs,Top 3 Mentors
2,NAME_2,"[cocaine, amphetamine]","[NAME_768, NAME_831, NAME_918]"
3,NAME_3,"[percocet, valium]","[NAME_768, NAME_916, NAME_1084]"
7,NAME_7,[adderall],"[NAME_976, NAME_1282, NAME_1312]"
10,NAME_10,[ketamine],[NAME_1629]
11,NAME_11,[nitrous],[]
