In [1]:

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from scipy.sparse import csr_matrix

# Function to load CSV safely with encoding handling
def load_csv_safely(filepath, encodings=['utf-8', 'latin1', 'ISO-8859-1']):
    for encoding in encodings:
        try:
            print(f"Trying encoding: {encoding}")
            return pd.read_csv(filepath, encoding=encoding, on_bad_lines='skip', low_memory=False)
        except Exception as e:
            print(f"Error loading {filepath} with encoding {encoding}: {e}")
    # If no encoding works, return an empty dataframe
    print(f"Unable to load {filepath} with any of the provided encodings.")
    return pd.DataFrame()

# Function to clean text data (remove unwanted characters, etc.)
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII characters
    return text

# Load the datasets with different encoding options
questions_file = '/content/drive/MyDrive/MCML/Questions.csv'
tags_file = '/content/drive/MyDrive/MCML/Tags.csv'
answers_file = '/content/drive/MyDrive/MCML/Answers.csv'

questions = load_csv_safely(questions_file)
tags = load_csv_safely(tags_file)
answers = load_csv_safely(answers_file)

# Check shapes and sample data to ensure proper loading
print(f"Questions shape: {questions.shape}")
print(f"Tags shape: {tags.shape}")
print(f"Answers shape: {answers.shape}")

# Print column names to inspect the data structure
print(f"Questions columns: {questions.columns}")
print(f"Tags columns: {tags.columns}")
print(f"Answers columns: {answers.columns}")



Trying encoding: utf-8
Error loading /content/drive/MyDrive/MCML/Questions.csv with encoding utf-8: 'utf-8' codec can't decode byte 0xed in position 27: invalid continuation byte
Trying encoding: latin1
Trying encoding: utf-8
Trying encoding: utf-8
Error loading /content/drive/MyDrive/MCML/Answers.csv with encoding utf-8: 'utf-8' codec can't decode byte 0xed in position 1211: invalid continuation byte
Trying encoding: latin1
Questions shape: (1264216, 7)
Tags shape: (3750994, 2)
Answers shape: (2014516, 6)
Questions columns: Index(['Id', 'OwnerUserId', 'CreationDate', 'ClosedDate', 'Score', 'Title',
       'Body'],
      dtype='object')
Tags columns: Index(['Id', 'Tag'], dtype='object')
Answers columns: Index(['Id', 'OwnerUserId', 'CreationDate', 'ParentId', 'Score', 'Body'], dtype='object')


In [None]:
# Ensure the required columns are present in the Questions DataFrame
required_columns = ['Id', 'Title', 'Body']
if all(col in questions.columns for col in required_columns):
    # Merge Questions and Tags
    questions_tags = pd.merge(questions[['Id', 'Title', 'Body']], tags, left_on='Id', right_on='Id', how='inner')

    # Clean the text data
    questions_tags['Cleaned_Title'] = questions_tags['Title'].apply(clean_text)
    questions_tags['Cleaned_Body'] = questions_tags['Body'].apply(clean_text)

    # Sample data to reduce memory usage
    questions_tags_sampled = questions_tags.sample(n=10000, random_state=42)  # Adjust sample size as needed

    # Handle missing values and ensure 'Tag' column is treated as string
    questions_tags_sampled['Tag'] = questions_tags_sampled['Tag'].fillna('').astype(str)

    # Preprocess the target labels (tags)
    mlb = MultiLabelBinarizer()
    tags_labels = mlb.fit_transform(questions_tags_sampled['Tag'].str.split(','))


    # Preprocess the target labels (tags)
    #mlb = MultiLabelBinarizer()
    #tags_labels = mlb.fit_transform(questions_tags_sampled['Tag'].str.split(','))

    # Feature engineering: TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=3000)  # Limit features to reduce memory usage
    X = vectorizer.fit_transform(questions_tags_sampled['Cleaned_Title'] + " " + questions_tags_sampled['Cleaned_Body'])

    # Convert to sparse matrix to optimize memory
    X_sparse = csr_matrix(X)

    # Train-test split
    X_train, X_val, y_train, y_val = train_test_split(X_sparse, tags_labels, test_size=0.1, random_state=42)
    # Check distribution of labels
    tag_counts = y_train.sum(axis=0)
    print("Tag counts in training set:", tag_counts)
    # Identify tags with sufficient samples
    valid_tags = [i for i, count in enumerate(tag_counts) if count > 1]
    # Filter out invalid tags
    y_train = y_train[:, valid_tags]
    y_val = y_val[:, valid_tags]

    # Update the MultiLabelBinarizer classes
    #mlb.classes_ = [mlb.classes_[i] for i in valid_tags]
    # Update the MultiLabelBinarizer classes
    import numpy as np  # Add this at the top if missing

    mlb.classes_ = np.array([mlb.classes_[i] for i in valid_tags])

    # Increase the sample size
    questions_tags_sampled = questions_tags.sample(n=20000, random_state=42)  # Adjust size as needed

    #from sklearn.base import clone

    # Custom training loop with iteration tracking
    #n_classes = y_train.shape[1]
    #base_estimator = LogisticRegression(max_iter=500, random_state=42, solver='saga')
    #models = []

    #for i in range(n_classes):
     # print(f"Training model for tag {i+1}/{n_classes}")
      #clf = clone(base_estimator)  # Clone base model to ensure independence
      #clf.fit(X_train, y_train[:, i])
      #models.append(clf)

    # Combine models into a MultiOutputClassifier
    #model.estimators_ = models

    # Train the model using MultiOutputClassifier
    model = MultiOutputClassifier(LogisticRegression(max_iter=1100, random_state=42, solver='saga', verbose=1))
    model.fit(X_train, y_train)

    # Function to predict tags for a new question
    def predict_tags_for_new_question(question, model, vectorizer, mlb):
        cleaned_question = clean_text(question)
        question_vectorized = vectorizer.transform([cleaned_question])
        predicted_tags_binary = model.predict(question_vectorized)
        predicted_tags = mlb.inverse_transform(predicted_tags_binary)
        return predicted_tags[0]  # Return the predicted tags for the given question

    # Function to fetch answers for the predicted tags
    def fetch_answers_for_tags(predicted_tags, answers_df, tags_column='Tag'):
        filtered_answers = answers_df[answers_df[tags_column].apply(lambda x: any(tag in x.split(',') for tag in predicted_tags))]
        return filtered_answers[['Id', 'Body']]

    # Example usage: Predicting tags and fetching answers for a new question
    new_question = "How do I solve a quadratic equation?"

    # Step 1: Predict tags
    predicted_tags = predict_tags_for_new_question(new_question, model, vectorizer, mlb)
    print("Predicted Tags:", predicted_tags)

    # Step 2: Fetch answers based on predicted tags
    answers_for_tags = fetch_answers_for_tags(predicted_tags, answers)
    print(f"Fetched {len(answers_for_tags)} answers for the predicted tags:")
    print(answers_for_tags.head())

else:
    print("The required columns ('Id', 'Title', 'Body') are missing from the Questions DataFrame.")


Tag counts in training set: [ 6 10 47 ...  0  1  0]
convergence after 527 epochs took 9 seconds
convergence after 450 epochs took 9 seconds
convergence after 340 epochs took 6 seconds
convergence after 634 epochs took 12 seconds
convergence after 731 epochs took 14 seconds
convergence after 699 epochs took 13 seconds
convergence after 736 epochs took 14 seconds
convergence after 653 epochs took 13 seconds
convergence after 700 epochs took 14 seconds
convergence after 572 epochs took 11 seconds
convergence after 674 epochs took 13 seconds
convergence after 596 epochs took 10 seconds
convergence after 271 epochs took 6 seconds
convergence after 680 epochs took 14 seconds
convergence after 365 epochs took 6 seconds
convergence after 569 epochs took 11 seconds
convergence after 607 epochs took 12 seconds
convergence after 626 epochs took 11 seconds
convergence after 572 epochs took 11 seconds
convergence after 496 epochs took 10 seconds
convergence after 136 epochs took 2 seconds
convergen

KeyError: 'Tag'

In [None]:
from joblib import dump
from google.colab import drive



# Define file paths in Google Drive
model_path = '/content/drive/MyDrive/MCML/multioutput_model.joblib'
vectorizer_path = '/content/drive/MyDrive/MCML/tfidf_vectorizer.joblib'
mlb_path = '/content/drive/MyDrive/MCML/multilabel_binarizer.joblib'

# Save the model and preprocessing components
dump(model, model_path)
dump(vectorizer, vectorizer_path)
dump(mlb, mlb_path)

print("Model and components saved successfully:")
print(f"Model saved at: {model_path}")
print(f"TF-IDF Vectorizer saved at: {vectorizer_path}")
print(f"MultiLabel Binarizer saved at: {mlb_path}")
