In [51]:
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from tqdm import tqdm

# Ensure you have nltk resources downloaded (if not, run these lines once)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
tqdm.pandas()

# Load the bug-report dataset. Adjust DATA_PATH to wherever the CSV lives in
# your environment.
DATA_PATH = '/home/mdafifal.mamun/notebooks/triagerX/data/typescript/ts_bug_data.csv'
df = pd.read_csv(DATA_PATH)

# Combine issue_title and issue_body into a single feature.
# Missing titles/bodies are replaced by empty strings first: string
# concatenation with NaN yields NaN, which would throw away the title of every
# issue that has an empty body (and later turn the text into the token "nan").
df['text'] = df['issue_title'].fillna('') + ' ' + df['issue_body'].fillna('')

# Keep only issues that have an assigned owner (the prediction target).
df = df[df["owner"].notna()]
# `data` is kept for backward compatibility; it is rebuilt from the train/test
# splits in the next cell before it is used.
data = df.copy()

num_issues = len(df)
print(f"Total number of issues after processing: {num_issues}")

# The data is cut into `num_cv` equally sized consecutive blocks; blocks
# 0..block-1 form the training set and block `block` is the validation set.
# NOTE(review): presumably the CSV rows are in chronological order, which would
# make this a time-based split -- confirm against the data export.
num_cv = 10
block = 9

samples_per_block = num_issues // num_cv
train_end = samples_per_block * block
test_end = samples_per_block * (block + 1)
sliced_df = df.iloc[:test_end]

print(f"Samples per block: {samples_per_block}, Selected block: {block}")

# Train and Validation preparation

df_train = sliced_df.iloc[:train_end]
df_test = sliced_df.iloc[train_end:test_end]

# Drop developers with fewer than `sample_threshold` training issues.
sample_threshold = 20
developers = df_train["owner"].value_counts()
filtered_developers = developers.index[developers >= sample_threshold]
df_train = df_train[df_train["owner"].isin(filtered_developers)]

train_owners = set(df_train["owner"])
test_owners = set(df_test["owner"])

# Owners that appear only in the validation block cannot be predicted by a
# model trained on df_train, so their issues are removed from the validation set.
unwanted = list(test_owners - train_owners)
df_test = df_test[df_test["owner"].isin(train_owners)]

print(f"Training data: {len(df_train)}, Validation data: {len(df_test)}")
print(f"Number of developers in train: {len(df_train.owner.unique())}")
print(f"Number of developers in test: {len(df_test.owner.unique())}")

print(f"Train dataset size: {len(df_train)}")
print(f"Test dataset size: {len(df_test)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mdafifal.mamun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mdafifal.mamun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Total number of issues after processing: 14877
Samples per block: 1487, Selected block: 9
Training data: 9925, Validation data: 1118
Number of developers in train: 40
Number of developers in test: 22
Train dataset size: 9925
Test dataset size: 1118


In [52]:
# Stack the training and validation splits into one frame with a fresh
# 0..n-1 index; column order is left as-is (sort=False).
data = pd.concat((df_train, df_test), sort=False, ignore_index=True)

In [53]:
# Preprocessing function

# Built once instead of on every call: the punctuation-stripping table and the
# digit-matching pattern never change between calls.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_PATTERN = re.compile(r'\d+')
# Lazily filled cache for the NLTK stop-word set and lemmatizer, so the corpus
# is read once rather than for every document.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _NLP_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _NLP_RESOURCES['stop_words'], _NLP_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw issue text into a space-separated string of lemmas.

    Steps, in order: lowercase, strip ASCII punctuation, strip digits, split on
    whitespace, drop English stop words, lemmatize each remaining token.

    Args:
        text: The raw text. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        The cleaned text as a single string (``''`` for missing input).
    """
    # NaN is the only float that is not equal to itself; without this guard a
    # missing value would be stringified into the bogus token "nan"/"none".
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Lowercase the text, then remove punctuation and numbers
    text = str(text).lower()
    text = text.translate(_PUNCTUATION_TABLE)
    text = _DIGITS_PATTERN.sub('', text)

    # Tokenize on whitespace, drop stop words, lemmatize, and re-join
    stop_words, lemmatizer = _get_nlp_resources()
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Apply preprocessing to the text data (adds a 'cleaned_text' column to `data`)
data['cleaned_text'] = data['text'].progress_apply(preprocess_text)

# Define the input features (X) and the target labels (y)
X, y = data['cleaned_text'], data['owner']

# Split the data into training and testing sets (90% train, 10% test;
# test_size=0.1), with a fixed seed so the split is reproducible.
# NOTE(review): this is a random split over the concatenated frame, so the
# block-based df_train/df_test split prepared earlier is not preserved here --
# confirm that this is intended for the baseline.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary/IDF weights on the training texts only, then reuse them
# to vectorise the test texts.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


100%|██████████| 11043/11043 [00:09<00:00, 1220.10it/s]


# SVM

In [54]:
from sklearn.svm import SVC
import numpy as np

In [55]:
# Initialize the SVM classifier.
# probability=True enables probability estimates (predict_proba); they come
# from an internal cross-validated calibration, which makes fitting noticeably
# slower. random_state pins the shuffling used by that calibration so repeated
# runs give the same fitted model.
classifier = SVC(probability=True, random_state=42)

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model.
# zero_division=0 reports 0.0 for developers that are never predicted (the same
# value as the default) without emitting an UndefinedMetricWarning per average.
print("Overall Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Overall Accuracy: 0.3601809954751131
                   precision    recall  f1-score   support

       a-tarasyuk       0.31      0.29      0.30        34
       ahejlsberg       0.43      0.77      0.55       135
           ajafff       0.00      0.00      0.00         3
          amcasey       0.00      0.00      0.00        17
         andarist       0.44      0.21      0.29        19
     andrewbranch       0.36      0.29      0.32        62
           aozgaa       0.00      0.00      0.00         6
       armanio123       0.00      0.00      0.00         6
          basarat       0.00      0.00      0.00         1
           billti       0.00      0.00      0.00         3
danielrosenwasser       0.27      0.19      0.22        78
   dragomirtitian       0.00      0.00      0.00         5
      elibarzilay       0.00      0.00      0.00        12
         gabritto       0.00      0.00      0.00         8
  graphemecluster       0.00      0.00      0.00         4
         iisaduan 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [56]:
# Per-class decision scores for every test sample, used below for top-n ranking.
# NOTE(review): with SVC's default decision_function_shape='ovr' there should be
# one column per entry of classifier.classes_; the arg-max of these scores can
# differ from classifier.predict() on tied votes (break_ties defaults to False),
# which presumably explains Top-1 differing slightly from the overall accuracy
# -- confirm.
decision_scores = classifier.decision_function(X=X_test_tfidf)

In [57]:
def top_n_accuracy_report(decision_scores, y_true, classifier, top_ns=(1, 3, 5, 10, 20)):
    """Compute top-n accuracy for several values of n from per-class scores.

    A sample counts as a hit for a given n when its true label is among the n
    classes with the highest decision score.

    Args:
        decision_scores: 2-D array-like of shape (n_samples, n_classes); column
            j holds the score of ``classifier.classes_[j]``.
        y_true: True labels, one per sample, in the same order as the rows of
            ``decision_scores``. Any array-like works (pandas Series, list,
            ndarray); a Series is read positionally, its index is ignored.
        classifier: Fitted estimator exposing ``classes_``.
        top_ns: Iterable of cut-offs n to evaluate. A value larger than the
            number of classes simply considers every class.

    Returns:
        dict mapping each n in ``top_ns`` to the fraction of samples whose true
        label is within the top n predictions.
    """
    scores = np.asarray(decision_scores)
    true_labels = np.asarray(y_true)
    class_labels = np.asarray(classifier.classes_)

    # Rank the classes once (best score first); every cut-off n is just a
    # prefix of this ranking, so there is no need to re-sort per n.
    ranked_labels = class_labels[np.argsort(-scores, axis=1)]

    # hits[i, k] is True when the k-th ranked class of sample i is its true label.
    hits = ranked_labels == true_labels[:, None]

    return {n: np.mean(hits[:, :n].any(axis=1)) for n in top_ns}

# Report top-1/3/5/10/20 accuracy of the SVM on the held-out test split
top_n_accuracies = top_n_accuracy_report(
    decision_scores=decision_scores,
    y_true=y_test,
    classifier=classifier,
)
for n in top_n_accuracies:
    acc = top_n_accuracies[n]
    print(f"Top-{n} Accuracy: {acc:.4f}")

Top-1 Accuracy: 0.3584
Top-3 Accuracy: 0.5819
Top-5 Accuracy: 0.6679
Top-10 Accuracy: 0.8072
Top-20 Accuracy: 0.9348
