In [None]:
import sys

# The presence of the `google.colab` module in sys.modules is used as the
# signal that this notebook is running on Google Colab.
ON_COLAB = 'google.colab' in sys.modules

# Root directory that every later file lookup is resolved against.
BASE_DIR = "/content" if ON_COLAB else ".."

if not ON_COLAB:
    print("You are working on a local system.")
    print(f'Files will be searched relative to "{BASE_DIR}".')
else:
    print("You are working on Google Colab.")
    print(f'Files will be downloaded to "{BASE_DIR}".')
    # adjust release
    # GIT_ROOT is only defined on Colab; local runs never set it.
    GIT_ROOT = "https://github.com/blueprints-for-text-analytics-python/early-release/raw/master"

In [None]:
# Render matplotlib figures inline in the notebook, as PNG images.
%matplotlib inline
%config InlineBackend.figure_format = 'png'

# Execute the shared settings script found under BASE_DIR.
# NOTE(review): later cells use `pd` and `clean` without importing or defining
# them in this notebook; presumably this script provides them - TODO confirm.
%run "$BASE_DIR/settings.py"

# (Re)load the autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import html 
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier

# How to use text classification algorithms to identify and classify text into multiple categories

## What you'll learn and what we will build

# Introducing the Java Development Tools Bug Dataset

In [None]:
# Load the Eclipse JDT bug reports.
# The path is resolved against BASE_DIR (".." locally, "/content" on Colab) so
# the same cell works in both environments instead of only on a local checkout.
df = pd.read_csv(f'{BASE_DIR}/data/jdt-bugs-dataset/eclipse_jdt.csv.gz')
print (df.columns)
# Fixed seed so the displayed sample is reproducible.
df.sample(2, random_state=42)

In [None]:
# Remove the duplicate-issue column before inspecting a record.
# errors='ignore' keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise a KeyError.
df = df.drop(columns=['Duplicated_issue'], errors='ignore') ###
# None disables column-width truncation. The former sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
# Transpose a single sampled row so every field is shown on its own line.
df.sample(1, random_state=123).T

In [None]:
# Bar chart of the number of bug reports per priority, ordered by priority label.
(
    df['Priority']
    .value_counts()
    .sort_index()
    .plot.bar()
)

In [None]:
# Count how many bug reports fall under each value of the Component column.
df['Component'].value_counts()

# Blueprint: Building a Text Classification system

## Step 1 - Data Preparation

In [None]:
# Keep only the fields needed for priority classification, discard rows with
# missing values, and merge title and description into one `text` column.
df = (
    df[['Title', 'Description', 'Priority']]
    .dropna()
    .assign(text=lambda reports: reports['Title'] + ' ' + reports['Description'])
    .drop(columns=['Title', 'Description'])
)
df.columns

In [None]:
# Normalise every report text with `clean`.
# NOTE(review): `clean` is not defined in the visible cells; presumably it comes
# from settings.py or an earlier cell - TODO confirm.
df = df.assign(text=df['text'].apply(clean))

# Keep only reports whose cleaned text is longer than 50 characters.
df = df.loc[df['text'].str.len() > 50]
df.sample(2, random_state=0)

## Step 2 - Train-Test Split

In [None]:
# Hold out 20% of the reports for testing. Stratifying on Priority keeps the
# class proportions the same in both splits; the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['Priority'],
    test_size=0.2,
    random_state=42,
    stratify=df['Priority'],
)

print ('Size of Training Data ', len(X_train))
print ('Size of Test Data ', len(X_test))

## Step 3 - Training the machine learning model

In [None]:
# TF-IDF over unigrams and bigrams, without English stop words; terms must
# occur in at least 10 documents. The vectorizer is fitted on training data only.
tfidf = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
    min_df=10,
)
X_train_tf = tfidf.fit_transform(X_train)

In [None]:
# Train a linear SVM on the TF-IDF training matrix (fit returns the estimator).
model1 = LinearSVC(tol=1e-5, random_state=0).fit(X_train_tf, Y_train)
model1

## Step 4 - Model Evaluation

In [None]:
# Vectorize the test texts with the vectorizer already fitted on the training
# data (transform only - no refitting on test data).
X_test_tf = tfidf.transform(X_test)

In [None]:
# Predict priorities for the held-out reports and report overall accuracy.
Y_pred = model1.predict(X_test_tf)
test_accuracy = accuracy_score(Y_test, Y_pred)
print ('Accuracy Score - ', test_accuracy)

In [None]:
# Baseline for comparison: always predict the most frequent training class.
clf = DummyClassifier(strategy='most_frequent', random_state=42).fit(X_train, Y_train)
Y_pred_baseline = clf.predict(X_test)
baseline_accuracy = accuracy_score(Y_test, Y_pred_baseline)
print ('Accuracy Score - ', baseline_accuracy)

### Precision and Recall


In [None]:
# Recompute the test predictions so this cell does not depend on an earlier
# cell's Y_pred, then tabulate actual vs. predicted priorities.
Y_pred = model1.predict(X_test_tf)
confusion_matrix(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Plot the confusion matrix of model1 on the test set.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
# its replacement is `ConfusionMatrixDisplay.from_estimator`. Use the new API
# when it exists and fall back to the legacy helper on older releases.
# NOTE(review): the top-level `from sklearn.metrics import plot_confusion_matrix`
# will itself fail on scikit-learn >= 1.2 and needs the same treatment.
from sklearn.metrics import ConfusionMatrixDisplay

if hasattr(ConfusionMatrixDisplay, 'from_estimator'):
    ConfusionMatrixDisplay.from_estimator(model1, X_test_tf, Y_test,
                                          values_format='d', cmap=plt.cm.Blues)
else:
    plot_confusion_matrix(model1, X_test_tf, Y_test, values_format='d', cmap=plt.cm.Blues)
plt.show()

In [None]:
# Per-class evaluation report for the test-set predictions (printed as text).
print(classification_report(Y_test, Y_pred))

### Class Imbalance


In [None]:
# Boolean mask marking the P3 bug reports
is_p3 = df['Priority'] == 'P3'

# Downsample P3 to 4000 rows (fixed seed for reproducibility)
df_sampleP3 = df[is_p3].sample(n=4000, random_state=123)

# Every report with a priority other than P3 is kept as is
df_sampleRest = df[~is_p3]

# Stack both parts to obtain the new balanced bug reports dataset
df_balanced = pd.concat([df_sampleRest, df_sampleP3])

# Check the resulting class distribution
df_balanced['Priority'].value_counts()

# Final Blueprint for Text Classification

In [None]:
# Loading the balanced dataframe and Step 1 - Data Preparation
# NOTE(review): `clean` is not defined in the visible cells; presumably it comes
# from settings.py or an earlier cell - TODO confirm.
df = (
    df_balanced[['text', 'Priority']]
    .dropna()
    .assign(text=lambda balanced: balanced['text'].apply(clean))
)

# Step 2 - Train-Test Split (stratified 80/20 split with a fixed seed)
X_train, X_test, Y_train, Y_test = train_test_split(
    df['text'],
    df['Priority'],
    test_size=0.2,
    random_state=42,
    stratify=df['Priority'],
)
print ('Size of Training Data ', len(X_train))
print ('Size of Test Data ', len(X_test))

# Step 3 - Training the Machine Learning model
# The vectorizer is fitted on the training texts only.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=10)
X_train_tf = tfidf.fit_transform(X_train)

model1 = LinearSVC(tol=1e-5, random_state=0).fit(X_train_tf, Y_train)

# Step 4 - Model Evaluation
X_test_tf = tfidf.transform(X_test)
Y_pred = model1.predict(X_test_tf)
test_accuracy = accuracy_score(Y_test, Y_pred)
print ('Accuracy Score - ', test_accuracy)
print(classification_report(Y_test, Y_pred))

In [None]:
# Baseline for the balanced data: a dummy classifier using the 'stratified'
# strategy, seeded so the result is repeatable.
clf = DummyClassifier(strategy='stratified', random_state=21).fit(X_train, Y_train)
Y_pred_baseline = clf.predict(X_test)
baseline_accuracy = accuracy_score(Y_test, Y_pred_baseline)
print ('Accuracy Score - ', baseline_accuracy)

In [None]:
## Collect the test texts together with their actual and predicted
## priorities in one dataframe that we can explore
frame = dict(text=X_test, actual=Y_test, predicted=Y_pred)
result = pd.DataFrame(frame)

# Two sampled P1/P2 reports that the model classified correctly
high_priority = result['actual'].isin(['P1', 'P2'])
predicted_correctly = result['actual'] == result['predicted']
result[high_priority & predicted_correctly].sample(2, random_state=22)

In [None]:
# Two sampled P1/P2 reports that the model classified incorrectly
result[result['actual'].isin(['P1', 'P2']) &
       result['actual'].ne(result['predicted'])].sample(2, random_state=33)

# Cross-Validation

In [None]:
# Vectorization

tfidf = TfidfVectorizer(min_df = 10, ngram_range=(1,2), stop_words="english")
# Keep the TF-IDF matrix sparse. Converting it with .toarray() materialises a
# dense documents x n-grams array, which costs a lot of memory and time for no
# benefit: LinearSVC and cross_val_score accept sparse input directly.
# NOTE(review): the vectorizer is fitted on all rows before the folds are
# created, so each validation fold influences the vocabulary and IDF weights.
# Cross-validating a Pipeline (vectorizer + model) would avoid that.
df_tf = tfidf.fit_transform(df['text'])

# Cross Validation with 5 folds (cross_val_score fits clones of model1)

scores = cross_val_score(estimator=model1,
                         X=df_tf,
                         y=df['Priority'],
                         cv=5)

print ("Validation scores from each iteration of the cross validation ", scores)
print ("Mean value across of validation scores ", scores.mean())
print ("Standard deviation of validation scores ", scores.std())

# Hyperparameter Tuning with Grid Search

In [None]:
# Pipeline so that vectorizer and classifier settings are tuned together and
# the vectorizer is refitted inside every cross-validation fold.
training_pipeline = Pipeline(steps = [('tfidf', TfidfVectorizer(stop_words="english")),
                                      ('model', LinearSVC(random_state=21, tol=1e-5))])

# Two separate grids: the first explores hinge loss with an l2 penalty, the
# second explores regularisation strength C and tolerance with default loss.
grid_param = [
  {'tfidf__min_df':[5, 10], 'tfidf__ngram_range': [(1, 3), (1, 6)], 'model__penalty': ['l2'], 'model__loss':['hinge'], 'model__max_iter':[10000]},
  {'tfidf__min_df':[5, 10], 'tfidf__ngram_range': [(1, 3), (1, 6)], 'model__C': [1, 10], 'model__tol':[1e-2, 1e-3]}
]

# 5-fold cross-validated search over every parameter combination
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
                            param_grid=grid_param,
                            cv=5)
gridSearchProcessor.fit(df['text'], df['Priority'])

# best_params_ is the whole winning parameter dict; the grids contain no
# "alpha" parameter, so the label names what is actually printed.
best_params = gridSearchProcessor.best_params_
print ("Best parameters identified by grid search ", best_params)

best_result = gridSearchProcessor.best_score_
print ("Best result identified by grid search ", best_result)

In [None]:
# Tabulate the cross-validation results and show the five best-ranked
# parameter combinations.
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
(
    gridsearch_results[['rank_test_score', 'mean_test_score', 'params']]
    .sort_values(by=['rank_test_score'])
    .head(5)
)

# Blueprint recap and conclusion


In [None]:
# Flag that determines the choice of SVC and LinearSVC
runSVC = False

# Loading the dataframe
# Same compressed CSV that the notebook loads at the start, resolved against
# BASE_DIR so the cell works both locally and on Colab.
df = pd.read_csv(f'{BASE_DIR}/data/jdt-bugs-dataset/eclipse_jdt.csv.gz')
df = df[['Title','Description','Component']]
df = df.dropna()
# Join title and description with a space; without it the last word of the
# title and the first word of the description merge into one token.
df['text'] = df['Title'] + ' ' + df['Description']
df = df.drop(columns=['Title','Description'])

# Step 1 - Data Preparation
# NOTE(review): `clean` is not defined in the visible cells; presumably it comes
# from settings.py or an earlier cell - TODO confirm.
df['text'] = df['text'].apply(clean)
df = df[df['text'].str.len() > 50]


if (runSVC):
    # Sample the data when running SVC to ensure reasonable run-times
    df = df.groupby('Component', as_index=False).apply(pd.DataFrame.sample, random_state=42, frac=.2)

# Step 2 - Train-Test Split
X_train, X_test, Y_train, Y_test = train_test_split(df['text'], df['Component'], 
                                                    test_size=0.2, random_state=42,
                                                    stratify=df['Component'])
print ('Size of Training Data ', X_train.shape[0])
print ('Size of Test Data ', X_test.shape[0])

# Step 3 - Training the Machine Learning model
tfidf = TfidfVectorizer(stop_words="english")

if (runSVC):
    model = SVC(random_state=42, probability=True)
    grid_param = [{'tfidf__min_df':[5, 10], 'tfidf__ngram_range': [(1, 3), (1, 6)], 
                   'model__C':[1, 100], 'model__kernel': ['linear']}]
else:
    model = LinearSVC(random_state=42, tol=1e-5)
    grid_param = {'tfidf__min_df':[5, 10], 'tfidf__ngram_range': [(1, 3), (1, 6)], 
                  'model__C': [1, 100], 'model__loss':['hinge']}

# Reuse the vectorizer created above instead of building a second, identical one.
training_pipeline = Pipeline(steps = [('tfidf', tfidf),
                                      ('model', model)])

gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
                            param_grid=grid_param,
                            cv=5)

# The search only sees the training split; the test split stays untouched
# until the final evaluation below.
gridSearchProcessor.fit(X_train, Y_train)

# best_params_ is the whole winning parameter dict; the grid contains no
# "alpha" parameter, so the label names what is actually printed.
best_params = gridSearchProcessor.best_params_
print ("Best parameters identified by grid search ", best_params)

best_result = gridSearchProcessor.best_score_
print ("Best result identified by grid search ", best_result)

best_model = gridSearchProcessor.best_estimator_

# Step 4 - Model Evaluation

Y_pred = best_model.predict(X_test)
print ('Accuracy Score - ', accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

In [None]:
# Baseline for the component task: always predict the most frequent training class.
clf = DummyClassifier(strategy='most_frequent', random_state=21).fit(X_train, Y_train)
Y_pred_baseline = clf.predict(X_test)
baseline_accuracy = accuracy_score(Y_test, Y_pred_baseline)
print ('Accuracy Score - ', baseline_accuracy)

In [None]:
## Collect the test texts together with their actual and predicted
## components in one dataframe that we can explore
frame = dict(text=X_test, actual=Y_test, predicted=Y_pred)
result = pd.DataFrame(frame)

# Two sampled reports whose component was predicted correctly
result[result['actual'].eq(result['predicted'])].sample(2, random_state=11)

In [None]:
# Two sampled test reports whose predicted value differs from the actual one.
result[result['actual'] != result['predicted']].sample(2, random_state=11)

# Closing Remarks


# Further Reading
