In [1]:
from IPython.display import clear_output

In [None]:
!pip install pandas numpy scikit-learn
!pip install transformers
!pip install lightgbm xgboost

In [3]:
!pip install tensorflow

In [4]:
import os
import pandas as pd

In [5]:
# Load the raw dataset; the relative path is resolved against the current working directory.
data = pd.read_json("technical_test_data.json")

## Data Analysis

In [None]:
# Number of observations (rows) and columns in the dataset
data.shape

In [None]:
# Column names, non-null counts (to spot missing values) and data types
data.info()

In [None]:
# Preview the first few records
data.head()

In [None]:
# How often each distinct value occurs in the 'tags' column
data.tags.value_counts()

In [None]:
# Report the number of distinct values in every column other than 'tags'
feature_columns = data.drop(columns = 'tags').columns
for column_name in feature_columns:
  distinct_count = data[column_name].nunique()
  print(column_name, ':', distinct_count)

## Data Cleansing

Since every feature above except Description has only one unique value, those columns are dropped: a constant column adds no value to our machine learning model.

In [None]:
from transformers import pipeline

# Instantiate the BART (MNLI) model for zero-shot classification. The task is
# named explicitly instead of relying on it being inferred from the model.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
clear_output()  # clear whatever the pipeline printed while loading

# Sanity check on a positive sample: the customer asks about a missing part of an order
output = pipe("- __EMAIL__ Hi , I have just ordered a pair of __PRODUCT_NAME__ and put the relevant code for free __COMPANY__ bag as new customer . I have received trainers but no bag . Will this be sent separately? \n Seems to be an inefficient system, or was this an oversight?\n Kind regards\n __NAME__",
         candidate_labels = ['Is customer asking for the status of his order?', 'Is it just a statement'])
output

In [None]:
# Sanity check on a negative sample: a plain statement that is not about an order
pipe("The meal was awesome", candidate_labels = ['Is customer asking for the status of his order?', 'Is it just a statement'])

In [13]:
def is_order_status_request(text):
  '''
  Flag whether a customer message asks about the status of an order.

  The text is scored by the zero-shot classification pipeline `pipe` against
  two candidate labels; the message counts as a status request when the
  highest-scoring label is the one containing the word "status".

  Parameters
  ----------
  text : str
      Customer message to classify.

  Returns
  -------
  int
      1 if the top-scoring label is the order-status question, otherwise 0.
      Missing, non-string or blank input returns 0 without calling the model.
  '''
  # Missing values (None / NaN) and blank strings carry nothing to classify and
  # may make the pipeline raise, so treat them as "not a status request".
  if not isinstance(text, str) or not text.strip():
    return 0

  output = pipe(text, candidate_labels = ['Is customer asking for the status of his order?', 'Is it just a statement'])

  # Index of the highest score (the first one wins on ties).
  scores = output['scores']
  best = max(range(len(scores)), key=scores.__getitem__)
  return 1 if "status" in output['labels'][best] else 0

In [None]:
%%time
# Label every description with the zero-shot classifier and store the result
# as a new column on `data`.
# Slow: the original author measured almost 25 minutes, but it only has to run once.
data['is_order_status_request'] = data['Description'].apply(is_order_status_request)

In [None]:
# Show the full text of every cell: None removes the column-width limit
# (the former -1 sentinel is rejected by current pandas versions).
pd.set_option("display.max_colwidth", None)

# Keep only the message text and the generated label for modelling.
final_df = data[['Description', 'is_order_status_request']]

# Save the labelled data to final_df.csv in the current working directory.
final_df.to_csv(os.path.join(os.getcwd(),"final_df.csv"),index = False)

# Preview last, so the notebook actually renders it as the cell output.
final_df.head()

## Vectorise the data with TfidfVectorizer

In [None]:
%%time
# Perform Vectorisation of Description
from sklearn.feature_extraction.text import TfidfVectorizer


# Create a TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and Transform the 'Description' column
tfidf_matrix = vectorizer.fit_transform(final_df['Description'])

# Now, tfidf_matrix is the feature matrix. We can use it in ML model.
# To see the matrix as a DataFrame:
vectorised_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Preview the first three rows of the TF-IDF feature frame
vectorised_df.head(3)

In [None]:
# (rows, columns) of the TF-IDF feature frame; one column per vocabulary term
vectorised_df.shape

In [None]:
# Attach the target to the TF-IDF features. vectorised_df was built row by row
# from final_df, so both hold the same rows in the same order: align them by
# position (indexes dropped) so differing index labels cannot yield NaN-padded rows.
vectorised_df_with_target = pd.concat(
    [vectorised_df.reset_index(drop=True),
     final_df['is_order_status_request'].reset_index(drop=True)],
    axis = 1)
vectorised_df_with_target.head(3)

## Training LightGBM and XGBoost classifiers

In [20]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Feature matrix and target labels
X, y = vectorised_df, final_df['is_order_status_request']

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [21]:
def train(model, params, X=None, y=None):
  '''
  Instantiate an estimator with the given parameters and fit it.

  Parameters
  ----------
  model : callable
      Estimator class (or factory) called as `model(**params)`; the object it
      returns must provide `fit(X, y)`.
  params : dict
      Keyword arguments forwarded to `model`.
  X, y : optional
      Training features and target. When omitted, the notebook-level
      `X_train` / `y_train` are used, which is the original behaviour.

  Returns
  -------
  Whatever `fit` returns.
  '''
  # Fall back to the notebook globals only when no data is passed explicitly.
  features = X_train if X is None else X
  target = y_train if y is None else y
  return model(**params).fit(features, target)


In [None]:
%%time
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
# Hyper-parameters passed to both boosted-tree classifiers trained below
params = dict(
    num_leaves=128,
    max_depth=35,
    learning_rate=0.012385137298860933,
    colsample_bytree=0.64,
    n_estimators=500,
)

In [None]:
%%time
# Fit a LightGBM classifier on the training split with the shared parameters
lgbClf = train(LGBMClassifier, params)
clear_output()  # clear whatever the fit printed
lgbClf  # display the fitted estimator

In [None]:
%%time
xgbClf = train(XGBClassifier, params)
clear_output()
xgbClf

## Evaluating different models with roc auc score and classification report

In [25]:
from sklearn.metrics import roc_auc_score, classification_report

### Evaluating LGBMClassifier


In [None]:
# Class probabilities for the held-out rows
preds = lgbClf.predict_proba(X_test)
clear_output()  # clear anything printed during prediction
# ROC AUC computed from the second probability column (the score for label 1)
roc_auc_score(y_test, preds[:,1])

In [None]:
# Hard class predictions, summarised as a per-class precision / recall / F1 report
y_pred = lgbClf.predict(X_test)
print(classification_report(y_test, y_pred))

### Evaluating XGBClassifier

In [None]:
# Class probabilities for the held-out rows (overwrites the LightGBM `preds`)
preds = xgbClf.predict_proba(X_test)
clear_output()  # clear anything printed during prediction
# ROC AUC computed from the second probability column (the score for label 1)
roc_auc_score(y_test, preds[:,1])

In [None]:
# Hard class predictions, summarised as a per-class precision / recall / F1 report
# (overwrites the LightGBM `y_pred`)
y_pred = xgbClf.predict(X_test)
print(classification_report(y_test, y_pred))

### Since the dataset is small, gathering more data is likely to be more important than fine-tuning hyperparameters. A larger dataset helps ensure that the hyperparameters we find generalize to new data. Hence I do not perform hyperparameter tuning of the models in this case.


In [30]:
pip freeze > requirements.txt