# **Ford-Sentence Classification using Naive Bayes Classifier (NBC)**

Mount the GDrive in this Google colab - 

In [None]:
# Code for mounting your Gdrive

from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Use text data for text classification:
Now, the Kaggle dataset - https://www.kaggle.com/datasets/gaveshjain/ford-sentence-classifiaction-dataset?resource=download, is uploaded in my GDrive and renamed the archive folder as NBC. Let's look at the contents of each file that the dataset contains.

In [None]:
import pandas as pd
# base folder name
folder_name = "/content/drive/MyDrive/NBC/"
sample_submission_csv = folder_name + "sample_submission.csv"

sample = pd.read_csv(sample_submission_csv)
# contents of sample_submission.csv
sample

Unnamed: 0,Sentence_id,Type
0,GERRES15609,Responsibility
1,PHERES15784,Responsibility
2,GERREQ10457,Requirement
3,GERSKL27235,Skill
4,HONSSK18415,SoftSkill


In [None]:
test_csv = folder_name + "test_data.csv"
test = pd.read_csv(test_csv)
# contents of test_data
test

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence
0,0,UAERES16346,Collaborate across all of DB&T practices and o...
1,1,COGREQ15586,Strong technology expertise in Identity and A...
2,2,UAEREQ12722,Strong knowledge on Service Virtualization.
3,3,COGSKL29155,Architect scalable data processing and analyti...
4,4,PHERES12551,"Map client organization, build outstanding rel..."
...,...,...,...
15024,15024,HONEXP17084,A minimum of 3 years of experience in Industri...
15025,15025,UAESSK45553,Track record of managing multiple projects wit...
15026,15026,GERSSK14317,"In addition, the candidate should have effecti..."
15027,15027,PHEREQ4980,Candidate should have sufficient knowledge on ...


In [None]:
train_csv = folder_name + "train_data.csv"
train = pd.read_csv(train_csv)
# contents of test_data
train

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
0,0,GERRES15609,Author and/or Review architecture/design and o...,Responsibility
1,1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,2,GERREQ10457,Experience in working crosslly with a larger ...,Requirement
3,3,GERSKL27235,"Previous business experience, including but no...",Skill
4,4,HONSSK18415,Delivering fast and right the first time.,SoftSkill
...,...,...,...,...
60110,60110,UAERES18030,"In this position, you will utilize your progr...",Responsibility
60111,60111,GERRES3026,"In addition, this individual will be responsib...",Responsibility
60112,60112,INDSSK5492,Good problem solving skills.,SoftSkill
60113,60113,PHESSK15092,Good Excel knowledge .,SoftSkill


# Unique Categories/Classes:
Since we have to classify ford sentence into categories/types, let's find unique types or classes.

In [None]:
# unique categories/classes
unique_sentence_categories_type = train['Type'].unique()
print("Total number of Categories : " + str(len(unique_sentence_categories_type)))
print("Categories are : ")
count = 1
for type in unique_sentence_categories_type:
  print(str(count) + ". " + type)
  count += 1

Total number of Categories : 6
Categories are : 
1. Responsibility
2. Requirement
3. Skill
4. SoftSkill
5. Education
6. Experience


# **Process to build NBC:**
**Step 1:** Merge the dataset into one - test_data.csv & train_data.csv
In our case dataframes 'test' and 'train'. Used - https://pandas.pydata.org/pandas-docs/version/0.20/merging.html

In [None]:
# merge the two dataframes
dataframes = [train, test]
ford_dataset = pd.concat(dataframes)
# 15029 rows in test file + 60115 rows in train file = 75144
ford_dataset

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
0,0,GERRES15609,Author and/or Review architecture/design and o...,Responsibility
1,1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,2,GERREQ10457,Experience in working crosslly with a larger ...,Requirement
3,3,GERSKL27235,"Previous business experience, including but no...",Skill
4,4,HONSSK18415,Delivering fast and right the first time.,SoftSkill
...,...,...,...,...
15024,15024,HONEXP17084,A minimum of 3 years of experience in Industri...,
15025,15025,UAESSK45553,Track record of managing multiple projects wit...,
15026,15026,GERSSK14317,"In addition, the candidate should have effecti...",
15027,15027,PHEREQ4980,Candidate should have sufficient knowledge on ...,


In [None]:
# unique categories/classes after merging data
unique_sentence_categories_type_after_merging = ford_dataset['Type'].unique()
print("Total number of Categories after merging the data: " + str(len(unique_sentence_categories_type_after_merging)))
print("Unique Categories after merging data are : ")
# NaN for uncategorized type/class
print(unique_sentence_categories_type_after_merging)

Total number of Categories after merging the data: 7
Unique Categories after merging data are : 
['Responsibility' 'Requirement' 'Skill' 'SoftSkill' 'Education'
 'Experience' nan]


Here NaN type is for uncategorized sentence that we will do using Naive bayes method.
# Visualizing Category Types and their counts:

In [None]:
import matplotlib.pyplot as plt
labels = list(ford_dataset['Type'].unique())
labels

['Responsibility',
 'Requirement',
 'Skill',
 'SoftSkill',
 'Education',
 'Experience',
 nan]

# Data Pre-processing:
We will clean up the data. Firstly, we will remove punctuation or any special symbols present in the sentence text.

In [None]:
# Remove punctuation/any special symbols, 
# first convert into lower case letters and then replace the special symbols with a white space.
# drop Unnamed: 0, sentence_id column as its not needed.

ford_dataset['New_Sentence'] = ford_dataset['New_Sentence'].str.lower()
ford_dataset['New_Sentence'] = ford_dataset['New_Sentence'].str.replace('[^\w\s]',' ')
ford_dataset = ford_dataset.drop('Sentence_id', axis=1)
ford_dataset = ford_dataset.drop('Unnamed: 0', axis=1)
ford_dataset

  


Unnamed: 0,New_Sentence,Type
0,author and or review architecture design and o...,Responsibility
1,should be able to develop custom dynamic shape...,Responsibility
2,experience in working crosslly with a larger ...,Requirement
3,previous business experience including but no...,Skill
4,delivering fast and right the first time,SoftSkill
...,...,...
15024,a minimum of 3 years of experience in industri...,
15025,track record of managing multiple projects wit...,
15026,in addition the candidate should have effecti...,
15027,candidate should have sufficient knowledge on ...,


**Step 2:** Split the dataset into training, test and dev data.

In [None]:
# Let's say we want to split the data in 80:10:10 for train:dev:test dataset
train_size = 0.8
dev_size = 0.1
test_size = 0.1

train_index = int(len(ford_dataset)*train_size)

# First we need to sort the dataset by the desired column 
ford_dataset.sort_values(by = 'Type', ascending=True, inplace=True)

ford_dataset_train = ford_dataset[0:train_index]
ford_dataset_rem = ford_dataset[train_index:]
ford_dataset_rem

dev_index = int(len(ford_dataset)*dev_size)
ford_dataset_dev = ford_dataset[train_index:train_index+dev_index]
ford_dataset_test = ford_dataset[train_index+dev_index:]

Unnamed: 0,New_Sentence,Type
60114,bachelors degree in electrical engineering m...,Education
51198,master en rh idement avec une sp en communic...,Education
16967,masters degree would be preferred msc,Education
29316,safety systems fsc sm,Education
3030,his is a dummy block of text and this is repre...,Education
...,...,...
56898,excellent soft skill communications skills v...,SoftSkill
23136,small scale project management solve clearly ...,SoftSkill
56910,good teamwork and interpersonal skills,SoftSkill
56900,use your agility to balance business needs a...,SoftSkill


In [None]:
print("********** TRAINING DATASET *************")
print(ford_dataset_train.info())
print(ford_dataset_train.shape)
print("\n********** TESTING DATASET *************")
print(ford_dataset_test.info())
print(ford_dataset_test.shape)
print("\n********** DEVELOPMENT DATASET *************")
print(ford_dataset_dev.info())
print(ford_dataset_dev.shape)

********** TRAINING DATASET *************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 60115 entries, 60114 to 34616
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   New_Sentence  59002 non-null  object
 1   Type          60115 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB
None
(60115, 2)

********** TESTING DATASET *************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7515 entries, 7514 to 15028
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   New_Sentence  7377 non-null   object
 1   Type          0 non-null      object
dtypes: object(2)
memory usage: 176.1+ KB
None
(7515, 2)

********** DEVELOPMENT DATASET *************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7514 entries, 0 to 7513
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   New_

**Step 3:** Build Vocabulary list and dictionary of words and their ocurrence.

Before that let's analyse the training dataset by calculating prior probability of each sentence type.

In [96]:
# Segregate the features & target data
# Feature = New_Sentence
# Target = Type
ford_dataset_train_X = ford_dataset_train['New_Sentence']
ford_dataset_dev_X = ford_dataset_dev['New_Sentence']
ford_dataset_test_X = ford_dataset_test['New_Sentence']

ford_dataset_train_y = ford_dataset_train['Type']
ford_dataset_dev_y = ford_dataset_dev['Type']
ford_dataset_test_y = ford_dataset_test['Type']

unique_training_types = list(ford_dataset_train_y.unique())
#print(unique_training_types)
dict_types = {}
for class_type in unique_training_types:
  dict_types[class_type] = sum(ford_dataset_train_y == class_type)

for key_type, count in dict_types.items():
  print(key_type + ": " + str(count))

Education: 4637
Experience: 9248
Requirement: 14132
Responsibility: 15561
Skill: 6956
SoftSkill: 9581


In [97]:
# Calculate Prior Probability of each class type
sentence_type_prior_probability = dict()
total_sentence_type_count = ford_dataset_train_y.size
for key_type, value_count in dict_types.items():
  sentence_type_prior_probability[key_type] = value_count / total_sentence_type_count;

for key_type, probability in sentence_type_prior_probability.items():
  print(key_type + ": " + str(probability))


Education: 0.07713549031023871
Experience: 0.1538384762538468
Requirement: 0.23508275804707643
Responsibility: 0.25885386342842887
Skill: 0.11571155285702403
SoftSkill: 0.15937785910338517
